Unverified commit e89406a3 authored by lujun, committed by GitHub

Merge pull request #11 from PaddlePaddle/develop

merge to local
......@@ -42,12 +42,6 @@ repos:
entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system
files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
- id: go-fmt
types:
- go
- repo: local
hooks:
- id: copyright_checker
......
......@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in the IPluginFactory class of NvInfer.h; otherwise it will not work in Paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0]
......
......@@ -31,9 +31,17 @@ IF(APPLE)
return()
ENDIF()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
......@@ -58,7 +66,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -79,9 +87,9 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
if(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
else(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
endif(WIN32)
ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
......@@ -101,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# copy the real so.0 lib to install dir
# it can be directly contained in wheel or capi
if(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
......
......@@ -39,8 +39,10 @@ IF(WIN32)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
ELSE()
#TODO(intel-huying):
# Now enable the Erf function in the mklml library temporarily; it will be updated to the official version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......
......@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_GIT_TAG "a444f7a959b7d87f2c117c9b57a4c387759e481e")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
......@@ -69,7 +69,7 @@ ExternalProject_Add(
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
)
......
......@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......@@ -153,7 +153,11 @@ function(op_library TARGET)
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
......@@ -168,6 +172,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
......
......@@ -5,13 +5,13 @@ Kexin Zhao <zhaokexin01@baidu.com>
## Introduction
Deep learning is usually a two-stage process: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher-speed computation using 16-bit float values (float16).
This article explains our efforts with PaddlePaddle to train using float32 and to run inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model (which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md)) into the inference program and converts the weights from float32 into float16.
This article explains our efforts with PaddlePaddle to train using float32 and to run inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model (which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md)) into the inference program and converts the weights from float32 into float16.
## What is float16?
float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
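To make the trade-off concrete, here is a minimal NumPy sketch (illustrative only, not part of Fluid) showing how a float32 value is rounded when stored as float16:

```python
import numpy as np

# float16 layout: 1 sign bit, 5 exponent bits, 10 mantissa bits.
x = np.float16(0.1)
print(float(x))  # 0.0999755859375 -- the nearest representable float16 to 0.1
print(np.binary_repr(x.view(np.uint16), width=16))  # 0010111001100110 (0x2E66)
```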
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
## Why float16?
The trend in today's deep learning community is to use bigger and deeper models, which translates to a larger memory footprint, higher computation demands, and, as a result, higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
......@@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model,
## Fluid implementation of float16 inference
### Overview
Fluid uses [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure. A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
Fluid uses [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure. A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
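As a minimal illustration (a sketch using the Fluid 1.x Python API), the nested structure of a program can be inspected by building a tiny network and printing its ProgramDesc:

```python
import paddle.fluid as fluid

main = fluid.Program()
with fluid.program_guard(main):
    # block 0 holds the variable definitions and the operators created by fc
    x = fluid.layers.data(name="x", shape=[4], dtype="float32")
    y = fluid.layers.fc(input=x, size=2)

print(main)  # dumps the underlying ProgramDesc: blocks, variables, operators
```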
### Basic requirement
When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types. The operator selects the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for the float data type that takes float inputs and generates float outputs.
If we provide float input to the first operator in a program, then each operator will use the float kernel to compute a float output and send it as input to the next operator to trigger its float kernel. This chain effect makes the program run in float mode and gives us a final output of float data type.
The same principle applies if we want a program to run in float16 mode. We provide an input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirement for float16 inference is to add float16 kernels to the operators needed by a specific kind of neural network. Our current focus is on Convolutional Neural Networks (CNN), and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
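For instance, the chain effect can be seen in this sketch (assuming a CUDA device and that the operators used have float16 kernels, as described above):

```python
import numpy as np
import paddle.fluid as fluid

main = fluid.Program()
with fluid.program_guard(main):
    x = fluid.layers.data(name="x", shape=[3], dtype="float16")
    y = fluid.layers.softmax(x)  # float16 input selects the float16 kernel

exe = fluid.Executor(fluid.CUDAPlace(0))
out, = exe.run(main,
               feed={"x": np.random.rand(1, 3).astype(np.float16)},
               fetch_list=[y])
print(out.dtype)  # float16 -- every kernel in the chain ran in half precision
```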
......@@ -75,7 +75,7 @@ In this scenario, we already have a float32 inference program and some associate
We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance usability, we maintain a consistent API so that users can use the same float32 input data to run the inference program in either float32 or float16 mode and obtain output data of float32 type in both cases. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
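For reference, a hypothetical usage sketch follows; the transpiler class name, import path, and `transpile` signature are assumptions based on the contrib float16 package, not a verbatim API:

```python
import paddle.fluid as fluid
# Assumed import path for the transpiler described above.
from paddle.fluid.contrib.float16 import Float16Transpiler

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)

# Load an existing float32 inference program and its weights.
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    "fp32_model_dir", exe)

# Convert weights to float16 and insert cast ops at the boundaries,
# so the program still accepts and returns float32 data.
Float16Transpiler().transpile(program, place)

# result = exe.run(program,
#                  feed={feed_names[0]: float32_batch},  # float32_batch: your input
#                  fetch_list=fetch_targets)
```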
### Experiment results
Simply run the following commands to reproduce the experiment results presented in this section:
......@@ -113,7 +113,7 @@ We repeat the test ten times and get the following results:
| #10 | 62.53% | 62.48% |
| average| 62.63% | 62.62% |
We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
#### Performance benchmark
Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart.
......@@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data
|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 |
We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
The convolution operation is usually the computational bottleneck of a CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
......@@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a
We also ran the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU, which does not support Tensor Core. The results show that for Vgg16, float16 inference provides a consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart at small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on the 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high-performance float16 inference.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results.
### Summary
1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support running CNN programs, including Vgg and Resnet, in float16 inference mode.
......
......@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......@@ -174,7 +174,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector)
target_link_libraries(executor garbage_collector while_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
......
......@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include <queue>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
ops_.erase(ops_.begin() + s, ops_.begin() + e);
}
void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) {
// TODO(minqiyang): make this faster
for (auto it = ops_.begin(); it != ops_.end(); ++it) {
if (it->get() == op_desc) {
ops_.erase(it);
break;
}
}
}
std::vector<OpDesc *> BlockDesc::AllOps() const {
std::vector<OpDesc *> res;
for (const auto &op : ops_) {
......
......@@ -93,6 +93,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
void RemoveOpInternal(const OpDesc *op_desc);
void RemoveVar(const std::string &name) { vars_.erase(name); }
std::vector<OpDesc *> AllOps() const;
......
......@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims());
if (in_format != out_format) {
// temporary memory primitive descriptor for out, to make the reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occurs
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
......
......@@ -50,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
......@@ -120,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
......@@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("sequential_execution_pass");
}
// Add op fusion.
if (strategy.sync_batch_norm_) {
AppendPass("sync_batch_norm_pass");
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
......@@ -174,7 +180,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
......@@ -185,7 +192,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user.
CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
......@@ -203,41 +209,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "memory_optimize_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "inplace_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
graph->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
......@@ -256,6 +233,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} // namespace framework
} // namespace paddle
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
......
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
......@@ -76,11 +77,13 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false};
bool sync_batch_norm_{false};
bool memory_optimize_{true};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
// memory_early_delete_ true by default
bool enable_inplace_{false};
bool enable_inplace_{true};
bool enable_sequential_execution_{false};
......@@ -114,7 +117,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
......
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
......@@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
OperatorBase *GetOp() { return op_.get(); }
std::string Name() const override;
const Scope *GetScope() const { return scope_; }
......
......@@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
......@@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
}
#endif
PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
......@@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
void EagerDeletionOpHandle::RunImpl() {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
Scope *exec_scope = nullptr;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &name : var_names_) {
auto it = ref_cnts_->find(name);
// Var not found, not reference count has not decreased to 0
// Reference count has not decreased to 0
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
continue;
}
if (!exec_scope) {
exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
}
// Var not found
auto *var = exec_scope->FindVar(name);
if (var == nullptr) {
continue;
......
......@@ -12,20 +12,173 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <functional>
#include <queue>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle {
namespace framework {
namespace details {
// op -> variables which can be deleted after op runs
using OpToVarNameSetMap =
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
// Check whether the variable is LoDTensor based on static VarDesc info
static bool IsLoDTensor(VarDesc *var) {
return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
}
// Get memory size of LoDTensor
static int64_t GetMemorySize(
const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
const std::string &var_name) {
auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
PADDLE_ENFORCE_NOT_NULL(var_desc);
PADDLE_ENFORCE(IsLoDTensor(var_desc));
auto dims = var_desc->GetShape();
return SizeOfType(var_desc->GetDataType()) *
std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>());
}
// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
// SelectedRows, LoDTensorArray)
// Since partial GC is based on static analysis of the memory size of each
// variable, we should skip SelectedRows and LoDTensorArray here.
static void SplitIntoLoDTensorAndNonLoDTensorVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
lod_tensors->clear();
other_vars->clear();
for (auto &op_vars_pair : m) {
for (auto &var_name : op_vars_pair.second) {
auto *var_desc = TryGetLatestVarDesc(
vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
if (IsLoDTensor(var_desc)) {
(*lod_tensors)[op_vars_pair.first].insert(var_name);
} else {
(*other_vars)[op_vars_pair.first].insert(var_name);
}
}
}
}
struct GCVarInfo {
GCVarInfo(const std::string &name, int64_t memory_size,
ComputationOpHandle *op, size_t scope_idx)
: name_(name),
memory_size_(memory_size),
op_(op),
scope_idx_(scope_idx) {}
std::string name_; // variable name
int64_t memory_size_; // memory size
ComputationOpHandle *op_; // op after which the variable could be deleted
size_t scope_idx_; // scope index where the variable locates
int64_t AbsMemorySize() const { return std::abs(memory_size_); }
};
// NOTE: delete_lod_tensor_only is not used currently
static OpToVarNameSetMap ShrinkGCVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
const std::vector<platform::Place> &places, double fraction_of_memory_size,
bool delete_lod_tensor_only = false) {
// Do not perform gc when fraction_of_memory_size = 0
if (fraction_of_memory_size <= 0.0) return {};
/**
* Step 1: Split all variables into LoDTensor and Non-LoDTensor.
* We can only calculate memory size of LoDTensors
*/
OpToVarNameSetMap lod_tensors, other_vars;
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
// Perform complete gc when fraction_of_memory_size >= 1
if (fraction_of_memory_size >= 1.0) {
return delete_lod_tensor_only ? lod_tensors : m;
}
/**
* Step 2: build GCVarInfos, and calculate total memory sizes of each device
*/
// place -> variable info (name, memory size, place, scope_idx)
std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
// place -> total memory sizes
std::map<platform::Place, int64_t> place_to_size;
for (auto &op_vars_pair : lod_tensors) {
auto *op = op_vars_pair.first;
auto &var_names = op_vars_pair.second;
auto scope_idx = op->GetScopeIdx();
auto &place = places[scope_idx];
for (auto &var_name : var_names) {
auto var_size = GetMemorySize(vars[scope_idx], var_name);
GCVarInfo var_info(var_name, var_size, op, scope_idx);
place_to_size[place] += var_info.AbsMemorySize();
place_to_vars[place].emplace_back(std::move(var_info));
}
}
/**
* Step 3: sort GCVarInfos, and only delete the largest variables.
*/
OpToVarNameSetMap partial_vars;
for (auto &place_to_var_pair : place_to_vars) {
auto &place = place_to_var_pair.first;
auto &gc_vars = place_to_var_pair.second;
std::sort(gc_vars.begin(), gc_vars.end(),
[](const GCVarInfo &var1, const GCVarInfo &var2) {
return var1.AbsMemorySize() > var2.AbsMemorySize();
});
int64_t accumulated_size = 0;
int64_t size_threshold =
static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
++i) {
partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
accumulated_size += gc_vars[i].AbsMemorySize();
}
}
/**
* Step 4: Combine other vars (SelectedRows, LoDTensorArray)
*/
if (!delete_lod_tensor_only) {
for (auto &op_vars_pair : other_vars) {
partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
op_vars_pair.second.end());
}
}
return partial_vars;
}
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts =
......@@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
// a reverse map of last_live_ops
// i.e., last op --> variable names which can be deleted.
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
op_vars_map;
OpToVarNameSetMap op_vars_map;
for (auto &var_ops_map : last_live_ops) {
for (auto &var_ops_pair : var_ops_map) {
const std::string &var_name = var_ops_pair.first;
......@@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
}
}
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
FLAGS_memory_fraction_of_eager_deletion);
for (auto &pair : op_vars_map) {
auto *op = pair.first;
auto &var_names = pair.second;
......@@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
return graph;
auto while_op_eager_deletion_pass =
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
return while_op_eager_deletion_pass->Apply(std::move(graph));
}
} // namespace details
......@@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::details::kAllPlaces)
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
USE_PASS(while_op_eager_deletion_pass);
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -24,12 +26,11 @@ namespace details {
FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
const std::vector<platform::Place> &places, ir::Graph *graph)
: strategy_(strategy),
local_scopes_(local_scopes),
places_(places),
graph_(std::move(graph)),
graph_(graph),
pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) {
......@@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
......@@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
}
if (exception_.IsCaught()) {
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
exception_.ReThrow();
}
}
num_complete += num_comp;
}
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
return fetches;
}
......
......@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
const ir::Graph &Graph() const override;
......@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
std::unique_ptr<ir::Graph> graph_;
ir::Graph *graph_;
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <deque>
#include <iterator>
#include <memory>
#include <stack>
#include <string>
#include <unordered_map>
......@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
// ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform on Graph
// or Node directly!
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// some prerequisites need to be met if the op wants to be inplaced.
......
......@@ -20,6 +20,9 @@
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -33,10 +36,10 @@ namespace details {
using paddle::framework::VarDesc;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
"Graph has no attribute of kStaleProgramOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
......@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) {
// validate that the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
node->Name() == kEmptyVarName)
return false;
bool flag = true;
// op output force generated in cpu, can not be reused.
......@@ -331,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType();
// only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
......@@ -348,10 +353,6 @@ bool NodeCanReused(const VarDesc& node) {
if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
std::string name = node.Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
return true;
}
......@@ -461,11 +462,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
}
}
}
for (auto* op : ops_) {
unlived_vars_[op] = std::set<std::string>();
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
std::vector<bool> need_update(ops_.size(), false);
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
......@@ -480,15 +491,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
need_update[i] = true;
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
need_update[i] = true;
}
}
for (size_t i = begin_idx; i < ops_.size(); ++i) {
if (!need_update[i]) continue;
auto* op = ops_[i];
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
......@@ -496,7 +519,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
......@@ -504,15 +527,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
auto it = unlived_vars_.find(op);
PADDLE_ENFORCE(
it != unlived_vars_.end(),
string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
......
......@@ -92,10 +92,11 @@ class ControlFlowGraph {
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
const std::set<std::string>& LiveIn(ir::Node* op) const;
const std::set<std::string>& LiveOut(ir::Node* op) const;
const std::set<std::string>& Use(ir::Node* op) const;
const std::set<std::string>& Unlived(ir::Node* op) const;
const std::vector<ir::Node*>& Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
......@@ -117,6 +118,7 @@ class ControlFlowGraph {
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
......
......@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
......@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
......@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
......@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
......@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
// cached desc different from the real one
// mimic an intermediate pass modifying the ProgramDesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
......@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
......@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
return ret;
};
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
......@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
......@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
......
......@@ -24,6 +24,7 @@
#include <sstream>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
......@@ -118,13 +119,11 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
}
for (auto& var : cfg_->Unlived(op)) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
}
}
}
......@@ -193,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
// FIXME(liuwei1031): Graph is not aware of the existence of
// BlockDescs and ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform
// on Graph or Node directly!
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
......@@ -337,4 +340,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
......@@ -20,8 +22,7 @@ namespace framework {
namespace details {
std::vector<std::unique_ptr<ir::Graph>>
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph) {
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
std::vector<std::unique_ptr<ir::Graph>> graphs;
graphs.reserve(places_.size());
for (size_t i = 0; i < places_.size(); ++i) {
......@@ -30,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars);
auto &stale_ops =
graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
g->Erase(details::kStaleProgramOpDescs);
g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(stale_ops));
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
......@@ -77,24 +83,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
const std::vector<platform::Place> &places, ir::Graph *graph)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
main_prog_(main_prog),
// TODO(Yancey1989): Copying graphs is not safe since it deletes the
// attrs.
graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
graphs_(SeparateMultiDevicesGraph(graph)) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
seq_allreduce_pass->Erase(details::kAllOpDescs);
seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
details::kAllOpDescs,
new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
}
......@@ -107,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
}
}
......
......@@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const framework::ProgramDesc &main_prog,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; }
......@@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
private:
std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_;
framework::ProgramDesc main_prog_;
std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
......
......@@ -12,9 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <queue>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
......@@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
return shrink_func(computation_op);
}
static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
VarDesc *var_desc = nullptr;
std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
var_desc = var_handle->Node()->Var();
return var_desc != nullptr;
});
return var_desc;
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
......
......@@ -13,9 +13,22 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/var_desc.h"
namespace paddle {
namespace framework {
namespace details {} // namespace details
namespace details {
VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
VarDesc *var_desc = nullptr;
std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
var_desc = var_handle->Node()->Var();
return var_desc != nullptr;
});
return var_desc;
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -16,6 +16,7 @@
#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -25,6 +26,10 @@
namespace paddle {
namespace framework {
class VarDesc;
class VarHandle;
namespace details {
class ComputationOpHandle;
......@@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";
using LastLiveOpsOfVars =
std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars);
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
static std::unordered_set<std::string> skip_dist_ops{
"send", "recv", "send_barrier", "fetch_barrier"};
auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
std::vector<ir::Node *> op_node_list;
op_node_list.reserve(ops.size());
......@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
REGISTER_PASS(sequential_execution_pass,
paddle::framework::details::SequentialExecutionPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
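The pass now reads the op list from a graph attribute instead of a pass attribute. The two sides of that handshake, assembled from this diff (the Set call appears in graph.cc further below; the surrounding code here is a sketch):
// When the graph is built from a program:
graph->Set<const std::vector<OpDesc *>>(
    details::kStaleProgramOpDescs,
    new std::vector<OpDesc *>(program.Block(0).AllOps()));
// Later, inside any pass that declared RequireGraphAttr:
auto &ops =
    graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);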
......@@ -23,9 +23,8 @@ namespace framework {
namespace details {
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
: graph_(std::move(graph)),
const std::vector<platform::Place> &places, ir::Graph *graph)
: graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
local_scopes_(local_scopes),
......@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
exception_holder_.ReThrow();
} else {
continue;
......@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
PADDLE_ENFORCE(ready_ops.empty());
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
return fetch_data;
}
......
......@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
const ir::Graph &Graph() const override { return *graph_; }
// Run a SSAGraph by a thread pool
......@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
std::unique_ptr<ir::Graph> graph_;
ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
namespace paddle {
namespace framework {
namespace details {
class WhileOpEagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// Find all while_op and while_grad_op
std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
std::vector<OperatorBase *>>>
target_ops;
for (auto *op : all_ops) {
auto compute_op = dynamic_cast<ComputationOpHandle *>(op);
if (compute_op == nullptr) continue;
if (compute_op->Name() == "while") {
target_ops[compute_op->GetScopeIdx()].first.emplace_back(
compute_op->GetOp());
} else if (compute_op->Name() == "while_grad") {
target_ops[compute_op->GetScopeIdx()].second.emplace_back(
compute_op->GetOp());
}
}
for (auto &ops_pair : target_ops) {
auto &while_ops = ops_pair.second.first;
auto &while_grad_ops = ops_pair.second.second;
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
while_ops, while_grad_ops);
}
return graph;
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(while_op_eager_deletion_pass,
paddle::framework::details::WhileOpEagerDeletionPass);
......@@ -14,25 +14,31 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
#endif
DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle {
namespace framework {
......@@ -74,11 +80,11 @@ static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars)
: prog_(prog), block_id_(block_id) {
if (GetEagerDeletionThreshold() >= 0) {
global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
skip_ref_cnt_vars);
const std::vector<std::string>& keep_vars, bool force_disable_gc)
: prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
global_ref_cnts_ =
GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
}
}
......@@ -183,13 +189,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
bool create_local_scope, bool create_vars,
const std::vector<std::string>& skip_ref_cnt_vars,
bool force_disable_gc) {
platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
#endif
auto ctx = Prepare(pdesc, block_id);
auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
......@@ -356,20 +361,27 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars) {
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
program, block_id, skip_ref_cnt_vars, force_disable_gc));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) {
paddle::operators::NgraphEngine::FuseNgraphOps(
ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
}
#endif
return ctx;
}
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars,
bool force_disable_gc) {
PADDLE_ENFORCE(
skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
"skip_ref_cnt_vars should be either empty or equals to block number %d",
......@@ -379,9 +391,11 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
for (auto& bid : block_ids) {
ExecutorPrepareContext* ctx;
if (skip_ref_cnt_vars.empty()) {
ctx = new ExecutorPrepareContext(program, bid);
ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
force_disable_gc);
} else {
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
force_disable_gc);
}
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
......@@ -408,8 +422,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector> gc;
// skip while_op and while_grad_op temporarily
if (max_memory_size >= 0 && !keep_kids) {
// FIXME(zjl): recurrent_op is rather complex, so we would
// forcibly disable gc in recurrent_op
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
......@@ -427,6 +442,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
#ifdef PADDLE_WITH_CUDA
}
#endif
// If gc is enabled and block size > 1
if (gc && ctx->prog_.Size() > 1) {
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
ctx->ops_);
}
}
for (auto& op : ctx->ops_) {
......
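The new force_disable_gc flag lets a caller opt a whole run out of eager deletion even when the global threshold enables it. A hedged call sketch against the extended Run() signature (place, program, and scope setup are assumed to exist):
Executor exe(place);
exe.Run(program, &scope, /*block_id=*/0,
        /*create_local_scope=*/true, /*create_vars=*/true,
        /*skip_ref_cnt_vars=*/{},    // no extra variables pinned
        /*force_disable_gc=*/true);  // e.g. inside recurrent_op, see FIXME above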
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/op_info.h"
......@@ -30,7 +32,8 @@ namespace framework {
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
std::vector<std::string>(),
bool force_disable_gc = false);
~ExecutorPrepareContext();
......@@ -38,6 +41,7 @@ struct ExecutorPrepareContext {
const framework::ProgramDesc& prog_;
size_t block_id_;
bool force_disable_gc_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
std::unordered_map<std::string, size_t> global_ref_cnts_;
......@@ -66,7 +70,10 @@ class Executor {
* Scope
*/
void Run(const ProgramDesc& prog, Scope* scope, int block_id,
bool create_local_scope = true, bool create_vars = true);
bool create_local_scope = true, bool create_vars = true,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false);
// This API is very slow.
void Run(const ProgramDesc& program, Scope* scope,
......@@ -79,12 +86,14 @@ class Executor {
static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
std::vector<std::string>(),
bool force_disable_gc = false);
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
std::vector<std::vector<std::string>>());
std::vector<std::vector<std::string>>(),
bool force_disable_gc = false);
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
......
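Prepare() is the per-block cost and RunPreparedContext() the per-iteration cost, so the usual pattern is to prepare once and reuse the context. A hedged sketch (exe, program, scope, and num_iters are assumed):
auto ctx = Executor::Prepare(program, /*block_id=*/0);
for (int i = 0; i < num_iters; ++i) {
  exe.RunPreparedContext(ctx.get(), &scope,
                         /*create_local_scope=*/true,
                         /*create_vars=*/true);
}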
......@@ -46,6 +46,8 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(cpu_quantize_pass inference)
pass_library(cpu_quantize_squash_pass inference)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
......@@ -66,6 +68,7 @@ pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
......@@ -100,9 +103,15 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
if(NOT WIN32)
cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
endif()
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
endif ()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
namespace {
void UnlinkNodes(ir::Node* a, ir::Node* b) {
a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
a->outputs.end());
b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
b->inputs.end());
}
} // namespace
enum { U8_MAX = 255, S8_MAX = 127 };
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
using string::PrettyLogDetail;
void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
std::string input_name, double scale_to_one,
bool is_unsigned,
std::string scale_attr_name) const {
unsigned max = is_unsigned ? U8_MAX : S8_MAX;
float scale = scale_to_one * max;
// Create quantize output variable
VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
// create a quantize op node
OpDesc q_desc;
q_desc.SetType("quantize");
q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
q_desc.SetOutput("Output",
std::vector<std::string>({quantize_out_node->Name()}));
q_desc.SetAttr("Scale", scale);
q_desc.SetAttr("is_negative_input", !is_unsigned);
auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied.
// update op's input
op->Op()->SetInput(input_name,
std::vector<std::string>({quantize_out_node->Name()}));
// link quantize op
UnlinkNodes(input, op);
IR_NODE_LINK_TO(input, quantize_op);
IR_NODE_LINK_TO(quantize_op, quantize_out_node);
IR_NODE_LINK_TO(quantize_out_node, op);
if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
}
void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
std::string output_name,
double scale_to_one, bool is_unsigned,
std::string scale_attr_name) const {
unsigned max = is_unsigned ? U8_MAX : S8_MAX;
float scale = scale_to_one * max;
// Create dequantize input variable
VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
// create a dequantize op node for output.
OpDesc deq_desc;
deq_desc.SetType("dequantize");
deq_desc.SetInput("Input",
std::vector<std::string>({dequantize_in_node->Name()}));
deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
deq_desc.SetAttr("Scale", scale);
auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied.
// update op's output
op->Op()->SetOutput(output_name,
std::vector<std::string>({dequantize_in_node->Name()}));
// link dequantize op
UnlinkNodes(op, output);
IR_NODE_LINK_TO(op, dequantize_in_node);
IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
IR_NODE_LINK_TO(dequantize_op, output);
if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
}
void CPUQuantizePass::QuantizeConv(Graph* graph,
bool with_residual_data) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::ConvResidual conv_pattern{pattern, name_scope_};
conv_pattern(with_residual_data);
int quantize_conv_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize conv2d op";
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
auto* conv_op_desc = conv_op->Op();
// skip if should not be quantized
if (!conv_op_desc->HasAttr("use_quantizer") ||
!boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
return;
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
// get scales calculated after warmup; they scale variables to MAX=1.0
auto scales = Get<VarQuantScale>("quant_var_scales");
auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
bool is_input_unsigned = scales[conv_input->Name()].first;
QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
is_input_unsigned, "Scale_in");
auto filter_scale_tensor = scales[conv_filter->Name()].second;
EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data<double>(),
filter_scale_tensor.numel(), 1};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> filter_scale{
filter_scale_tensor.data<double>(),
filter_scale_tensor.data<double>() + filter_scale_tensor.numel()};
conv_op->Op()->SetAttr("Scale_weights", filter_scale);
if (with_residual_data) {
GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
conv_pattern);
auto residual_scale =
scales[conv_residual_data->Name()].second.data<double>()[0];
bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
residual_scale, is_residual_unsigned, "Scale_in_eltwise");
}
auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
bool is_output_unsigned = scales[conv_output->Name()].first;
DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
is_output_unsigned, "Scale_out");
++quantize_conv_count;
};
gpd(graph, handler);
AddStatis(quantize_conv_count);
std::stringstream msg_ss;
msg_ss << "--- quantized " << quantize_conv_count << " conv2d ops";
if (with_residual_data) msg_ss << " with residual connection";
PrettyLogDetail(msg_ss.str().c_str());
}
void CPUQuantizePass::QuantizePool(Graph* graph) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Pool pool_pattern{pattern, name_scope_};
pool_pattern();
int quantize_pool_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize pool2d op";
GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
auto* pool_op_desc = pool_op->Op();
// skip if should not be quantized
if (!pool_op_desc->HasAttr("use_quantizer") ||
!boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
return;
GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
// get scales calculated after warmup; they scale variables to MAX=1.0
auto scales = Get<VarQuantScale>("quant_var_scales");
auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
bool is_input_unsigned = scales[pool_input->Name()].first;
QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
bool is_output_unsigned = scales[pool_output->Name()].first;
DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
is_output_unsigned);
++quantize_pool_count;
};
gpd(graph, handler);
AddStatis(quantize_pool_count);
PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count);
}
std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Quantizing the graph.";
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
PADDLE_ENFORCE(param_scope());
QuantizeConv(graph.get(), true /* with_residual_data */);
QuantizeConv(graph.get());
QuantizePool(graph.get());
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
.RequirePassAttr("quant_var_scales");
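QuantizeInput and DequantizeOutput above turn a scale-to-one calibration factor into an integer scale by multiplying with the type maximum (S8_MAX = 127 or U8_MAX = 255). A standalone sketch of that arithmetic with illustrative values:
#include <cmath>
#include <cstdint>
#include <cstdio>
int main() {
  double scale_to_one = 0.5;         // calibration: scales |x| down to <= 1.0
  unsigned max = 127;                // S8_MAX; use 255 (U8_MAX) if unsigned
  float scale = scale_to_one * max;  // 63.5, stored in the "Scale" attr
  float x = 1.2f;                    // an fp32 activation
  auto q = static_cast<int8_t>(std::round(x * scale));  // 76
  float back = q / scale;            // ~1.197, the dequantized approximation
  std::printf("q=%d back=%f\n", q, back);
  return 0;
}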
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Maps a variable name to a tensor of scaling factors that scale it to MAX=1.0.
* The bool denotes whether the variable should be quantized to an unsigned
* type.
*/
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
/*
* Quantize all supported operators.
*/
class CPUQuantizePass : public FusePassBase {
public:
virtual ~CPUQuantizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
void QuantizePool(Graph* graph) const;
void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
double scale_to_one, bool is_unsigned,
std::string scale_attr_name = "") const;
void DequantizeOutput(Graph* g, Node* op, Node* output,
std::string output_name, double scale_to_one,
bool is_unsigned,
std::string scale_attr_name = "") const;
const std::string name_scope_{"quantize"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn,
bool use_quantizer = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
if (type == "conv2d") {
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
if (inputs.size() > 2)
op->SetInput("Bias", {inputs[2]});
else
op->SetInput("Bias", {});
if (inputs.size() > 3) {
op->SetInput("ResidualData", {inputs[3]});
op->SetAttr("fuse_residual_connection", true);
} else {
op->SetInput("ResidualData", {});
op->SetAttr("fuse_residual_connection", false);
}
op->SetOutput("Output", {outputs[0]});
op->SetAttr("use_quantizer", use_quantizer);
op->SetAttr("Scale_in", 1.0f);
op->SetAttr("Scale_out", 1.0f);
op->SetAttr("Scale_weights", std::vector<float>{1.0f});
} else if (type == "pool2d") {
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("use_quantizer", use_quantizer);
} else if (type == "dropout") {
op->SetInput("X", {inputs[0]});
op->SetOutput("Out", {outputs[0]});
} else if (type == "fc") {
op->SetInput("Input", {inputs[0]});
if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
op->SetOutput("Out", {outputs[0]});
}
}
static const std::initializer_list<std::string> variable_names{
"a", "w1", "c", "d", "w2", "e", "f", "g",
"h", "w3", "b1", "i", "j", "w4", "b2"};
// (a,w1)->Conv1->c and c->Pool1->d
//
// (d,w2)->Conv2->e and e->Pool2->f
//
// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
//
// (c,w4,b2)->Conv4->i
ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
ProgramDesc prog;
for (auto& v : variable_names) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v.find("w") == 0 || v.find("b") == 0) {
var->SetPersistable(true);
}
}
SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
use_quantizer);
SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);
SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
use_quantizer);
SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
use_quantizer);
SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
use_quantizer);
return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
const char* var_name) {
auto x = scope->Var(var_name);
auto tensor = x->GetMutable<LoDTensor>();
tensor->mutable_data(place, proto::VarType::FP32,
::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
int quant_count, int dequant_count, int added_nodes_count,
float scale) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
// Init scope, as it is used in pass
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
exe.CreateVariables(prog, 0, true, &scope);
auto* scales = new VarQuantScale();
for (auto& v : variable_names) {
InitTensorHolder(&scope, place, v.c_str());
LoDTensor tensor;
tensor.Resize({1});
auto* ptr = tensor.mutable_data<double>(place);
ptr[0] = 2.0;
(*scales)[v] = std::make_pair(false, std::move(tensor));
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
pass->Set("quant_var_scales", scales);
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int conv2d_nodes_count = 0;
int pool2d_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
conv2d_nodes_count++;
auto op_name = boost::get<std::string>(op->GetAttr("name"));
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
<< "Scale_in for node '" + op_name + "'.";
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
<< "Scale_out for node '" + op_name + "'.";
EXPECT_EQ(
boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
scale)
<< "Scale_weights for node '" + op_name + "'.";
} else if (op->Type() == "pool2d") {
pool2d_nodes_count++;
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
EXPECT_EQ(conv2d_nodes_count, conv_count);
EXPECT_EQ(pool2d_nodes_count, pool_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
TEST(CpuQuantizePass, quantize) {
bool use_mkldnn = true;
bool use_quantizer = true;
// (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
// c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
//
// (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
// e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
//
// d->Dropout1->g and g->Fc1->h and
// (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
//
// (c->QUANT7->IN7,w4,b2)->Conv4->OUT6->DEQUANT6->i
// Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
int added_nodes = 7 + 7 + 6 + 6;
MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
2.0f * 127);
}
TEST(CpuQuantizePass, do_not_quantize) {
bool use_mkldnn = true;
bool use_quantizer = false;
int added_nodes = 0;
MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
1.0f);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void CPUQuantizeSquashPass::FindNodesToKeep(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
GraphPatternDetector gpd;
patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "dequant_any"};
deq_any_pattern();
int found_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);
if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end())
(*nodes_keep_counter)[dequant_out] = 1;
else
(*nodes_keep_counter)[dequant_out] += 1;
found_count++;
};
gpd(graph, handler);
AddStatis(found_count);
}
void CPUQuantizeSquashPass::Squash(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
GraphPatternDetector gpd;
patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
squash_pattern();
int found_squash_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "squash requantize-quantize ops pair";
GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);
auto* next_op_desc = next_op->Op();
float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
nodes_keep_counter->end());
// check if dequantize op should be kept or removed, decrease the counter
bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
if (dequant_scale == quant_scale) {
// squash dequantize-quantize to nothing
auto quant_out_var_name = quant_out->Name();
auto next_op_inputs = next_op_desc->InputNames();
for (const auto& name : next_op_inputs) {
auto var_name = next_op_desc->Input(name)[0];
if (var_name.compare(quant_out_var_name) == 0) {
next_op_desc->SetInput(
name, std::vector<std::string>({dequant_in->Name()}));
break;
}
}
if (keep_dequant)
GraphSafeRemoveNodes(graph, {quant_op, quant_out});
else
GraphSafeRemoveNodes(graph,
{dequant_op, quant_op, dequant_out, quant_out});
IR_NODE_LINK_TO(dequant_in, next_op);
found_squash_count++;
} else {
// squash dequantize-quantize to requantize op
OpDesc desc;
desc.SetType("requantize");
desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
desc.SetAttr("Scale_in", dequant_scale);
desc.SetAttr("Scale_out", quant_scale);
auto requant_op = g->CreateOpNode(&desc);
if (keep_dequant)
GraphSafeRemoveNodes(graph, {quant_op});
else
GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});
IR_NODE_LINK_TO(dequant_in, requant_op);
IR_NODE_LINK_TO(requant_op, quant_out);
found_squash_count++;
}
};
gpd(graph, handler);
AddStatis(found_squash_count);
PrettyLogDetail("--- squashed %d dequantize-quantize pairs",
found_squash_count);
}
std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("cpu_quantize_squash_pass", graph.get());
std::unordered_map<const Node*, int> nodes_keep_counter;
FindNodesToKeep(graph.get(), &nodes_keep_counter);
Squash(graph.get(), &nodes_keep_counter);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_quantize_squash_pass,
paddle::framework::ir::CPUQuantizeSquashPass);
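Condensed, the Squash() handler above chooses between two rewrites; a sketch of the decision (node bookkeeping and error handling elided):
if (dequant_scale == quant_scale) {
  // The int8 -> fp32 -> int8 round trip is exact: rewire next_op to read
  // dequant_in directly and drop the pair (the dequantize op survives if
  // another consumer still needs its fp32 output).
} else {
  // Collapse the round trip into a single requantize op carrying
  // Scale_in = dequant_scale and Scale_out = quant_scale.
}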
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Squash dequantize->quantize pair pattern into requantize op
*/
class CPUQuantizeSquashPass : public FusePassBase {
public:
virtual ~CPUQuantizeSquashPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
/*
* For each dequantize op's output, find the number of operators it is an input to
*/
void FindNodesToKeep(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
/*
* Squash dequantize-quantize op pairs into a requantize op or nothing
*/
void Squash(Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
const std::string name_scope_{"squash"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn,
float scale = 0) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
if (type == "conv2d") {
op->SetInput("Input", {inputs[0]});
if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]});
if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
op->SetOutput("Output", {outputs[0]});
} else if (type == "quantize") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("Scale", scale);
} else if (type == "dequantize") {
op->SetInput("Input", {inputs[0]});
op->SetOutput("Output", {outputs[0]});
op->SetAttr("Scale", scale);
}
}
// (a,w1,b1)->Conv1->d
// d->Dequant->e
// e->Quant->f
// (f,w2,b2)->Conv2->i
ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
ProgramDesc prog;
for (auto& v : std::initializer_list<std::string>(
{"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v.find("w") == 0 || v.find("b") == 0) {
var->SetPersistable(true);
}
}
SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);
return prog;
}
static const std::initializer_list<std::string> variable_names{
"a", "b", "c", "d", "e", "f", "g", "h"};
// a->Conv1->b
// b->Dequant->c
//
// c->Quant1->d and d->Conv2->e
//
// c->Conv3->f
//
// c->Quant2->g and g->Conv4->h
//
ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
float scale3) {
ProgramDesc prog;
for (auto& v : variable_names) {
prog.MutableBlock(0)->Var(v);
}
SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);
SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);
SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);
SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);
return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
const char* var_name) {
auto x = scope->Var(var_name);
auto tensor = x->GetMutable<LoDTensor>();
tensor->mutable_data(place, proto::VarType::FP32,
::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
// Init scope, as it is used in pass
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
exe.CreateVariables(prog, 0, true, &scope);
for (auto& v : variable_names) {
InitTensorHolder(&scope, place, v.c_str());
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
}
TEST(CpuQuantizeSquashPass, equal_scales) {
auto scale = 1.2345f;
auto use_mkldnn = true;
// Remove 4 nodes: Dequant, Quant, e, f
auto remove_nodes = 4;
MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
}
TEST(CpuQuantizeSquashPass, inequal_scales) {
auto scale1 = 1.2345f;
auto scale2 = 21.0f;
auto use_mkldnn = true;
// Remove 3 nodes: Dequant, Quant, e
// Insert 1 node: requantize
auto remove_nodes = 2;
MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
}
TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
// Delete both quantize ops,
// bypass dequantize in both branches,
// insert requantize on one branch
auto scale = 1.2345f;
auto scale2 = 21.0f;
auto use_mkldnn = true;
// Remove 3 nodes: Quant1, Quant2, d
// Insert 1 node: requantize
auto remove_nodes = 2;
MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
use_mkldnn = !use_mkldnn;
MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_squash_pass);
......@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
......@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
// When we use trt or another third_party lib, the parameters are managed by
// that lib, not by fluid. So we need to record them to avoid duplicate
// allocation.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <unordered_set>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
......@@ -76,6 +76,9 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
var->inputs.push_back(node);
}
}
Set<const std::vector<OpDesc *>>(
details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(program.Block(0).AllOps()));
return var_nodes;
}
......@@ -149,6 +152,39 @@ void Graph::ResolveHazard(
}
}
std::shared_ptr<Graph> Graph::Clone() {
auto cloned_graph = std::make_shared<Graph>(this->program_);
cloned_graph->ReleaseNodes();
cloned_graph->num_node_created_ = 0;
std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
for (auto *n : this->node_set_) {
ir::Node *cloned_node = nullptr;
if (n->IsCtrlVar()) {
cloned_node = cloned_graph->CreateControlDepVar();
} else if (!n->var_desc_ && !n->op_desc_) { // empty node
cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
} else if (n->IsVar()) {
cloned_node = cloned_graph->CreateVarNode(n->Var());
} else if (n->IsOp()) {
cloned_node = cloned_graph->CreateOpNode(n->Op());
}
if (cloned_node) {
origin_to_cloned[n] = cloned_node;
} else {
PADDLE_THROW("The cloned node's type is not supported!");
}
}
for (auto *n : this->node_set_) {
for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
}
for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
}
}
return cloned_graph;
}
bool IsControlDepVar(const ir::Node &var) {
return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
......
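A hedged usage sketch for the new Clone() (per the WARN added in graph.h below, only the node structure is copied, not graph attributes):
ir::Graph graph(program);  // program is assumed to exist
std::shared_ptr<ir::Graph> copy = graph.Clone();
PADDLE_ENFORCE(copy->Nodes().size() == graph.Nodes().size());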
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
......@@ -31,7 +32,7 @@ namespace details {
// This attr is not recommended, because the graph should not depend on
// the program once it is built.
constexpr char kAllOpDescs[] = "all_op_descs";
constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
} // namespace details
namespace ir {
......@@ -195,6 +196,17 @@ class Graph {
return nullptr;
}
// Returns reference to the original program.
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
const ProgramDesc &OriginProgram() const {
LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
"quite different from OriginProgram. So, please avoid "
"using the `OriginProgram()` method!";
return program_;
}
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......@@ -206,6 +218,10 @@ class Graph {
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
// Create a new graph duplicated from this one.
// WARN: The method only clones the graph structure, not its attributes.
std::shared_ptr<Graph> Clone();
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
......
......@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
std::vector<ir::Node *> nodes;
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
nodes.push_back(adj_n);
}
}
std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
return node1->id() > node2->id();
});
adj_list[n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
......
......@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph,
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
PrettyLogEndl(Style::detail(), "--- detected %d subgraphs",
subgraphs.size());
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
......@@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() {
->AsOutput()
->assert_is_op_output("conv2d", "Output");
conv_op->LinksFrom({input_var, filter_var});
conv_op->LinksTo({output_var});
conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
return output_var;
}
PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
if (!with_residual_data)
conv_op->assert_op_attr("fuse_residual_connection", false);
auto input_var = pattern->NewNode(conv_input_repr())
->AsInput()
->assert_is_op_input("conv2d", "Input");
auto filter_var = pattern->NewNode(conv_filter_repr())
->AsInput()
->assert_is_op_input("conv2d", "Filter");
auto output_var = pattern->NewNode(conv_output_repr())
->AsOutput()
->assert_is_op_output("conv2d", "Output");
std::vector<PDNode *> links_from{input_var, filter_var};
if (with_residual_data) {
auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
->AsInput()
->assert_is_op_input("conv2d", "ResidualData");
links_from.push_back(res_conn_var);
}
conv_op->LinksFrom(links_from).LinksTo({output_var});
return output_var;
}
PDNode *patterns::Pool::operator()() {
auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
auto input_var = pattern->NewNode(pool_input_repr())
->AsInput()
->assert_is_op_input("pool2d", "X");
auto output_var = pattern->NewNode(pool_output_repr())
->AsOutput()
->assert_is_op_output("pool2d", "Out");
pool_op->LinksFrom({input_var}).LinksTo({output_var});
return output_var;
}
......@@ -1301,6 +1346,51 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var;
}
PDNode *patterns::DequantQuantAny::operator()() {
auto *dequant_in = pattern->NewNode(dequant_in_repr())
->AsInput()
->assert_is_op_input("dequantize", "Input");
auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
auto *dequant_out = pattern->NewNode(dequant_out_repr())
->AsOutput()
->assert_is_op_output("dequantize", "Output");
auto *quant_op = pattern->NewNode(quant_op_repr())
->assert_is_op("quantize")
->AsIntermediate();
auto *quant_out = pattern->NewNode(quant_out_repr())
->AsOutput()
->assert_is_op_output("quantize");
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
next_op->LinksFrom({quant_out});
return quant_out;
}
PDNode *patterns::DequantAny::operator()() {
auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
auto *dequant_out = pattern->NewNode(dequant_out_repr())
->AsOutput()
->assert_is_op_output("dequantize", "Output");
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
dequant_op->LinksTo({dequant_out});
next_op->LinksFrom({dequant_out});
return dequant_out;
}
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
......
......@@ -18,8 +18,11 @@
#include <gtest/gtest_prod.h>
#endif
#include <memory>
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -656,6 +659,35 @@ struct Conv : public PatternBase {
PATTERN_DECL_NODE(conv_output);
};
// Convolution op with residual data
struct ConvResidual : public PatternBase {
ConvResidual(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_residual") {}
PDNode* operator()(bool with_residual_data);
PATTERN_DECL_NODE(conv_op);
PATTERN_DECL_NODE(conv_input);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(conv_residual_data);
PATTERN_DECL_NODE(conv_output);
};
// Pool op
// Forward pass for pooling.
// pool_input is the input.
// pool_output is the result of the operator.
struct Pool : public PatternBase {
Pool(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "pooling") {}
PDNode* operator()();
PATTERN_DECL_NODE(pool_op);
PATTERN_DECL_NODE(pool_input);
PATTERN_DECL_NODE(pool_output);
};
// ElementwiseAdd used in residual connections.
// y_var is used as the Y input and is a convolution output.
// The operator is removed when residual
......@@ -766,6 +798,34 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out); // Out
};
// Dequantize + Quantize + anyOP
// This pattern is used for squashing the dequantize-quantize pairs.
struct DequantQuantAny : public PatternBase {
DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "dequant_quant_any") {}
PDNode* operator()();
PATTERN_DECL_NODE(dequant_in);
PATTERN_DECL_NODE(dequant_op);
PATTERN_DECL_NODE(dequant_out);
PATTERN_DECL_NODE(quant_op);
PATTERN_DECL_NODE(quant_out);
PATTERN_DECL_NODE(next_op);
};
// Dequantize + anyOP
// This pattern is used for counting the number of ops that the Dequantize's
// output is an input to.
struct DequantAny : public PatternBase {
DequantAny(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "dequant_any") {}
PDNode* operator()();
PATTERN_DECL_NODE(dequant_op);
PATTERN_DECL_NODE(dequant_out);
PATTERN_DECL_NODE(next_op);
};
struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
......
......@@ -44,10 +44,14 @@ struct TestIsReachable {
using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
auto find_node = [](const std::unique_ptr<ir::Graph>& graph,
const std::string& name) -> Node* {
auto hash = [](const Node* node) -> std::string {
return node->Name() + std::to_string(node->id());
};
auto find_node = [&](const std::unique_ptr<ir::Graph>& graph,
const std::string& name) -> Node* {
for (auto& node : GraphTraits::DFS(*graph)) {
if (name == node.Name()) {
if (name == hash(&node)) {
return &node;
}
}
......@@ -55,13 +59,17 @@ struct TestIsReachable {
return nullptr;
};
return [&](std::string from, const std::string to) -> bool {
// map the `from` and `to` names to their hashed equivalents while traversing the graph
return [&](std::string from, std::string to) -> bool {
if (from == to) return true;
std::map<std::string, bool> visited;
for (auto& node : GraphTraits::DFS(*graph)) {
visited[node.Name()] = false;
auto hashed = hash(&node);
if (node.Name() == from) from = hashed;
if (node.Name() == to) to = hashed;
visited[hashed] = false;
}
visited[from] = true;
......@@ -72,15 +80,15 @@ struct TestIsReachable {
while (!queue.empty()) {
auto cur = find_node(graph, queue.front());
queue.pop_front();
if (cur == nullptr) return false;
for (auto n : cur->outputs) {
if (n->Name() == to) return true;
auto hashed_name = hash(n);
if (hashed_name == to) return true;
if (!visited[n->Name()]) {
visited[n->Name()] = true;
queue.push_back(n->Name());
if (!visited[hashed_name]) {
visited[hashed_name] = true;
queue.push_back(hashed_name);
}
}
}
......@@ -166,6 +174,28 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
RunPassAndAssert(&prog, "a", "relu", 1);
}
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionProjectionAsYWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"},
{"bias", "weights", "bias2", "weights2"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
// right branch
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
// left branch
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}},
{"Output", "f"});
SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
RunPassAndAssert(&prog, "a", "relu", 2);
}
TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsYWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
......
......@@ -21,7 +21,7 @@ namespace ir {
std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
VLOG(3) << "Applies MKL-DNN placement strategy.";
const auto& op_types_list =
Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
for (const Node* n : graph->Nodes()) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, boost::tribool use_mkldnn) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn);
if (type == "conv2d") {
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
} else if (type == "relu") {
op->SetInput("X", inputs);
} else if (type == "concat") {
op->SetAttr("axis", 1);
op->SetInput("X", {inputs[0], inputs[1]});
} else if (type == "pool2d") {
op->SetInput("X", {inputs[0]});
} else {
FAIL() << "Unexpected operator type.";
}
op->SetOutput("Out", {outputs[0]});
}
// operator use_mkldnn
// ---------------------------------------
// (a,b)->concat->c none
// (c,weights,bias)->conv->f none
// f->relu->g false
// g->pool->h false
// (h,weights2,bias2)->conv->k true
// k->relu->l true
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
"h", "weights2", "bias2", "k", "l"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias") {
var->SetPersistable(true);
}
}
SetOp(&prog, "concat", "concat1", std::vector<std::string>({"a", "b"}),
std::vector<std::string>({"c"}), boost::indeterminate);
SetOp(&prog, "conv2d", "conv1",
std::vector<std::string>({"c", "weights", "bias"}),
std::vector<std::string>({"f"}), boost::indeterminate);
SetOp(&prog, "relu", "relu1", std::vector<std::string>({"f"}),
std::vector<std::string>({"g"}), false);
SetOp(&prog, "pool2d", "pool1", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false);
SetOp(&prog, "conv2d", "conv2",
std::vector<std::string>({"h", "weights2", "bias2"}),
std::vector<std::string>({"k"}), true);
SetOp(&prog, "relu", "relu2", std::vector<std::string>({"k"}),
std::vector<std::string>({"l"}), true);
return prog;
}
void MainTest(std::initializer_list<std::string> mkldnn_enabled_op_types,
unsigned expected_use_mkldnn_true_count) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass");
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(mkldnn_enabled_op_types));
graph = pass->Apply(std::move(graph));
unsigned use_mkldnn_true_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->HasAttr("use_mkldnn") &&
boost::get<bool>(op->GetAttr("use_mkldnn"))) {
++use_mkldnn_true_count;
}
}
}
EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count);
}
TEST(MKLDNNPlacementPass, enable_conv_relu) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool
MainTest({"conv2d", "relu"}, 3);
}
TEST(MKLDNNPlacementPass, enable_relu_pool) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest({"relu", "pool2d"}, 4);
}
TEST(MKLDNNPlacementPass, enable_all) {
// 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool
MainTest({}, 4);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(mkldnn_placement_pass);
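The test above leans on boost::tribool to model three attribute states: explicitly true, explicitly false, and not set (indeterminate), which is why SetOp writes use_mkldnn only for determinate values. A minimal, Paddle-independent demo of that semantics:

#include <iostream>
#include <boost/logic/tribool.hpp>

int main() {
  boost::tribool unset = boost::indeterminate;
  boost::tribool off = false;
  boost::tribool on = true;
  // Mirrors the SetOp guard: write the attribute only when determinate.
  std::cout << std::boolalpha
            << !boost::indeterminate(unset) << " "   // false -> attribute skipped
            << !boost::indeterminate(off) << " "     // true  -> attribute = false
            << !boost::indeterminate(on) << "\n";    // true  -> attribute = true
}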
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <memory>
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Use synchronous batch norm";
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->Type() == "batch_norm") {
op->SetType("sync_batch_norm");
}
if (op->Type() == "batch_norm_grad") {
op->SetType("sync_batch_norm_grad");
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class SyncBatchNormPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
}
// (a, conv_w)->conv2d->b
// (b, bn_scale, bn_bias, mean, var)->batch_norm
// ->(c, mean, var, save_mean, save_inv_var)
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
"bn_bias", "mean", "var", "c",
"save_mean", "save_inv_var"})) {
auto* var = prog.MutableBlock(0)->Var(v);
if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
v == "var") {
var->SetPersistable(true);
}
}
SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
std::vector<std::string>({"b"}));
SetOp(&prog, "batch_norm", "bn",
std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
std::vector<std::string>(
{"c", "mean", "var", "save_mean", "save_inv_var"}));
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "bn") {
ASSERT_EQ(op->Type(), "sync_batch_norm");
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(sync_batch_norm_pass);
......@@ -290,7 +290,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
......
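The one-token fix above addresses a symbol collision: pasting the literal token DEFAULT_TYPE meant every customized registration for the same op and library expanded to the same static variable name. A rough, self-contained illustration of the naming scheme (this is not Paddle's actual macro; the op, library, and customized names are invented):

#include <cstdio>

// Simplified sketch of the registrar-touch idiom.
#define USE_KERNEL(op, lib, custom) \
  static int use_op_kernel_##op##_##lib##_##custom##_ = 0;

USE_KERNEL(conv2d, CUDNN, FP16)  // use_op_kernel_conv2d_CUDNN_FP16_
USE_KERNEL(conv2d, CUDNN, FP32)  // distinct symbol now; with DEFAULT_TYPE
                                 // pasted in, both expanded to one name

int main() { std::puts("ok"); }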
......@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw;
throw std::move(exception);
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw;
throw std::move(exception);
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
......@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw;
throw std::move(exception);
} catch (...) {
std::rethrow_exception(std::current_exception());
}
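The change from a bare throw; to throw std::move(exception) is load-bearing: the handler catches EnforceNotMet by value and appends the callstack to that copy's err_str_, but a bare throw; rethrows the original in-flight exception object, silently discarding the enrichment. A self-contained demonstration with std::runtime_error as a stand-in:

#include <iostream>
#include <stdexcept>
#include <string>

int main() {
  try {
    try {
      throw std::runtime_error("original");
    } catch (std::runtime_error e) {  // caught by value: `e` is a copy
      // Enrich the copy, as OperatorBase::Run does with err_str_.
      e = std::runtime_error(std::string(e.what()) + " + callstack");
      throw e;  // rethrows the enriched copy; a bare `throw;` would
                // rethrow the unmodified "original" exception object
    }
  } catch (const std::runtime_error& e) {
    std::cout << e.what() << "\n";  // prints: original + callstack
  }
}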
......@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
const Variable* ExecutionContext::LegacyInputVar(
const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* ExecutionContext::OutputVar(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) return nullptr;
......@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
return Input<LoDTensor>(name);
}
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const {
return LegacyInput<LoDTensor>(name);
}
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
......@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
return res;
}
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const {
auto names = op().Inputs(name);
std::vector<const Tensor*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const Tensor* {
auto var = scope_.FindVar(sub_name);
if (var == nullptr) return nullptr;
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
}
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return Output<LoDTensor>(name);
}
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
return LegacyOutput<LoDTensor>(name);
}
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const {
......@@ -882,7 +841,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const RuntimeContext& ctx_;
};
static void CheckTensorNANOrInf(const std::string& name,
static void CheckTensorNANOrInf(const std::string& op_type,
const std::string& name,
const framework::Tensor& tensor) {
if (tensor.memory_size() == 0) {
return;
......@@ -892,9 +852,9 @@ static void CheckTensorNANOrInf(const std::string& name,
return;
}
PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
"Tensor %s contains Inf", name);
"Operator %s output Tensor %s contains Inf", op_type, name);
PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
"Tensor %s contains NAN", name);
"Operator %s output Tensor %s contains NAN", op_type, name);
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
......@@ -904,6 +864,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this->InferShape(&infer_shape_ctx);
}
std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
const OpKernelType& key) const {
auto config_iter = kernel_configs_map_.find(key);
std::vector<KernelConfig>* kernel_configs = nullptr;
if (config_iter != kernel_configs_map_.end()) {
kernel_configs = &(config_iter->second);
}
return kernel_configs;
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeContext ctx(Inputs(), Outputs(), scope);
......@@ -921,7 +891,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, ctx));
ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
......@@ -940,6 +910,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString(expected_kernel_key));
}
std::vector<KernelConfig>* kernel_configs =
GetKernelConfig(expected_kernel_key);
  // do data transform
std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
......@@ -953,11 +926,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_);
}
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only passes inputs and gets outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
kernel_iter->second(
ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered.
......@@ -974,9 +950,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
CheckTensorNANOrInf(type_, vname, var->Get<framework::LoDTensor>());
} else if (var->IsType<framework::SelectedRows>()) {
CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
CheckTensorNANOrInf(type_, vname,
var->Get<framework::SelectedRows>().value());
}
}
}
......
......@@ -16,9 +16,11 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h" // For VLOG
......@@ -28,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -59,6 +62,15 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
/// Variables with this suffix are the new Gradient.
constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// If an Op has this attribute, all its kernels should calculate the output
/// variable's shape in the corresponding Compute() function, and
/// OperatorWithKernel::RunImpl() will skip calling this Op's InferShape()
/// at runtime, for speed.
/// TODO(luotao): note that this temporary attribute will be deleted once all
/// ops compute their runtime shapes themselves.
constexpr char kAllKernelsMustComputeRuntimeShape[] =
"@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
// define some kernel priority
/* Define multiple kernel type fallback order*/
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
......@@ -184,12 +196,30 @@ class OperatorBase {
const platform::Place& place) const = 0;
};
#ifdef PADDLE_WITH_CUDA
using KernelConfig = boost::variant<
std::shared_ptr<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>,
std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>,
std::shared_ptr<AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>>;
#else
using KernelConfig = boost::variant<boost::blank>;
#endif
using OpKernelConfigsMap =
std::unordered_map<OpKernelType, std::vector<KernelConfig>,
OpKernelType::Hash>;
class ExecutionContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext& device_context,
const RuntimeContext& ctx)
: op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
const RuntimeContext& ctx,
std::vector<KernelConfig>* configs)
: op_(op),
scope_(scope),
device_context_(device_context),
ctx_(ctx),
kernel_configs_(configs) {}
const OperatorBase& op() const { return op_; }
......@@ -234,31 +264,6 @@ class ExecutionContext {
return it->second;
}
const std::vector<Variable*> LegacyMultiInputVar(
const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
template <typename T>
const T* Input(const std::string& name) const {
auto* var = InputVar(name);
......@@ -271,22 +276,6 @@ class ExecutionContext {
return var == nullptr ? nullptr : var->GetMutable<T>();
}
template <typename T>
const T* LegacyInput(const std::string& name) const {
auto* var = LegacyInputVar(name);
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* LegacyOutput(const std::string& name) const {
auto var = LegacyOutputVar(name);
return var == nullptr ? nullptr : var->GetMutable<T>();
}
const Variable* LegacyInputVar(const std::string& name) const;
Variable* LegacyOutputVar(const std::string& name) const;
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
auto it = ctx_.inputs.find(name);
......@@ -319,32 +308,6 @@ class ExecutionContext {
return res;
}
template <typename T>
const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<const T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : &var->Get<T>();
});
return res;
}
template <typename T>
std::vector<T*> LegacyMultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : var->GetMutable<T>();
});
return res;
}
platform::Place GetPlace() const { return device_context_.GetPlace(); }
template <typename DeviceContextType>
......@@ -398,34 +361,32 @@ class ExecutionContext {
return temp_tensor;
}
template <typename T>
T& GetKernelConfig(int idx) const {
PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx,
"%s selected kernel doesn't have kernel config %lu <= %d",
op_.Type().c_str(), kernel_configs_->size(), idx);
return *boost::get<std::shared_ptr<T>>(kernel_configs_->at(idx));
}
private:
const OperatorBase& op_;
const Scope& scope_;
const platform::DeviceContext& device_context_;
const RuntimeContext& ctx_;
mutable std::vector<KernelConfig>* kernel_configs_;
};
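The config plumbing above reduces to: a per-kernel-type vector of boost::variant-wrapped shared_ptrs that the ExecutionContext exposes by slot index through GetKernelConfig<T>(idx). A standalone sketch of the same pattern, with FwdAlgoCache as an invented stand-in for AlgorithmsCache<cudnnConvolutionFwdAlgo_t>:

#include <iostream>
#include <memory>
#include <vector>
#include <boost/variant.hpp>

struct FwdAlgoCache { int best_algo = 7; };  // hypothetical cached state

using KernelConfig = boost::variant<std::shared_ptr<FwdAlgoCache>>;

// Mirrors ExecutionContext::GetKernelConfig<T>(idx) shown above.
template <typename T>
T& GetKernelConfig(std::vector<KernelConfig>* configs, size_t idx) {
  return *boost::get<std::shared_ptr<T>>(configs->at(idx));
}

int main() {
  std::vector<KernelConfig> configs{std::make_shared<FwdAlgoCache>()};
  std::cout << GetKernelConfig<FwdAlgoCache>(&configs, 0).best_algo << "\n";  // 7
}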
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const;
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const;
......@@ -483,6 +444,8 @@ class OperatorWithKernel : public OperatorBase {
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
std::vector<KernelConfig>* GetKernelConfig(const OpKernelType& key) const;
protected:
virtual OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
......@@ -508,6 +471,9 @@ class OperatorWithKernel : public OperatorBase {
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
protected:
mutable OpKernelConfigsMap kernel_configs_map_;
};
extern bool OpSupportGPU(const std::string& op_type);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <unordered_map>
#include <vector>
namespace paddle {
namespace framework {
// Not thread-safe. Should be owned per-kernel.
template <typename TAlgorithm>
class AlgorithmsCache {
public:
AlgorithmsCache() : search_times_(0) { hash_.clear(); }
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations,
int algorithmFlags, // can set for different data type
std::function<TAlgorithm()> gen_func);
TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func);
private:
std::unordered_map<int64_t, TAlgorithm> hash_;
int search_times_;
};
template <typename TAlgorithm>
TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
const std::vector<int>& strides, const std::vector<int>& paddings,
const std::vector<int>& dilations, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
int64_t seed = 0;
  // Hash all of the inputs; the hash is used to look up a previously
  // discovered algorithm, falling back to generating a new one.
std::hash<int64_t> hashFn;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for (const auto num : dims1) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
for (const auto num : dims2) {
seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1;
}
for (const auto num : strides) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 2;
}
for (const auto num : paddings) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 3;
}
for (const auto num : dilations) {
seed ^= hashFn(static_cast<int64_t>(num)) + 0x9e3779b9 + (seed << 6) +
(seed >> 2) + 4;
}
seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
(seed << 6) + (seed >> 2) + 5;
if (seed == 0) return gen_func();
if (hash_.find(seed) == hash_.end()) {
TAlgorithm value = gen_func();
hash_[seed] = value;
}
return hash_[seed];
}
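The mixing above is the boost::hash_combine recipe referenced in the comment; the distinct additive offsets per argument group (+1 for dims2, +2 for strides, and so on) keep, say, equal strides and paddings from hashing identically. A standalone sketch of the combiner:

#include <cstdint>
#include <functional>
#include <iostream>

// boost::hash_combine-style mixing, as used in GetAlgorithm above.
uint64_t combine(uint64_t seed, int64_t v) {
  std::hash<int64_t> h;
  return seed ^ (h(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

int main() {
  uint64_t ab = combine(combine(0, 1), 2);
  uint64_t ba = combine(combine(0, 2), 1);
  std::cout << (ab != ba) << "\n";  // order-sensitive: prints 1 in practice
}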
template <typename TAlgorithm>
TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
int64_t area, int search_times, int algorithmFlags,
std::function<TAlgorithm()> gen_func) {
if (hash_.find(area) != hash_.end()) {
return hash_[area];
}
if (search_times_ < search_times) {
auto algo = gen_func();
hash_[area] = algo;
++search_times_;
return algo;
}
TAlgorithm algo;
int64_t min = static_cast<uint64_t>(INT_MAX);
for (const auto& m : hash_) {
if (m.first < min) {
min = m.first;
algo = m.second;
}
}
return algo;
}
} // namespace framework
} // namespace paddle
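A hedged usage sketch of the cache above: the first lookup for a given shape/flag combination runs gen_func, and later identical lookups hit the hash. TAlgorithm is int purely for the demo; the include path is the one operator.h uses for this header:

#include <cstdint>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/operator_kernel_configs.h"

int main() {
  paddle::framework::AlgorithmsCache<int> cache;
  int searches = 0;
  auto expensive_search = [&]() { ++searches; return 42; };  // stands in for a cudnn sweep

  std::vector<int64_t> x{1, 3, 224, 224}, w{64, 3, 7, 7};
  std::vector<int> strides{2, 2}, paddings{3, 3}, dilations{1, 1};

  cache.GetAlgorithm(x, w, strides, paddings, dilations, /*algorithmFlags=*/0,
                     expensive_search);  // miss: runs gen_func
  cache.GetAlgorithm(x, w, strides, paddings, dilations, 0,
                     expensive_search);  // hit: gen_func skipped
  std::cout << searches << "\n";  // 1
}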
......@@ -14,8 +14,10 @@ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
......@@ -181,12 +183,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name,
Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
......@@ -216,11 +220,13 @@ ParallelExecutor::ParallelExecutor(
}
}
std::unique_ptr<ir::Graph> temp_owned_graph(graph);
// FIXME(Yancey1989): parallel graph mode get better performance
// in GPU allreduce distributed training. Need an elegant way to
// choice the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution(
*temp_owned_graph, exec_strategy, build_strategy);
if (build_strategy.enable_parallel_graph_)
VLOG(0) << "The Executor would execute the graph by ParallelGraph "
"Execution which can get better performance,"
......@@ -247,33 +253,67 @@ ParallelExecutor::ParallelExecutor(
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_));
std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
// Initialize device context's nccl comm
      // Note: if more than one ParallelExecutor uses the same place, the nccl
      // comm will be overwritten, which causes problems.
for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
pool.Get(member_->places_[dev_id]));
dev_ctx->set_nccl_comm(nccl_ctx.comm());
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars);
// broadcast parameters from the 0th device to others:
auto need_broadcast = [&]() -> bool {
if (build_strategy.num_trainers_ > 1) {
      // 1. num_trainers is greater than 1 for nccl distributed training.
return true;
} else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
// 2. Only one trainer process, but ParallelExecutor hold multiple
// devices.
return true;
}
return false;
};
if (need_broadcast()) {
BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
}
  // Startup Program has been run. All local scopes have correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std::unique_ptr<ir::Graph> graph;
  // Startup Program has been run. All local scopes have correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_, member_->nccl_ctxs_.get());
temp_owned_graph = build_strategy.Apply(
std::move(temp_owned_graph), member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
#else
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_);
temp_owned_graph = build_strategy.Apply(
std::move(temp_owned_graph), member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_, member_->use_cuda_);
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
graph = member_->PrepareGCAndRefCnts(std::move(graph),
static_cast<size_t>(max_memory_size));
graph = member_
->PrepareGCAndRefCnts(std::move(temp_owned_graph),
static_cast<size_t>(max_memory_size))
.release();
} else {
graph = temp_owned_graph.release();
}
// Step 3. Create vars in each scope. Passes may also create new vars.
......@@ -308,8 +348,7 @@ ParallelExecutor::ParallelExecutor(
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, main_program,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
#else
PADDLE_THROW(
"Paddle should be compiled with CUDA for ParallelGraph Execution.");
......@@ -317,12 +356,10 @@ ParallelExecutor::ParallelExecutor(
} else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graph)));
exec_strategy, member_->local_scopes_, member_->places_, graph));
}
}
......@@ -332,7 +369,7 @@ ParallelExecutor::ParallelExecutor(
}
void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const {
const std::vector<std::string> &vars, int trainer_id) const {
// the initializing bcast, all vars would be bcast from device(0).
for (auto &var : vars) {
framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
......@@ -356,7 +393,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto place = member_->places_[i];
void *buffer;
if (i == 0) {
if (i == 0 && trainer_id == 0) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......@@ -452,24 +489,33 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
delete member_;
}
bool ParallelExecutor::EnableParallelGraphExecution(
const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
const ir::Graph &graph, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const {
if (!FLAGS_enable_parallel_graph) return false;
bool enable_parallel_graph = true;
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
for (auto &var_desc : main_program.Block(0).AllVars()) {
if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
enable_parallel_graph = false;
}
}
// TODO(Yancey1989): support pserver mode
for (auto &op_desc : main_program.Block(0).AllOps()) {
if (op_desc->Type() == "send" || op_desc->Type() == "recv") {
enable_parallel_graph = false;
break;
for (ir::Node *node : graph.Nodes()) {
if (node->IsVar() && node->Var()) {
// TODO(Yancey1989): support sparse update in ParallelGraph mode.
if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) {
enable_parallel_graph = false;
break;
}
} else if (node->IsOp() && node->Op()) {
// TODO(Yancey1989): support pserver mode
if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") {
enable_parallel_graph = false;
break;
}
}
}
......@@ -481,13 +527,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
return enable_parallel_graph;
}
ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
delete member_;
}
} // namespace framework
} // namespace paddle
......
......@@ -24,3 +24,11 @@ limitations under the License. */
#pragma pop_macro("_XOPEN_SOURCE")
#pragma pop_macro("_POSIX_C_SOURCE")
#if !defined(PYBIND11_HIDDEN)
#ifdef _WIN32
#define PYBIND11_HIDDEN __declspec(dllexport)
#else
#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
#endif
#endif
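For context on this hunk: pybind11 binding types are conventionally given hidden visibility so RTTI from different shared objects does not clash, and on Windows the nearest define used here is dllexport. A small self-contained sketch of applying such a macro (BindingState is invented):

#if !defined(PYBIND11_HIDDEN)
#ifdef _WIN32
#define PYBIND11_HIDDEN __declspec(dllexport)
#else
#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
#endif
#endif

// Hypothetical helper: hidden from the dynamic symbol table on Linux/macOS,
// exported from the DLL on Windows.
class PYBIND11_HIDDEN BindingState {
 public:
  int ref_count = 0;
};

int main() {
  BindingState s;
  return s.ref_count;
}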
......@@ -50,8 +50,6 @@ class Scope;
} // namespace framework
namespace operators {
template <typename T>
class AlgorithmsCache;
class CudnnRNNCache;
......@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#ifndef _WIN32
ncclUniqueId, platform::Communicator,
#endif
operators::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>,
operators::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>,
operators::CudnnRNNCache,
#endif
int, float>;
......
......@@ -16,6 +16,7 @@ add_subdirectory(utils)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
# add_subdirectory(anakin)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
......
cc_library(anakin_op_converter SRCS fc.cc registrar.cc DEPS anakin_engine framework_proto scope)
cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
(The diffs of the remaining changed files are collapsed in the original view.)