Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into change_requirements

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into change_requirements
test=develop
1b372d9b · minqiyang · 53760bb1 · 1d9b2a45 · 1b372d9b · 1b372d9b
422 changed file
--- a/.gitignore
+++ b/.gitignore
+python/paddle/fluid/tests/unittests/reader_reset_test.recordio
 paddle/operators/check_t.save
 paddle/operators/check_tensor.ls
 paddle/operators/tensor.save
 python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
 python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
 python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
+paddle/fluid/operators/distributed/send_recv.proto
 *.DS_Store
 *.vs
 build/
@@ -28,4 +30,5 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
+paddle/fluid/operators/distributed/send_recv.proto
 model_test
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -42,6 +42,7 @@
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
+| Sand3r- | Michal Gallus |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

+if (ON_INFER)
+    message(STATUS "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
    add_subdirectory(python)
@@ -312,11 +320,3 @@ if(WITH_DOC)
    find_python_module(recommonmark REQUIRED)
    add_subdirectory(doc)
 endif()
-
-if (ON_INFER)
-    message(STATUS "On inference mode, will take place some specific optimization.")
-    add_definitions(-DPADDLE_ON_INFERENCE)
-else()
-    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -218,3 +218,7 @@ endif(WITH_GRPC)
 if(WITH_BRPC_RDMA)
    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
 endif(WITH_BRPC_RDMA)
+
+if(ON_INFER)
+    add_definitions(-DPADDLE_ON_INFERENCE)
+endif(ON_INFER)
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -166,8 +166,8 @@ copy(framework_lib DEPS ${framework_lib_deps}

 set(module "memory")
 copy(memory_lib
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
+        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
        )

 set(inference_deps paddle_fluid_shared paddle_fluid)

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
+set(PART_CUDA_KERNEL_FILES)
+function(op_library TARGET)
+    # op_library is a function to create op library. The interface is same as
+    # cc_library. But it handle split GPU/CPU code and link some common library
+    # for ops.
+    set(cc_srcs)
+    set(cu_srcs)
+    set(hip_cu_srcs)
+    set(miopen_hip_cc_srcs)
+    set(cu_cc_srcs)
+    set(cudnn_cu_cc_srcs)
+    set(CUDNN_FILE)
+    set(mkldnn_cc_srcs)
+    set(MKLDNN_FILE)
+    set(op_common_deps operator op_registry math_function)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    set(pybind_flag 0)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH op_library_SRCS op_library_SRCS_len)
+    if (${op_library_SRCS_len} EQUAL 0)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+            list(APPEND cc_srcs ${TARGET}.cc)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+            list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+            list(APPEND cu_srcs ${TARGET}.cu)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+            set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
+                    ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
+            list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+        endif()
+
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+        endif()
+        string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
+            list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
+        endif()
+        if(WITH_AMD_GPU)
+            string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+                list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+            endif()
+        endif()
+        if(WITH_MKLDNN)
+            string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
+                list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+            endif()
+        endif()
+    else()
+        foreach(src ${op_library_SRCS})
+            if (${src} MATCHES ".*\\.hip.cu$")
+                list(APPEND hip_cu_srcs ${src})
+            elseif (${src} MATCHES ".*\\.cu$")
+                list(APPEND cu_srcs ${src})
+            elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
+                list(APPEND cudnn_cu_cc_srcs ${src})
+            elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+                list(APPEND miopen_hip_cc_srcs ${src})
+            elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
+                list(APPEND mkldnn_cc_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cu.cc$")
+                list(APPEND cu_cc_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cc$")
+                list(APPEND cc_srcs ${src})
+            else()
+                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+            endif()
+        endforeach()
+    endif()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (${cc_srcs_len} EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    endif()
+    if (WIN32)
+    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
+     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
+      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
+          return()
+        endif()
+    endforeach()
+    endif(WIN32)
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
+
+    list(LENGTH op_library_DEPS op_library_DEPS_len)
+    if (${op_library_DEPS_len} GREATER 0)
+        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+    endif()
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+            ${op_common_deps})
+    endif()
+
+    # Define operators that don't need pybind here.
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
+"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
+        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
+
+    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
+    # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
+    # And for detail pybind information, please see generated paddle/pybind/pybind.h.
+    file(READ ${TARGET}.cc TARGET_CONTENT)
+    string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
+    if (one_register STREQUAL "")
+        string(REPLACE "_op" "" TARGET "${TARGET}")
+    else ()
+        string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
+        string(REPLACE "," "" TARGET "${TARGET}")
+    endif()
+
+    # pybind USE_NO_KERNEL_OP
+    # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
+    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
+    string(REPLACE "_op" "" TARGET "${TARGET}")
+    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+
+    # pybind USE_CPU_ONLY_OP
+    list(LENGTH cu_srcs cu_srcs_len)
+    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+    list(LENGTH hip_cu_srcs hip_cu_srcs_len)
+    list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+
+    # pybind USE_OP_DEVICE_KERNEL for CUDNN
+    list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
+    if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
+    endif()
+
+    # pybind USE_OP_DEVICE_KERNEL for MIOPEN
+    if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
+    endif()
+
+    # pybind USE_OP_DEVICE_KERNEL for MKLDNN
+    if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+      # Append first implemented MKLDNN activation operator
+      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      else()
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+      endif()
+    endif()
+
+    # pybind USE_OP
+    if (${pybind_flag} EQUAL 0)
+      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
+      if(${TARGET} STREQUAL "activation")
+        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+      elseif(${TARGET} STREQUAL "fake_dequantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "fake_quantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
+      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
+      elseif(${TARGET} STREQUAL "fc")
+        # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
+      else()
+        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+      endif()
+    endif()
+endfunction()
+
+
+function(register_operators)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs EXCLUDES DEPS)
+    cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+    string(REPLACE "_mkldnn" "" OPS "${OPS}")
+    string(REPLACE ".cc" "" OPS "${OPS}")
+    list(REMOVE_DUPLICATES OPS)
+    list(LENGTH register_operators_DEPS register_operators_DEPS_len)
+
+    foreach(src ${OPS})
+        list(FIND register_operators_EXCLUDES ${src} _index)
+        if (${_index} EQUAL -1)
+            if (${register_operators_DEPS_len} GREATER 0)
+                op_library(${src} DEPS ${register_operators_DEPS})
+            else()
+                op_library(${src})
+            endif()
+        endif()
+    endforeach()
+endfunction()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -93,11 +93,11 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
 paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
 paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
 paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
+paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"

--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -30,6 +30,8 @@ class ExceptionHolder {
      Catch(exp);
    } catch (platform::EnforceNotMet exp) {
      Catch(exp);
+    } catch (std::exception& ex) {
+      LOG(FATAL) << "std::exception caught, " << ex.what();
    } catch (...) {
      LOG(FATAL) << "Unknown exception caught";
    }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                          &(ctx->cur_ref_cnts_));
    }
-
-    if (FLAGS_benchmark) {
-      VLOG(20) << "Memory used after operator " + op->Type() + " running: "
-               << memory::memory_usage(place_);
-    }
  }

  if (gc != nullptr) {
@@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
      scope->DropKids();
    }
  }
-
-  if (FLAGS_benchmark) {
-    VLOG(20) << "-------------------------------------------------------";
-    VLOG(20) << "Memory used after deleting local scope: "
-             << memory::memory_usage(place_);
-    VLOG(20) << "-------------------------------------------------------";
-  }
 }

 void Executor::RunPreparedContext(

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
 pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
+pass_library(is_test_pass base)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
    pass_library(depthwise_conv_mkldnn_pass base)
@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)

--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -14,14 +14,15 @@

 #include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
 #include <functional>
-#include <utility>
+#include <list>
+#include <map>
+#include <tuple>

 #include "paddle/fluid/framework/ir/graph_traits.h"

 namespace paddle {
 namespace framework {
 namespace ir {
-namespace {

 // The function keeps the graph consistent by replacing
 // a node 'from' in the set of inputs nodes
@@ -51,99 +52,382 @@ void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
    }
  }
 }
-}  // namespace
-using graph_ptr = std::unique_ptr<ir::Graph>;

-graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
-  FusePassBase::Init(name_scope_, graph.get());
+bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
+  auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
+    for (auto n : graph->Nodes()) {
+      if (n == node) {
+        return n;
+      }
+    }

-  GraphPatternDetector gpd;
-  auto pattern = gpd.mutable_pattern();
+    return nullptr;
+  };

-  patterns::Conv conv_pattern{pattern, name_scope_};
-  auto conv_output = conv_pattern();
+  if (from == to) {
+    return true;
+  }

-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
-  elementwise_add_pattern(conv_output);
+  std::map<Node*, bool> visited;

-  conv_output->AsIntermediate();
+  for (auto& node : GraphTraits::DFS(*graph)) {
+    visited[&node] = false;
+  }

-  auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> {
-    auto bias_input_names = conv_op.Op()->Inputs();
-    auto bias_it = bias_input_names.find("Bias");
-
-    if (bias_it != std::end(bias_input_names)) {
-      bool has_bias = !bias_it->second.empty();
-
-      if (has_bias) {
-        auto conv_bias_names = bias_it->second;
-        auto conv_bias_names_it =
-            std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
-                         [&conv_bias_names](Node* n) -> bool {
-                           return n->Name() == conv_bias_names[0];
-                         });
-        return std::make_pair(has_bias, *conv_bias_names_it);
-      }
-    }
+  visited[from] = true;

-    return std::make_pair(false, nullptr);
-  };
+  std::list<Node*> queue;
+  queue.push_back(from);

-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
-    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              elementwise_add_pattern);
+  while (!queue.empty()) {
+    auto cur = find_node(graph, queue.front());
+    queue.pop_front();

-    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    if (!cur) return false;

-    OpDesc op_desc;
-    op_desc.SetType("conv2d");
+    for (auto n : cur->outputs) {
+      if (n == to) {
+        return true;
+      }

-    op_desc.SetInput("Input", {conv_input->Name()});
-    op_desc.SetInput("Filter", {conv_filter->Name()});
-    op_desc.SetInput("ResidualData", {elementwise_add_x->Name()});
-    op_desc.SetOutput("Output", {conv_output->Name()});
+      if (!visited[n]) {
+        visited[n] = true;
+        queue.push_back(n);
+      }
+    }
+  }
+  return false;
+}

-    bool has_bias;
-    Node* conv_bias;
+boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) {
+  auto bias_input_names = op.Op()->Inputs();
+  auto bias_it = bias_input_names.find(bias_name);

-    std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op);
+  if (bias_it != std::end(bias_input_names)) {
+    bool has_bias = !bias_it->second.empty();

    if (has_bias) {
-      op_desc.SetInput("Bias", {conv_bias->Name()});
+      auto bias_names = bias_it->second;
+      auto bias_names_it =
+          std::find_if(std::begin(op.inputs), std::end(op.inputs),
+                       [&bias_names](Node* n) -> bool {
+                         return n->Name() == bias_names[0];
+                       });
+      return *bias_names_it;
    }
+  }

-    for (const auto& attr : conv_op->Op()->GetAttrMap()) {
-      op_desc.SetAttr(attr.first, attr.second);
-    }
+  return boost::none;
+}

-    op_desc.SetAttr("fuse_residual_connection", true);
+ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
+    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
+    const ResidualConnectionMKLDNNFusePass::IdentityConvFunc&
+        get_node_from_conv_op,
+    const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc&
+        get_node_from_elementwise_add_op)
+    : fusion_stats{std::make_shared<int>(0)},
+      can_fuse_func{can_fuse_func},
+      get_node_from_conv_op{get_node_from_conv_op},
+      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
+
+void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
+    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+  Node* conv_op;
+  Node* conv_input;
+  Node* conv_filter;
+  Node* conv_output;
+
+  Node* elementwise_add_op;
+  Node* elementwise_add_identity;
+  Node* elementwise_add_out;
+
+  std::tie(conv_op, conv_input, conv_filter, conv_output) =
+      get_node_from_conv_op(subgraph);
+  std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) =
+      get_node_from_elementwise_add_op(subgraph);
+
+  if (!can_fuse_func(conv_op, elementwise_add_op)) return;
+
+  if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
+
+  OpDesc op_desc;
+  op_desc.SetType("conv2d");
+
+  op_desc.SetInput("Input", {conv_input->Name()});
+  op_desc.SetInput("Filter", {conv_filter->Name()});
+  op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()});
+  op_desc.SetOutput("Output", {conv_output->Name()});
+
+  auto conv_bias = HasBias(*conv_op, "Bias");
+
+  if (conv_bias) {
+    op_desc.SetInput("Bias", {(*conv_bias)->Name()});
+  }

-    auto fused_conv_op = g->CreateOpNode(&op_desc);
+  for (const auto& attr : conv_op->Op()->GetAttrMap()) {
+    op_desc.SetAttr(attr.first, attr.second);
+  }

-    IR_NODE_LINK_TO(conv_input, fused_conv_op);
-    IR_NODE_LINK_TO(conv_filter, fused_conv_op);
-    IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op);
-    IR_NODE_LINK_TO(fused_conv_op, conv_output);
+  op_desc.SetAttr("fuse_residual_connection", true);

-    if (has_bias) {
-      IR_NODE_LINK_TO(conv_bias, fused_conv_op);
-    }
+  auto fused_conv_op = graph->CreateOpNode(&op_desc);

-    CorrectGraphEdges(g, elementwise_add_out, conv_output);
-    GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op});
-  };
+  IR_NODE_LINK_TO(conv_input, fused_conv_op);
+  IR_NODE_LINK_TO(conv_filter, fused_conv_op);
+  IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
+  IR_NODE_LINK_TO(fused_conv_op, conv_output);

-  gpd(graph.get(), handler);
+  if (conv_bias) {
+    IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
+  }

+  CorrectGraphEdges(graph, elementwise_add_out, conv_output);
+  GraphSafeRemoveNodes(graph,
+                       {elementwise_add_out, conv_op, elementwise_add_op});
+  (*fusion_stats)++;
+}
+
+ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle(
+    const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
+    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
+        get_node_from_conv_x_op,
+    const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
+        get_node_from_conv_y_op,
+    const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc&
+        get_node_from_elementwise_add_op)
+    : fusion_stats{std::make_shared<int>(0)},
+      can_fuse_func{can_fuse_func},
+      get_node_from_conv_x_op{get_node_from_conv_x_op},
+      get_node_from_conv_y_op{get_node_from_conv_y_op},
+      get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
+
+void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
+    const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+  Node* conv_x_op;
+  Node* conv_x_input;
+  Node* conv_x_filter;
+  Node* conv_x_output;
+
+  Node* conv_y_op;
+  Node* conv_y_input;
+  Node* conv_y_filter;
+  Node* conv_y_output;
+
+  Node* elementwise_add_op;
+  Node* elementwise_add_out;
+
+  std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) =
+      get_node_from_conv_x_op(subgraph);
+  std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) =
+      get_node_from_conv_y_op(subgraph);
+  std::tie(elementwise_add_op, elementwise_add_out) =
+      get_node_from_elementwise_add_op(subgraph);
+
+  if (!can_fuse_func(conv_x_op, elementwise_add_op)) return;
+  if (!can_fuse_func(conv_y_op, elementwise_add_op)) return;
+
+  Node* projection_node;
+  Node* residual_conv_op;
+  Node* residual_conv_input;
+  Node* residual_conv_filter;
+  Node* residual_conv_output;
+
+  if (IsReachable(graph, conv_x_input, conv_y_output)) {
+    projection_node = conv_x_output;
+    residual_conv_op = conv_y_op;
+    residual_conv_input = conv_y_input;
+    residual_conv_filter = conv_y_filter;
+    residual_conv_output = conv_y_output;
+  } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
+    projection_node = conv_y_output;
+    residual_conv_op = conv_x_op;
+    residual_conv_input = conv_x_input;
+    residual_conv_filter = conv_x_filter;
+    residual_conv_output = conv_x_output;
+  } else {
+    return;
+  }
+
+  OpDesc op_desc;
+  op_desc.SetType("conv2d");
+
+  op_desc.SetInput("Input", {residual_conv_input->Name()});
+  op_desc.SetInput("Filter", {residual_conv_filter->Name()});
+  op_desc.SetInput("ResidualData", {projection_node->Name()});
+  op_desc.SetOutput("Output", {residual_conv_output->Name()});
+
+  auto residual_conv_bias = HasBias(*residual_conv_op, "Bias");
+
+  if (residual_conv_bias) {
+    op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
+  }
+
+  for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
+    op_desc.SetAttr(attr.first, attr.second);
+  }
+
+  op_desc.SetAttr("fuse_residual_connection", true);
+
+  auto fused_conv_op = graph->CreateOpNode(&op_desc);
+
+  IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
+  IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
+  IR_NODE_LINK_TO(projection_node, fused_conv_op);
+  IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
+
+  if (residual_conv_bias) {
+    IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
+  }
+
+  CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
+  GraphSafeRemoveNodes(
+      graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
+  (*fusion_stats)++;
+}
+
+std::tuple<Node*, Node*, Node*, Node*>
+ResidualConnectionMKLDNNFusePass::GetNodesFromConv(
+    const patterns::Conv& conv_pattern,
+    const GraphPatternDetector::subgraph_t& subgraph) const {
+  GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+  GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+  GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+  GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+  return std::make_tuple(conv_op, conv_input, conv_filter, conv_output);
+}
+
+GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
+    const std::string& name_scope,
+    const GraphWithStats& graph_with_stats) const {
+  ir::Graph* graph;
+  int stats;
+
+  std::tie(graph, stats) = graph_with_stats;
+
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+
+  patterns::Conv conv_pattern{pattern, name_scope};
+  auto conv_output = conv_pattern();
+
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
+  elementwise_add_pattern(
+      conv_output,
+      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
+  conv_output->AsIntermediate();
+
+  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
+      const GraphPatternDetector::subgraph_t& subgraph)
+      -> std::tuple<Node*, Node*, Node*> {
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                                  elementwise_add_pattern);
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
+                                  elementwise_add_pattern);
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                                  elementwise_add_pattern);
+
+        return std::make_tuple(elementwise_add_op, elementwise_add_y,
+                               elementwise_add_out);
+      };
+
+  return ExecuteHandleOnGraph<IdentityFuseHandle>(
+      &gpd, graph_with_stats,
+      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
+        return GetNodesFromConv(conv_pattern, subgraph);
+      },
+      get_node_from_elementwise_add);
+}
+
+GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
+    const std::string& name_scope,
+    const GraphWithStats& graph_with_stats) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+
+  patterns::Conv conv_pattern{pattern, name_scope};
+  auto conv_output = conv_pattern();
+
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
+  elementwise_add_pattern(
+      pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
+      conv_output);
+  conv_output->AsIntermediate();
+
+  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
+      const GraphPatternDetector::subgraph_t& subgraph)
+      -> std::tuple<Node*, Node*, Node*> {
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                                  elementwise_add_pattern);
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                                  elementwise_add_pattern);
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                                  elementwise_add_pattern);
+
+        return std::make_tuple(elementwise_add_op, elementwise_add_x,
+                               elementwise_add_out);
+      };
+
+  return ExecuteHandleOnGraph<IdentityFuseHandle>(
+      &gpd, graph_with_stats,
+      [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
+        return GetNodesFromConv(conv_pattern, subgraph);
+      },
+      get_node_from_elementwise_add);
+}
+
+GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
+    const std::string& name_scope,
+    const GraphWithStats& graph_with_stats) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+
+  patterns::Conv conv_x_pattern{pattern, name_scope};
+  auto conv_x_output = conv_x_pattern();
+
+  patterns::Conv conv_y_pattern{pattern, name_scope};
+  auto conv_y_output = conv_y_pattern();
+
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
+  elementwise_add_pattern(conv_x_output, conv_y_output);
+  conv_x_output->AsIntermediate();
+  conv_y_output->AsIntermediate();
+
+  auto get_node_from_elementwise_add = [&elementwise_add_pattern](
+      const GraphPatternDetector::subgraph_t& subgraph)
+      -> std::tuple<Node*, Node*> {
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                                  elementwise_add_pattern);
+        GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                                  elementwise_add_pattern);
+
+        return std::make_tuple(elementwise_add_op, elementwise_add_out);
+      };
+
+  return ExecuteHandleOnGraph<ProjectionFuseHandle>(
+      &gpd, graph_with_stats,
+      [this,
+       &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
+        return GetNodesFromConv(conv_x_pattern, subgraph);
+      },
+      [this,
+       &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
+        return GetNodesFromConv(conv_y_pattern, subgraph);
+      },
+      get_node_from_elementwise_add);
+}
+
+graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  auto fused_graph_with_stats = FuseConvAsY(
+      name_scope_,
+      FuseConvAsX(
+          name_scope_,
+          FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0))));
+
+  std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl;
+  AddStatis(fused_graph_with_stats.second);
  return graph;
 }
 }  // namespace ir
@@ -151,4 +435,4 @@ graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
 }  // namespace paddle

 REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
-              paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass);
+              paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -15,24 +15,119 @@
 #pragma once

 #include <string>
+#include <tuple>
+#include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"

+#include <boost/optional.hpp>
+
 namespace paddle {
 namespace framework {
 namespace ir {

-class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+using graph_ptr = std::unique_ptr<ir::Graph>;
+using GraphWithStats = std::pair<ir::Graph*, int>;
+
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
+bool IsReachable(ir::Graph* graph, Node* from, Node* to);
+boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
+
+class ResidualConnectionMKLDNNFusePass : public FusePassBase {
+ private:
+  GraphWithStats FuseConvAsX(const std::string& name_scope,
+                             const GraphWithStats& graph_with_stats) const;
+  GraphWithStats FuseConvAsY(const std::string& name_scope,
+                             const GraphWithStats& graph_with_stats) const;
+  GraphWithStats FuseProjectionConv(
+      const std::string& name_scope,
+      const GraphWithStats& graph_with_stats) const;
+
+  template <typename RetType>
+  using GetNodeFunc =
+      std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
+  using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
+  using IdentityElementwiseAddFunc =
+      GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
+
+  using ProjectionConvFunc = IdentityConvFunc;
+  using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
+
+  using CanFuseFunc = std::function<bool(Node*, Node*)>;
+
+  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
+      const patterns::Conv& conv_pattern,
+      const GraphPatternDetector::subgraph_t& subgraph) const;
+
+  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
+      const patterns::Conv& conv_pattern,
+      const GraphPatternDetector::subgraph_t& subgraph) const;
+
+  template <typename HandleType, typename... OpFuncs>
+  GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
+                                      const GraphWithStats& graph_with_stats,
+                                      OpFuncs&&... op_funcs) const {
+    ir::Graph* graph;
+    int stats;
+
+    std::tie(graph, stats) = graph_with_stats;
+
+    auto can_fuse = [this](Node* op1, Node* op2) -> bool {
+      return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
+    };
+
+    auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
+
+    (*gpd)(graph, fuse_handle);
+
+    return std::make_pair(graph, stats + fuse_handle.get_stats());
+  }
+
+  struct IdentityFuseHandle {
+    IdentityFuseHandle(
+        const CanFuseFunc& can_fuse_func,
+        const IdentityConvFunc& get_node_from_conv_op,
+        const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
+
+    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* graph);
+    int get_stats() const { return *fusion_stats; }
+
+   private:
+    std::shared_ptr<int> fusion_stats;
+    CanFuseFunc can_fuse_func;
+    IdentityConvFunc get_node_from_conv_op;
+    IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
+  };
+
+  struct ProjectionFuseHandle {
+    ProjectionFuseHandle(
+        const CanFuseFunc& can_fuse_func,
+        const ProjectionConvFunc& get_node_from_conv_x_op,
+        const ProjectionConvFunc& get_node_from_conv_y_op,
+        const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
+
+    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* graph);
+    int get_stats() const { return *fusion_stats; }
+
+   private:
+    std::shared_ptr<int> fusion_stats;
+    CanFuseFunc can_fuse_func;
+    ProjectionConvFunc get_node_from_conv_x_op;
+    ProjectionConvFunc get_node_from_conv_y_op;
+    ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
+  };
+
 public:
-  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+  virtual ~ResidualConnectionMKLDNNFusePass() {}

 protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;

-  const std::string name_scope_{"residual_connections_fuse_pass"};
+  const std::string name_scope_{"residual_connection_fuse_pass"};
 };
-
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
  op->SetOutput(output.first, {output.second});
 }

-struct IsReachable {
+struct TestIsReachable {
  using func = std::function<bool(const std::string&, const std::string&)>;

  auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
@@ -89,7 +89,9 @@ struct IsReachable {
  }
 };

-void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
+                    int expected_conv_count,
+                    int expected_elementwise_add_count = 0) {
  int conv_count = 0;
  int elementwise_add_count = 0;

@@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
      ++elementwise_add_count;
    }
  }
-  EXPECT_EQ(conv_count, 1);
-  EXPECT_EQ(elementwise_add_count, 0);
+  EXPECT_EQ(conv_count, expected_conv_count);
+  EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
 }

 ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
@@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,

  return prog;
 }
-}  // namespace
-
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
-  auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
-
-  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});

-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
+                      const std::string& to, int expected_conv_num) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));

-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  TestIsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)(from, to));

  auto pass =
      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();

-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_TRUE(is_reachable(graph)(from, to));

  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
            current_nodes_num);

-  AssertOpsCount(graph);
+  AssertOpsCount(graph, expected_conv_num);
 }
+}  // namespace

-TEST(ConvElementwiseAddMKLDNNFusePass,
-     ConvolutionWithElementwiseAddReluNoBias) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
-  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});

-  IsReachable is_reachable;
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});

-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});

-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
-  int current_nodes_num = graph->Nodes().size();
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}

-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionAsYWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});

-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});

-  AssertOpsCount(graph);
+  RunPassAndAssert(&prog, "a", "relu", 1);
 }

-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
+
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});

-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});

-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "d"));
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}

-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
-  int current_nodes_num = graph->Nodes().size();
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionAsXWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});

-  EXPECT_FALSE(is_reachable(graph)("a", "d"));
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});

-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-  AssertOpsCount(graph);
+  RunPassAndAssert(&prog, "a", "relu", 1);
 }

-TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
  auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
+
  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
        {"Output", "c"});
-  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
-  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});

-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
+        {"Output", "e"});

-  IsReachable is_reachable;
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
+  SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});

-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  TestIsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "g"));

  auto pass =
      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
  graph = pass->Apply(std::move(graph));
  int current_nodes_num = graph->Nodes().size();

-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  EXPECT_TRUE(is_reachable(graph)("a", "g"));
+  EXPECT_EQ(original_nodes_num, current_nodes_num);

-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-  AssertOpsCount(graph);
+  AssertOpsCount(graph, 2, 1);
 }

 }  // namespace ir

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -15,8 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
 #include <deque>
+#include <fstream>
+#include <iosfwd>
+#include <ostream>
 #include <unordered_set>

+DEFINE_string(print_sub_graph_dir, "",
+              "FLAGS_print_sub_graph_dir is used "
+              "to print the nodes of sub_graphs.");
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
    graph_nodes.emplace_back(g_nodes);
  }

-  if (VLOG_IS_ON(100)) {
-    VLOG(100) << "graph_num: " << graph_nodes.size();
-    for (auto &g_n : graph_nodes) {
-      VLOG(100) << "graph_nodes: " << g_n.size();
-      if (g_n.size() < 10) {
-        std::stringstream out;
+  if (FLAGS_print_sub_graph_dir.size()) {
+    if (graph_nodes.size() > 1) {
+      std::stringstream out;
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size() << "\n";
+      }
+      out << "\n\n";
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size();
        for (auto &node : g_n) {
          out << "\nNode: " << node->Name() << " in [";
          for (auto &n : node->inputs) {
@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
          }
          out << "]";
        }
-        VLOG(100) << out.str();
+        out << "\n\n\n";
      }
+      std::unique_ptr<std::ostream> fout(
+          new std::ofstream(FLAGS_print_sub_graph_dir));
+      PADDLE_ENFORCE(fout->good());
+      *fout << out.str();
    }
  }


--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
  return output_var;
 }

-PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
                                ->assert_is_op("elementwise_add");

-  x_var->assert_is_op_input("elementwise_add", "X");
-
-  auto y_var = pattern->NewNode(elementwise_add_x_repr())
-                   ->AsInput()
-                   ->assert_is_op_input("elementwise_add", "Y");
-
+  x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
+  y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
  auto out_var = pattern->NewNode(elementwise_add_out_repr())
                     ->AsOutput()
                     ->assert_is_op_output("elementwise_add", "Out");

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "elementwise_add") {}

-  PDNode* operator()(PDNode* x_var);
+  PDNode* operator()(PDNode* x_var, PDNode* y_var);

  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_x);

--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/is_test_pass.h"
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
+             "for activations and pooling.";
+  auto op_list = {"pool2d",      "sigmoid",      "logsigmoid",
+                  "softshrink",  "exp",          "brelu",
+                  "pow",         "leaky_relu",   "stanh",
+                  "relu",        "tanh",         "tanh_shrink",
+                  "sqrt",        "abs",          "ceil",
+                  "elu",         "floor",        "cos",
+                  "sin",         "round",        "reciprocal",
+                  "hard_shrink", "hard_sigmoid", "relu6",
+                  "soft_relu",   "swish",        "thresholded_relu",
+                  "log",         "square",       "softplus",
+                  "softsign"};
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("is_test")) {
+        op->SetAttr("is_test", true);
+      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
+                 end(op_list)) {
+        op->MutableAttrMap()->insert(
+            std::pair<std::string, Attribute>("is_test", true));
+      }
+    }
+  }
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ b/paddle/fluid/framework/ir/is_test_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class IsTestPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/is_test_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+enum class ISTEST_STATE { FALSE, TRUE, UNSET };
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false,
+           ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("name", name);
+  op->SetInput("X", inputs);
+  op->SetOutput("Out", outputs);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  if (is_test == ISTEST_STATE::UNSET)
+    op->MutableAttrMap()->erase("is_test");
+  else if (is_test == ISTEST_STATE::FALSE)
+    op->SetAttr("is_test", false);
+  else
+    op->SetAttr("is_test", true);
+}
+
+// a->pool2d->b
+// b->relu->c
+// c,weights1)->conv2d->d
+//
+// d->pool2d->e
+// e->hard_sigmoid->f
+// (f,weights2)->conv2d->g
+//
+// g->pool2d->h
+// h->tanh->i
+// (i,weights3)->conv2d->j
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
+                                 "j", "weights1", "weights2", "weights3"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights1" || v == "weights2" || v == "weights3") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
+  SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
+  SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
+        std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
+
+  SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
+        std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
+  SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
+        std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
+  SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
+        std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
+
+  SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
+        std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
+  SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
+        std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
+  SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
+        std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
+
+  return prog;
+}
+
+TEST(IsTestPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("is_test_pass");
+
+  graph = pass->Apply(std::move(graph));
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      auto op_name = boost::get<std::string>(op->GetAttr("name"));
+      if (op_name == "conv3") {
+        ASSERT_FALSE(op->HasAttr("is_test"));
+      } else {
+        ASSERT_TRUE(op->HasAttr("is_test"));
+        EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
+      }
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(is_test_pass);
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
 public:
  LoDTensor() : Tensor() {}

-  /* Constructor with place should only be used in pybind */
-  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
-
  explicit LoDTensor(const LoD& lod) : lod_(lod) {}

  void set_lod(const LoD& lod) { lod_ = lod; }

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"

 #include "glog/logging.h"
@@ -31,46 +32,6 @@ namespace paddle {
 namespace framework {

 #if defined(PADDLE_WITH_CUDA)
-namespace details {
-struct CUDABuffer {
-  void *data_{nullptr};
-  size_t size_{0};
-  platform::CUDAPlace place_;
-
-  CUDABuffer() {}
-  CUDABuffer(platform::Place place, size_t size)
-      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-    data_ = memory::Alloc(place_, size);
-  }
-
-  ~CUDABuffer() { ClearMemory(); }
-
-  CUDABuffer(const CUDABuffer &o) = delete;
-  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-
-  void Resize(platform::Place place, size_t size) {
-    ClearMemory();
-    place_ = boost::get<platform::CUDAPlace>(place);
-    data_ = memory::Alloc(place_, size);
-    PADDLE_ENFORCE_NOT_NULL(data_);
-    size_ = size;
-  }
-
-  void Swap(CUDABuffer &o) {
-    std::swap(data_, o.data_);
-    std::swap(place_, o.place_);
-    std::swap(size_, o.size_);
-  }
-
- private:
-  void ClearMemory() const {
-    if (data_ != nullptr) {
-      memory::Free(place_, data_);
-    }
-  }
-};
-}  // namespace details
-
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -103,8 +64,6 @@ class Vector {
      o.ImmutableCPU();
      cpu_ = o.cpu_;
      flag_ = kDataInCPU;
-      details::CUDABuffer null;
-      gpu_.Swap(null);
      return *this;
    }

@@ -199,7 +158,7 @@ class Vector {
      PADDLE_ENFORCE(platform::is_gpu_place(place),
                     "CUDA Data must on CUDA place");
      ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_.data_);
+      return reinterpret_cast<T *>(gpu_->ptr());
    }

    // get cuda ptr. mutable
@@ -234,13 +193,11 @@ class Vector {

    std::mutex &Mutex() const { return mtx_; }

-    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
-      if (gpu_.data_ == nullptr) {
-        return nullptr;
-      } else {
-        return std::unique_ptr<platform::CUDAPlace>(
-            new platform::CUDAPlace(gpu_.place_));
-      }
+    boost::optional<platform::CUDAPlace> CUDAPlace() const {
+      return gpu_ == nullptr
+                 ? boost::none
+                 : boost::optional<platform::CUDAPlace>(
+                       boost::get<platform::CUDAPlace>(gpu_->place()));
    }

   private:
@@ -254,13 +211,12 @@ class Vector {
    void CopyToCPU() const {
      // COPY GPU Data To CPU
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(
-              platform::Place(gpu_.place_)));
+          platform::DeviceContextPool::Instance().Get(gpu_->place()));
      auto stream = dev_ctx->stream();
-      void *src = gpu_.data_;
+      void *src = gpu_->ptr();
      void *dst = cpu_.data();
-      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   stream);
+      memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+                   gpu_->size(), stream);
      dev_ctx->Wait();
    }

@@ -277,8 +233,7 @@ class Vector {
          CopyCPUDataToCUDA(place);
          UnsetFlag(kDirty);
          SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() &&
-                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+        } else if (IsInCUDA() && !(place == gpu_->place())) {
          PADDLE_THROW("This situation should not happen");
          // Still dirty
        } else {
@@ -290,7 +245,7 @@ class Vector {
          // Even data is not dirty. However, data is not in CUDA. Copy data.
          CopyCPUDataToCUDA(place);
          SetFlag(kDataInCUDA);
-        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+        } else if (!(place == gpu_->place())) {
          PADDLE_THROW("This situation should not happen.");
        } else {
          // Not Dirty && DataInCUDA && Device is same
@@ -301,13 +256,13 @@ class Vector {

    void CopyCPUDataToCUDA(const platform::Place &place) const {
      void *src = cpu_.data();
-      gpu_.Resize(place, cpu_.size() * sizeof(T));
-      void *dst = gpu_.data_;
+      gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_->ptr();
      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
          platform::DeviceContextPool::Instance().Get(place));
      auto stream = dev_ctx->stream();
-      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
-                   stream);
+      memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+                   gpu_->size(), stream);
    }

    void ImmutableCPU() const {
@@ -329,7 +284,7 @@ class Vector {
    bool IsInCPU() const { return flag_ & kDataInCPU; }

    mutable std::vector<T> cpu_;
-    mutable details::CUDABuffer gpu_;
+    mutable memory::AllocationPtr gpu_;
    mutable int flag_;

    mutable std::mutex mtx_;
@@ -428,8 +383,8 @@ class Vector {
      auto &mtx = m_.Data().Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+      if (cuda_place == boost::none ||
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
        return m_.Data().CUDAData(place);
      }
    }
@@ -444,8 +399,8 @@ class Vector {
      auto &mtx = m_.Data().Mutex();
      std::lock_guard<std::mutex> guard(mtx);
      auto cuda_place = m_.Data().CUDAPlace();
-      if (cuda_place == nullptr ||
-          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+      if (cuda_place == boost::none ||
+          cuda_place == boost::get<platform::CUDAPlace>(place)) {
        return m_.MutableData()->CUDAMutableData(place);
      }
    }

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
 }

 void NaiveExecutor::Run() {
+#ifndef PADDLE_ON_INFERENCE
+  LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
+                              "cmake flag ON_INFER is not set.";
+  LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
+                              "variables will be reused to save the allocation "
+                              "overhead.";
+  LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
+                              "setting the cmake flag ON_INFER=ON if you are "
+                              "running Paddle Inference";
+#endif  // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
            << " on scope " << scope_;

--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -63,6 +63,8 @@ struct OpKernelType {
        place_(dev_ctx.GetPlace()),
        library_type_(library_type) {}

+  size_t hash_key() const { return Hash()(*this); }
+
  bool operator==(const OpKernelType& o) const {
    return platform::places_are_same_class(place_, o.place_) &&
           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {

+// Combine two hash values to a single hash.
+inline size_t CombineHash(size_t seed, size_t a) {
+  return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
 std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
 Scope* OperatorWithKernel::TryTransferData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars) const {
+// In the inference scenerio, the scopes will be reused across the batches, so
+// the `new_scope` here will result in GPU memroy explosion over the running of
+// operators.
+// We use a thread_local cache to fix that issue, the key in the cache is the
+// combination of the `scope` argument, from_kernel_type, target_kernel_type.
+// Have a discussion with @Superjomn or the inference developers if some changes
+// on this logic for this macro might not tested on the other scenerios.
+#ifdef PADDLE_ON_INFERENCE
+  thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
+#endif
+
  Scope* new_scope = nullptr;
  for (auto& var_name_item : Inputs()) {
    for (auto& var_name : var_name_item.second) {
@@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
      VLOG(30) << "Transform Variable " << var_name << " from "
               << kernel_type_for_var << " to " << expected_kernel_key;

+#ifdef PADDLE_ON_INFERENCE
+      size_t infer_cache_key =
+          CombineHash(OpKernelType::Hash()(kernel_type_for_var),
+                      OpKernelType::Hash()(expected_kernel_key));
+      infer_cache_key =
+          CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
+
+      auto it = infer_transfer_scope_cache.find(infer_cache_key);
+      if (it != infer_transfer_scope_cache.end()) {
+        new_scope = infer_transfer_scope_cache[infer_cache_key];
+      } else {
+        new_scope = &scope.NewScope();
+        infer_transfer_scope_cache[infer_cache_key] = new_scope;
+      }
+#endif
+
      if (new_scope == nullptr) {
        new_scope = &scope.NewScope();
      }

      auto* trans_var = new_scope->Var(var_name);
+
      Tensor out;
      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
      SetTensorToVariable(*var, out, trans_var);

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -100,6 +100,7 @@ class OperatorBase {

  const std::string& Type() const { return type_; }

+  bool HasAttr(const std::string& name) const { return attrs_.count(name); }
  template <typename T>
  inline const T& Attr(const std::string& name) const {
    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
  }
  // If the loss_var_name is given, the number of graph should be only one.
  if (loss_var_name.size()) {
-    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                      "The number of graph should be only one");
+    size_t graph_num = ir::GraphNum(*graph);
+    if (graph_num > 1) {
+      LOG(WARNING)
+          << "The number of graph should be only one, "
+             "but the current graph has "
+          << ir::GraphNum(*graph)
+          << " sub_graphs. If you want to see the nodes of the "
+             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
+             "to specify the output dir. NOTES: if you not do training, "
+             "please don't pass loss_var_name.";
+    }
  }

  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -42,7 +42,7 @@ DEFINE_double(
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
 #define SCOPE_LOCK_GUARD
 #else
 #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);

--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
 }

 void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           memory::Allocator::Attr attr,
                           size_t requested_size) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
+  type_ = type;
  PADDLE_ENFORCE_GE(numel(), 0,
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
+    holder_ = memory::AllocShared(place, size, attr);
    offset_ = 0;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }

-void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
+                           size_t requested_size) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type(), requested_size);
+  return mutable_data(place, type_, attr, requested_size);
 }

 Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
    Tensor dst;
    dst.holder_ = holder_;
    dst.set_layout(layout_);
+    dst.type_ = type_;
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);

--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -67,12 +67,7 @@ class Tensor {
  friend struct EigenVector;

 public:
-  Tensor() : offset_(0) {}
-
-  /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
-    holder_->set_place(place);
-  }
+  Tensor() : type_(typeid(float)), offset_(0) {}

  /*! Return a pointer to mutable memory block. */
  template <typename T>
@@ -89,12 +84,17 @@ class Tensor {
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(platform::Place place, size_t requested_size = 0);
+  T* mutable_data(platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);

  void* mutable_data(platform::Place place, std::type_index type,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
                     size_t requested_size = 0);

-  void* mutable_data(platform::Place place, size_t requested_size = 0);
+  void* mutable_data(platform::Place place,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                     size_t requested_size = 0);

  /**
   * @brief     Return a pointer to mutable memory block.
@@ -106,7 +106,9 @@ class Tensor {
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
+  T* mutable_data(DDim dims, platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);

  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;
@@ -139,7 +141,7 @@ class Tensor {
  std::type_index type() const {
    PADDLE_ENFORCE_NOT_NULL(
        holder_, "Tensor not initialized yet when Tensor::type() is called.");
-    return holder_->type();
+    return type_;
  }

  // memory size returns the holding memory size in byte.
@@ -153,56 +155,13 @@ class Tensor {

  void clear() { holder_ = nullptr; }

- private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of Variable.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-    virtual void* ptr() const = 0;
-    virtual size_t size() const = 0;
-    virtual std::type_index type() const = 0;
-    virtual platform::Place place() const = 0;
-    virtual void set_type(std::type_index type) = 0;
-    virtual void set_place(platform::Place place) = 0;
-  };
-
-  template <typename Place>
-  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
-          place_(place),
-          size_(size),
-          type_(type) {
-      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
-                              (is_cpu_place(place_) ? "CPU" : "GPU"));
-    }
-
-    virtual size_t size() const { return size_; }
-    virtual platform::Place place() const { return place_; }
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return type_; }
-    virtual void set_type(std::type_index type) { type_ = type; }
-    virtual void set_place(platform::Place place) { place_ = place; }
-
-    /*! the pointer of memory block. */
-    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
-
-    /*! the place of memory block. */
-    platform::Place place_;
-
-    /*! the size of memory block. */
-    size_t size_;
-
-    /* the current type of memory */
-    std::type_index type_;
-  };
+  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
+  size_t offset() const { return offset_; }

+ private:
  /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
-
+  std::shared_ptr<memory::Allocation> holder_;
+  std::type_index type_;
  /**
   * @brief points to elements dimensions.
   *

--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -23,10 +23,10 @@ namespace framework {
 template <typename T>
 inline const T* Tensor::data() const {
  check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());

  return reinterpret_cast<const T*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
  check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                              offset_);
 }

 template <typename T>
 inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               memory::Allocator::Attr attr,
                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place, requested_size);
+  return mutable_data<T>(place, attr, requested_size);
 }

 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+inline T* Tensor::mutable_data(platform::Place place,
+                               memory::Allocator::Attr attr,
+                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
+  return reinterpret_cast<T*>(
+      mutable_data(place, typeid(T), attr, requested_size));
 }

 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {

--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) {
    TensorToStream(oss, gpu_tensor, gpu_ctx);

    std::istringstream iss(oss.str());
-    TensorFromStream(iss, &dst_tensor, gpu_ctx);
+    TensorFromStream(
+        iss, &dst_tensor,
+        *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));

    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
    for (int i = 0; i < 6; ++i) {

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -13,7 +13,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
    SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})

 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -7,16 +7,17 @@ set(analysis_deps # analysis_deps can be extended accross the project
 add_subdirectory(ir_passes)
 add_subdirectory(passes)

-cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES})
+cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api)
+
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper)

 cc_library(argument SRCS argument.cc DEPS scope proto_desc)
 cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc)

 cc_library(analysis SRCS
  analyzer.cc
-  helper.cc
  analysis_pass
-  DEPS ${analysis_deps}
+  DEPS ${analysis_deps} analysis_helper
  )

 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -30,6 +30,7 @@ TEST(Analyzer, analysis_without_tensorrt) {
  Argument argument;
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+  argument.SetUseGPU(false);

  Analyzer analyser;
  analyser.Run(&argument);
@@ -41,6 +42,7 @@ TEST(Analyzer, analysis_with_tensorrt) {
  argument.SetTensorRtWorkspaceSize(1 << 20);
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+  argument.SetUseGPU(false);

  Analyzer analyser;
  analyser.Run(&argument);

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -116,6 +116,7 @@ struct Argument {
                      std::vector<std::string>);

  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
+  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
                      std::function<bool(const framework::ir::Node*)>);

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -4,4 +4,6 @@ set(analysis_deps ${analysis_deps}
        subgraph_detector tensorrt_subgraph_pass
        CACHE INTERNAL "")

+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
 set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // it is either an OP's input or an OP's output.

  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); index++) {
+  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
    auto correspond_node = subgraph_nodes[index];
    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
      std::unordered_set<std::string> teller_set(
          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-           "elementwise_add", "dropout", "split"});
+           "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+           "conv2d_transpose", "leaky_relu"});
      if (!node->IsOp()) return false;

      if (teller_set.count(node->Op()->Type())) {

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -30,15 +30,28 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
  if (!argument->scope_valid()) {
    argument->SetScope(new framework::Scope);
  }
+  PADDLE_ENFORCE(argument->use_gpu_valid());
+
+  // The load program should run on the same device with the inference program,
+  // so that the parameters will on the same device, or they will keep copying
+  // between difference devices.
+  platform::Place place;
+  if (argument->use_gpu()) {
+    PADDLE_ENFORCE(argument->gpu_device_id_valid());
+    place = platform::CUDAPlace(argument->gpu_device_id());
+  } else {
+    place = platform::CPUPlace();
+  }

  if (argument->model_dir_valid()) {
-    auto program = LoadModel(argument->model_dir(), argument->scope_ptr());
+    auto program =
+        LoadModel(argument->model_dir(), argument->scope_ptr(), place);
    argument->SetMainProgram(program.release());
  } else if (argument->model_program_path_valid() &&
             argument->model_params_path_valid()) {
    auto program =
        LoadModel(argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr());
+                  argument->scope_ptr(), place);
    argument->SetMainProgram(program.release());
  } else {
    PADDLE_THROW(
@@ -52,16 +65,15 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
 }

 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
-    const std::string &path, framework::Scope *scope) {
-  platform::CPUPlace place;
+    const std::string &path, framework::Scope *scope,
+    const platform::Place &place) {
  framework::Executor exe(place);
  return Load(&exe, scope, path);
 }

 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
    const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope) {
-  platform::CPUPlace place;
+    framework::Scope *scope, const platform::Place &place) {
  framework::Executor exe(place);
  return Load(&exe, scope, program_path, params_path);
 }

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -17,6 +17,7 @@
 #include <string>
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace inference {
@@ -32,11 +33,12 @@ class IrGraphBuildPass : public AnalysisPass {
  std::string repr() const override;

 private:
-  std::unique_ptr<framework::ProgramDesc> LoadModel(const std::string &path,
-                                                    framework::Scope *scope);
+  std::unique_ptr<framework::ProgramDesc> LoadModel(
+      const std::string &path, framework::Scope *scope,
+      const platform::Place &place);
  std::unique_ptr<framework::ProgramDesc> LoadModel(
      const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope);
+      framework::Scope *scope, const platform::Place &place);

  std::string model_binary_str_;
 };

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -27,11 +27,10 @@ endif()
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
-cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
-cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
-
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
+cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor)

 cc_test(test_paddle_inference_api
        SRCS api_tester.cc

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -285,6 +285,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  status_program_optimized_ = true;

  argument_.SetUseGPU(config_.use_gpu);
+  argument_.SetGPUDeviceId(config_.device);
  // Analyze inference_program
  if (!config_.model_dir.empty()) {
    argument_.SetModelDir(config_.model_dir);
@@ -491,8 +492,7 @@ bool AnalysisPredictor::LoadParameters() {
  }

  // Use NaiveExecutor to Load parameters.
-  platform::CPUPlace place;
-  framework::NaiveExecutor e(place);
+  framework::NaiveExecutor e(place_);
  e.Prepare(scope_.get(), *load_program, 0, false);
  e.Run();
  VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";
@@ -549,4 +549,7 @@ USE_TRT_CONVERTER(concat);
 USE_TRT_CONVERTER(dropout);
 USE_TRT_CONVERTER(pad);
 USE_TRT_CONVERTER(split);
+USE_TRT_CONVERTER(prelu);
+USE_TRT_CONVERTER(conv2d_transpose);
+USE_TRT_CONVERTER(leaky_relu);
 #endif
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -86,6 +86,7 @@ class CpuPassStrategy : public PassStrategy {
        "fc_fuse_pass",                  //
        "conv_bn_fuse_pass",             //
        "conv_eltwiseadd_bn_fuse_pass",  //
+        "is_test_pass",                  //
    });
  }

@@ -115,8 +116,12 @@ class CpuPassStrategy : public PassStrategy {
 class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
+    // TODO(NHZlX) Problem with Data synchronization between GPU and CPU
+    // When running in GPU mode, the parameters are all on GPU. But the
+    // opearations of "conv_bn_fuse_pass" are on CPU.
    passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",
+        // "infer_clean_graph_pass", "conv_bn_fuse_pass",
    });
  }


--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)

--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
-pad_op.cc split_op.cc
-  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
+           SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
+                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
+           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)

 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
+             elementwise_add_op elementwise_mul_op SERIAL)
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
 nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op SERIAL)
 nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op SERIAL)
 nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
 nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
-split_op concat_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
+             split_op concat_op SERIAL)
+nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
+        prelu_op SERIAL)
+nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -18,92 +18,139 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-bool to_skip_merging_optimize(TensorRTEngine* engine_,
+bool to_skip_merging_optimize(TensorRTEngine* engine,
                              const std::vector<int>& filters,
                              const std::vector<int>& strides,
                              const std::vector<int>& paddings,
                              std::string input_name) {
-  if (engine_->itensor_quote_num[input_name] > 0) {
+  if (engine->itensor_quote_num[input_name] > 0) {
    return true;
  }
  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-    engine_->itensor_quote_num[input_name] += 1;
+    engine->itensor_quote_num[input_name] += 1;

  return false;
 }

+template <typename RegistFunc, typename SetDilationFunc>
+void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
+                   const framework::Scope& scope, bool test_mode,
+                   RegistFunc fadd_layer, SetDilationFunc fset_dilation,
+                   const std::string& name) {
+  VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
+
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
+  PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
+
+  PADDLE_ENFORCE(engine != nullptr);
+  auto* X = engine->GetITensor(op_desc.Input("Input").front());
+
+  // Declare weights
+  auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
+  PADDLE_ENFORCE_NOT_NULL(Y_v);
+  auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+
+  platform::CPUPlace cpu_place;
+  std::unique_ptr<framework::LoDTensor> weight_tensor(
+      new framework::LoDTensor());
+  weight_tensor->Resize(Y_t->dims());
+  TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
+
+  auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
+
+  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
+  const int n_output = weight_tensor->dims()[0];
+  const int n_input = weight_tensor->dims()[1];
+  const int filter_h = weight_tensor->dims()[2];
+  const int filter_w = weight_tensor->dims()[3];
+  const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+  const std::vector<int> dilations =
+      boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+  const std::vector<int> strides =
+      boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+  const std::vector<int> paddings =
+      boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+  nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
+  nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
+  nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+  nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+  TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                static_cast<void*>(weight_data),
+                                static_cast<size_t>(weight_tensor->numel())};
+
+  TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+  auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
+                           nv_ksize, weight, bias);
+  PADDLE_ENFORCE(layer != nullptr);
+  layer->setStride(nv_strides);
+  layer->setPadding(nv_paddings);
+  layer->setNbGroups(groups);
+  // set dilations
+  fset_dilation(layer, nv_dilations);
+
+  auto output_name = op_desc.Output("Output").front();
+  layer->setName((name + " (Output: " + output_name + ")").c_str());
+  engine->weight_map[op_desc.Input("Filter").front()] =
+      std::move(weight_tensor);
+  layer->getOutput(0)->setName(output_name.c_str());
+  engine->SetITensor(output_name, layer->getOutput(0));
+
+  if (test_mode ||
+      to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
+                               op_desc.Input("Input").front())) {
+    engine->DeclareOutput(output_name);
+  }
+}
+
 class Conv2dOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias";
-
-    framework::OpDesc op_desc(op, nullptr);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
-
-    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
-
-    // Declare weights
-    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
-    PADDLE_ENFORCE_NOT_NULL(Y_v);
-    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-
-    platform::CPUPlace cpu_place;
-    std::unique_ptr<framework::LoDTensor> weight_tensor(
-        new framework::LoDTensor());
-    weight_tensor->Resize(Y_t->dims());
-    TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
-
-    auto* weight_data =
-        weight_tensor->mutable_data<float>(platform::CPUPlace());
-
-    PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
-    const int n_output = weight_tensor->dims()[0];
-    const int filter_h = weight_tensor->dims()[2];
-    const int filter_w = weight_tensor->dims()[3];
-
-    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
-    const std::vector<int> dilations =
-        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
-    const std::vector<int> strides =
-        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-    const std::vector<int> paddings =
-        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-
-    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
-    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
-    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-                                  static_cast<void*>(weight_data),
-                                  weight_tensor->memory_size() / sizeof(float)};
-
-    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    auto* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
-        nv_ksize, weight.get(), bias.get());
-    PADDLE_ENFORCE(layer != nullptr);
-    layer->setStride(nv_strides);
-    layer->setPadding(nv_paddings);
-    layer->setDilation(nv_dilations);
-    layer->setNbGroups(groups);
-
-    auto output_name = op_desc.Output("Output").front();
-    layer->setName(("conv2d (Output: " + output_name + ")").c_str());
-    engine_->weight_map[op_desc.Input("Filter").front()] =
-        std::move(weight_tensor);
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
-
-    if (test_mode ||
-        to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
-                                 paddings, op_desc.Input("Input").front())) {
-      engine_->DeclareOutput(output_name);
-    }
+    ConvertConv2d(
+        engine_, op, scope, test_mode,
+        [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
+            int n_input,                             /* Conv input maps */
+            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+            TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
+          auto* layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
+                                   ksize, weight.get(), bias.get());
+          return layer;
+        },
+        [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+          layer->setDilation(dilations);
+        },
+        "conv2d");
+  }
+};
+
+class Deconv2dOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    ConvertConv2d(
+        engine_, op, scope, test_mode,
+        [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
+            int n_input,                             /* Deconv output maps */
+            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
+            TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
+          auto* layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
+                                   ksize, weight.get(), bias.get());
+          return layer;
+        },
+        [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
+          PADDLE_ENFORCE(
+              dilations.d[0] == 1 && dilations.d[1] == 1,
+              "Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
+              dilations.d[0], dilations.d[1]);
+        },
+        "conv2d_transpose");
  }
 };

@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter {
 }  // namespace paddle

 REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
+REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {

+static bool CheckDims(const nvinfer1::Dims& dims_x,
+                      const nvinfer1::Dims& dims_y) {
+  if (dims_x.nbDims != dims_y.nbDims) {
+    return false;
+  }
+  for (int i = 0; i < dims_x.nbDims; i++) {
+    if (dims_x.d[i] != dims_y.d[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
 class ElementwiseWeightOpConverter : public OpConverter {
 public:
  ElementwiseWeightOpConverter() {}
@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";

    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -34,7 +48,8 @@ class ElementwiseWeightOpConverter : public OpConverter {

    auto* X = engine_->GetITensor(op_desc.Input("X").front());
    nvinfer1::Dims dims_x = X->getDimensions();
-    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+    PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims experts 3, but %d is given.",
+                   dims_x.nbDims);

    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
    PADDLE_ENFORCE_NOT_NULL(Y_v);
@@ -105,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
  ElementwiseTensorOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
+    auto op_pair = ops.find(op_type_);
+    PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
+
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";

    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -119,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
    nvinfer1::Dims dims_x = X->getDimensions();
    nvinfer1::Dims dims_y = Y->getDimensions();

-    // The two input tensor should have the same dims
-    PADDLE_ENFORCE(dims_x.nbDims >= 3);
-    if (dims_x.nbDims == dims_y.nbDims) {
-      for (int i = 0; i < dims_x.nbDims; i++) {
-        if (dims_x.d[i] != dims_y.d[i])
-          PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
-      }
-    } else {
-      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
-    }
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    auto output_name = op_desc.Output("Out")[0];
+    if (CheckDims(dims_x, dims_y)) {
+      // The two input tensor should have the same dims
+      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";

-    auto op_pair = ops.find(op_type_);
-    if (op_pair == ops.end()) {
-      PADDLE_THROW("Wrong elementwise op type!");
-    }
-    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
-        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
+      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
+          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);

-    auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("elementwise (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
+      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+    } else {
+      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
+                 "ElementWisePluginLayer";
+
+      plugin::ElementWisePlugin* plugin =
+          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
+      plugin->AddInput(X);
+      plugin->AddInput(Y);
+      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
+          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
+          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
+
+      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+    }
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);

--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// LeakyRelu converter from fluid to tensorRT
+class LeakyReluOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert fluid leaky_relu op to tensorrt layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE(input_num == 1);
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Get output
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE(output_num == 1);
+    // Get attrs
+    float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
+
+    platform::CPUPlace place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor(
+        new framework::LoDTensor());
+    alpha_tensor->Resize(framework::make_ddim({2}));
+    float* alpha_data = alpha_tensor->mutable_data<float>(place);
+    alpha_data[0] = alpha;
+    alpha_data[1] = 1.f - alpha;
+    // the leaky relu formula y = (x > 0) ? x : alpha * x is equal to
+    // y = alpha * x + (x > 0) ? (1 - alpha) * x : 0
+    TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1};
+    TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    // y_scale = alpha * x
+    auto* scale_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+        scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_layer);
+    // y_relu = (x > 0) : x : 0
+    auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input,
+                                            nvinfer1::ActivationType::kRELU);
+    PADDLE_ENFORCE(nullptr != relu_layer);
+    //
+    TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1],
+                                     1};
+    auto* scale_relu_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)),
+                             nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+                             sub_scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_relu_layer);
+    auto* output_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
+                             *(scale_relu_layer->getOutput(0)),
+                             nvinfer1::ElementWiseOperation::kSUM);
+    PADDLE_ENFORCE(nullptr != output_layer);
+    // keep alpha tensor to avoid release it's memory
+    std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
+    PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
+                   engine_->weight_map.end());
+    engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+
+    std::string layer_name = "leaky_relu (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    output_layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, output_layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    output_layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -61,7 +61,7 @@ class OpConverter {
      // TODO(xingzhaolong): all mul, sub, div
      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
      // "sub", "div"};
-      static std::unordered_set<std::string> add_weight_op_set{"add"};
+      static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
      int op_type_len = op_desc.Type().size();
      std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);

--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * PRelu converter from fluid to tensorRT.
+ */
+class PReluOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE(input_num == 1);
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Get output
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE(output_num == 1);
+    // Get attrs
+    std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
+    //
+    auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
+    PADDLE_ENFORCE_NOT_NULL(alpha_var);
+    auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
+
+    platform::CUDAPlace place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
+        new framework::LoDTensor());
+    alpha_tensor_device->Resize(alpha_tensor->dims());
+    TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
+    float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
+
+    // Transform alpha to TensorRTEngine::Weight
+    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
+                                    static_cast<void*>(alpha_data),
+                                    alpha_tensor_device->numel());
+    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
+    nvinfer1::IPluginLayer* layer =
+        engine_->AddPlugin(&input, input_num, plugin);
+    // keep alpha tensor to avoid release it's memory
+    engine_->weight_map[op_desc.Input("Alpha")[0]] =
+        std::move(alpha_tensor_device);
+
+    std::string layer_name = "prelu (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc
@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(40) << "convert a fluid split op to tensorrt split layer";
+    VLOG(4) << "convert a fluid split op to tensorrt split layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
@@ -50,7 +50,7 @@ class SplitOpConverter : public OpConverter {
    PADDLE_ENFORCE(output_lengths.size() == output_num);

    //
-    SplitPlugin* plugin = new SplitPlugin(axis, output_lengths);
+    plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);


--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

+USE_OP(conv2d);
+USE_OP(conv2d_transpose);
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
@@ -51,7 +54,37 @@ TEST(conv2d_op, test) {
  validator.Execute(3);
 }

+TEST(conv2d_transpose_op, test) {
+  std::unordered_set<std::string> parameters({"deconv2d-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+
+  validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
+  validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
+  validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("conv2d_transpose");
+  desc.SetInput("Input", {"deconv2d-X"});
+  desc.SetInput("Filter", {"deconv2d-Y"});
+  desc.SetOutput("Output", {"deconv2d-Out"});
+
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("dilations", dilations);
+  desc.SetAttr("groups", groups);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(3);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
-USE_OP(conv2d);
--- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -20,13 +20,12 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-TEST(elementwise_op, add_weight_test) {
+TEST(elementwise_op, add_weight) {
  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
-  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));

  // Prepare Op description
@@ -44,30 +43,65 @@ TEST(elementwise_op, add_weight_test) {
  validator.Execute(8);
 }

-TEST(elementwise_op, add_tensor_test) {
-  std::unordered_set<std::string> parameters;
-  framework::Scope scope;
-  TRTConvertValidation validator(8, parameters, scope, 1 << 15);
-  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
-  validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
-  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
-  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
-
-  // Prepare Op description
-  framework::OpDesc desc;
-  desc.SetType("elementwise_add");
-  desc.SetInput("X", {"elementwise_add-X"});
-  desc.SetInput("Y", {"elementwise_add-Y"});
-  desc.SetOutput("Out", {"elementwise_add-Out"});
-
-  // the defalut axis of elementwise op is -1
-
-  validator.SetOp(*desc.Proto());
+TEST(elementwise_op, native) {
+  for (std::string type : {"add", "mul"}) {
+    int batch_size = 8;
+    std::unordered_set<std::string> parameters;
+    framework::Scope scope;
+    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
+    validator.DeclInputVar("elementwise_" + type + "-X",
+                           nvinfer1::DimsCHW(10, 3, 3));
+    validator.DeclInputVar("elementwise_" + type + "-Y",
+                           nvinfer1::Dims3(10, 3, 3));
+    validator.DeclOutputVar("elementwise_" + type + "-Out",
+                            nvinfer1::DimsCHW(10, 3, 3));
+
+    // Prepare Op description
+    framework::OpDesc desc;
+    desc.SetType("elementwise_" + type);
+    desc.SetInput("X", {"elementwise_" + type + "-X"});
+    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
+    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
+
+    int axis = -1;
+    desc.SetAttr("axis", axis);
+
+    validator.SetOp(*desc.Proto());
+    validator.Execute(batch_size);
+  }
+}

-  validator.Execute(8);
+TEST(elementwise_op, plugin) {
+  for (std::string type : {"add", "mul"}) {
+    int batch_size = 8;
+    std::unordered_set<std::string> parameters;
+    framework::Scope scope;
+    TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15);
+    validator.DeclInputVar("elementwise_" + type + "-X",
+                           nvinfer1::DimsCHW(10, 3, 3));
+    validator.DeclInputVar("elementwise_" + type + "-Y",
+                           nvinfer1::Dims3(10, 1, 1));
+    validator.DeclOutputVar("elementwise_" + type + "-Out",
+                            nvinfer1::DimsCHW(10, 3, 3));
+
+    // Prepare Op description
+    framework::OpDesc desc;
+    desc.SetType("elementwise_" + type);
+    desc.SetInput("X", {"elementwise_" + type + "-X"});
+    desc.SetInput("Y", {"elementwise_" + type + "-Y"});
+    desc.SetOutput("Out", {"elementwise_" + type + "-Out"});
+
+    int axis = -1;
+    desc.SetAttr("axis", axis);
+
+    validator.SetOp(*desc.Proto());
+    validator.Execute(batch_size);
+  }
 }

 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
 USE_OP(elementwise_add);
+USE_OP(elementwise_mul);
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(leaky_relu_op, test_leaky_relu) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("leaky_relu");
+  desc.SetInput("X", {"leaky_relu_input"});
+  desc.SetOutput("Out", {"leaky_relu_out"});
+
+  desc.SetAttr("alpha", 0.1f);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+// USE_OP(leaky_relu);
+USE_OP(leaky_relu);
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/op_registry.h"

--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(prelu_op, test_channel_wise) {
+  std::unordered_set<std::string> parameters({"prelu_alpha"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
+  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("prelu");
+  desc.SetInput("X", {"prelu_input"});
+  desc.SetInput("Alpha", {"prelu_alpha"});
+  desc.SetOutput("Out", {"prelu_out"});
+
+  desc.SetAttr("mode", std::string("channel"));
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+TEST(prelu_op, test_element_wise) {
+  std::unordered_set<std::string> parameters({"prelu_alpha"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
+  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("prelu");
+  desc.SetInput("X", {"prelu_input"});
+  desc.SetInput("Alpha", {"prelu_alpha"});
+  desc.SetOutput("Out", {"prelu_out"});
+
+  desc.SetAttr("mode", std::string("element"));
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+TEST(prelu_op, test_scalar) {
+  std::unordered_set<std::string> parameters({"prelu_alpha"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
+  validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
+  validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("prelu");
+  desc.SetInput("X", {"prelu_input"});
+  desc.SetInput("Alpha", {"prelu_alpha"});
+  desc.SetOutput("Out", {"prelu_out"});
+
+  desc.SetAttr("mode", std::string("all"));
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+// USE_OP(prelu);
+USE_CPU_ONLY_OP(prelu);
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,

--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
 Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
+                 name);
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot_offset];
 }
@@ -256,9 +257,10 @@ void TensorRTEngine::freshDeviceId() {
 }

 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
-    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
+    nvinfer1::ITensor *const *inputs, int num_inputs,
+    plugin::PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
-  return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
+  return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
 }

 }  // namespace tensorrt

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase {
  // Weight is model parameter.
  class Weight {
   public:
+    Weight() = default;
    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
      w_.type = dtype;
      w_.values = value;
@@ -127,7 +128,7 @@ class TensorRTEngine : public EngineBase {
  int GetRuntimeBatch();
  int GetDevice() { return device_; }
  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
-                                    int nbInputs, PluginTensorRT*);
+                                    int num_inputs, plugin::PluginTensorRT*);

  // A pointer to CPU memory is needed of the TRT weight.
  // Before TRT runs, fluid loads weight into GPU storage.
@@ -170,7 +171,7 @@ class TensorRTEngine : public EngineBase {

  // The specific GPU id that the TensorRTEngine bounded to.
  int device_;
-  std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_;
+  std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;

  // TensorRT related internal members
  template <typename T>

--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin
+           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
+           DEPS enforce tensorrt_engine)
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+namespace details {
+
+template <typename T>
+struct Add {
+  __device__ T operator()(const T& a, const T& b) const { return a + b; }
+};
+
+template <typename T>
+struct Mul {
+  __device__ T operator()(const T& a, const T& b) const { return a * b; }
+};
+
+template <typename T, typename Operator>
+__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
+                                 int batch_size, int num_rows, int num_cols) {
+  for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
+    int row = blockIdx.x;
+    for (; row < num_rows; row += gridDim.x) {
+      T value_y = y[batch_id * num_rows + row];
+      int col = threadIdx.x;
+      int offset = (batch_id * num_rows + row) * num_cols;
+      for (; col < num_cols; col += blockDim.x) {
+        T value_x = x[offset + col];
+        out[offset + col] = op(value_x, value_y);
+      }
+    }
+  }
+}
+
+template <typename T, typename Operator>
+static void ElementWise(Operator op, const T* x, const T* y, T* out,
+                        int batch_size, int prev, int midd, int post,
+                        cudaStream_t stream) {
+  const int kThreadsPerBlock = 1024;
+  const int kMaximumBlocks = 65535;
+  if (prev == 1) {
+    int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
+                                                : (((post + 31) >> 5) << 5);
+    int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
+    ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
+        op, x, y, out, batch_size, midd, post);
+  } else if (post == 1) {
+    PADDLE_THROW("Not implemented.");
+  } else {
+    PADDLE_THROW("Not implemented.");
+  }
+}
+
+}  // namespace details
+
+nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
+  PADDLE_ENFORCE_EQ(index, 0);
+  PADDLE_ENFORCE_EQ(num_inputs, 2);
+  PADDLE_ENFORCE_NOT_NULL(input_dims);
+  return input_dims[0];
+}
+
+int ElementWisePlugin::initialize() {
+  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
+
+  axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
+  int trimed_nb_dims = dims_y_.nbDims;
+  for (; trimed_nb_dims > 0; --trimed_nb_dims) {
+    if (dims_y_.d[trimed_nb_dims - 1] != 1) {
+      break;
+    }
+  }
+  dims_y_.nbDims = trimed_nb_dims;
+
+  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
+  PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
+
+  prev_size_ = 1;
+  midd_size_ = 1;
+  post_size_ = 1;
+  for (int i = 0; i < axis_; ++i) {
+    prev_size_ *= dims_x_.d[i];
+  }
+
+  for (int i = 0; i < dims_y_.nbDims; ++i) {
+    PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
+                      "Broadcast dimension mismatch.");
+    midd_size_ *= dims_y_.d[i];
+  }
+
+  for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) {
+    post_size_ *= dims_x_.d[i];
+  }
+  return 0;
+}
+
+int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
+                               void** outputs, void* workspace,
+                               cudaStream_t stream) {
+  const float* x = reinterpret_cast<const float*>(inputs[0]);
+  const float* y = reinterpret_cast<const float*>(inputs[1]);
+  float* out = reinterpret_cast<float*>(outputs[0]);
+
+  if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
+    details::ElementWise(details::Add<float>(), x, y, out, batch_size,
+                         prev_size_, midd_size_, post_size_, stream);
+  } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
+    details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
+                         prev_size_, midd_size_, post_size_, stream);
+  } else {
+    PADDLE_THROW("Not implemented.");
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class ElementWisePlugin : public PluginTensorRT {
+ public:
+  ElementWisePlugin(nvinfer1::ElementWiseOperation type,
+                    nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
+                    int axis)
+      : type_(type),
+        dims_x_(dims_x),
+        dims_y_(dims_y),
+        axis_(axis),
+        prev_size_(1),
+        midd_size_(1),
+        post_size_(1) {}
+
+  ElementWisePlugin(void const *serial_data, size_t serial_length) {
+    deserializeBase(serial_data, serial_length);
+    DeserializeValue(&serial_data, &serial_length, &axis_);
+    DeserializeValue(&serial_data, &serial_length, &dims_x_);
+    DeserializeValue(&serial_data, &serial_length, &dims_y_);
+  }
+
+  ElementWisePlugin *clone() const override {
+    // return new ElementWisePlugin(dims_x_, dims_y_, axis_);
+    return nullptr;
+  }
+
+  const char *getPluginType() const override { return "elementwise"; }
+
+  nvinfer1::Dims getOutputDimensions(int index,
+                                     const nvinfer1::Dims *input_dims,
+                                     int num_inputs) override;
+
+  int initialize() override;
+
+  // execute the layer
+  int enqueue(int batch_size, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream);
+
+ protected:
+  size_t getSerializationSize() override {
+    return SerializedSize(axis_) + SerializedSize(dims_x_) +
+           SerializedSize(dims_y_) + getBaseSerializationSize();
+  }
+
+  void serialize(void *buffer) override {
+    serializeBase(buffer);
+    SerializeValue(&buffer, axis_);
+    SerializeValue(&buffer, dims_x_);
+    SerializeValue(&buffer, dims_y_);
+  }
+
+  nvinfer1::ElementWiseOperation type_;
+  nvinfer1::Dims dims_x_;
+  nvinfer1::Dims dims_y_;
+  int axis_;
+  int prev_size_;
+  int midd_size_;
+  int post_size_;
+};
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <cassert>
+#include "glog/logging.h"
+#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+static const int CUDA_NUM_THREADS = 1024;
+static const int CUDA_MAX_NUM_BLOCKS = 65535;
+inline static int GET_NUM_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
+                                       float *output, int channel,
+                                       size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const float *in = input + offset;
+  float *out = output + offset;
+  float scale = alpha[blockIdx.x % channel];
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    float x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
+                                       float *output, size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const float *in = input + offset;
+  const float *scale = alpha + offset;
+  float *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    float x = in[i];
+    out[i] = (x > 0) ? x : scale[i] * x;
+  }
+}
+
+__global__ void PReluScalarKernel(const float *input, const float *alpha,
+                                  float *output, size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const float *in = input + offset;
+  float scale = *alpha;
+  float *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    float x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+static inline void PReluChannelWise(cudaStream_t stream, const float *input,
+                                    const float *alpha, float *output,
+                                    int batch_size,
+                                    const nvinfer1::Dims &dims) {
+  size_t unroll = batch_size * dims.d[0];
+  size_t spatial_size = dims.d[1] * dims.d[2];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, dims.d[0], spatial_size);
+}
+
+static inline void PReluElementWise(cudaStream_t stream, const float *input,
+                                    const float *alpha, float *output,
+                                    int batch_size,
+                                    const nvinfer1::Dims &dims) {
+  size_t unroll = batch_size * dims.d[0];
+  size_t spatial_size = dims.d[1] * dims.d[2];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+static inline void PReluScalar(cudaStream_t stream, const float *input,
+                               const float *alpha, float *output,
+                               int batch_size, const nvinfer1::Dims &dims) {
+  size_t unroll = batch_size * dims.d[0];
+  size_t spatial_size = dims.d[1] * dims.d[2];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
+                                                const nvinfer1::Dims *inputDims,
+                                                int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index < this->getNbOutputs());
+  nvinfer1::Dims const &input_dims = inputDims[0];
+  nvinfer1::Dims output_dims = input_dims;
+  return output_dims;
+}
+
+int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
+                         void **outputs, void *workspace, cudaStream_t stream) {
+  // input dims is CHW.
+  const auto &input_dims = this->getInputDims(0);
+  const float *input = reinterpret_cast<const float *>(inputs[0]);
+  const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  float *output = reinterpret_cast<float **>(outputs)[0];
+  if (mode_ == "channel") {
+    PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
+  } else if (mode_ == "element") {
+    PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
+  } else {
+    PReluScalar(stream, input, alpha, output, batchSize, input_dims);
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class PReluPlugin : public PluginTensorRT {
+  TensorRTEngine::Weight alpha_;
+  std::string mode_;
+
+ protected:
+  size_t getSerializationSize() override {
+    // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
+    return 0;
+  }
+
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
+  void serialize(void *buffer) override {
+    // serializeBase(buffer);
+    // SerializeValue(&buffer, alpha_);
+    // SerializeValue(&buffer, mode_);
+  }
+
+ public:
+  PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
+      : alpha_(alpha), mode_(mode) {}
+
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
+  PReluPlugin(void const *serialData, size_t serialLength) {
+    // deserializeBase(serialData, serialLength);
+    // DeserializeValue(&serialData, &serialLength, &alpha_);
+    // DeserializeValue(&serialData, &serialLength, &mode_);
+  }
+
+  PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
+
+  const char *getPluginType() const override { return "prelu"; }
+  int getNbOutputs() const override { return 1; }
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+                                     int nbInputDims) override;
+  int enqueue(int batchSize, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream) override;
+};
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h
@@ -14,10 +14,15 @@

 #pragma once

-#include <cassert>
 #include <cstring>
 #include <type_traits>
 #include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {

 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
@@ -26,7 +31,7 @@ template <typename T>
 inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value);

-namespace {
+namespace details {

 template <typename T, class Enable = void>
 struct Serializer {};
@@ -36,10 +41,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
                                             std::is_enum<T>::value ||
                                             std::is_pod<T>::value>::type> {
  static size_t SerializedSize(T const& value) { return sizeof(T); }
+
  static void Serialize(void** buffer, T const& value) {
    std::memcpy(*buffer, &value, sizeof(T));
    reinterpret_cast<char*&>(*buffer) += sizeof(T);
  }
+
  static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
    assert(*buffer_size >= sizeof(T));
    std::memcpy(value, *buffer, sizeof(T));
@@ -51,10 +58,12 @@ struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
 template <>
 struct Serializer<const char*> {
  static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
+
  static void Serialize(void** buffer, const char* value) {
-    std::strcpy(static_cast<char*>(*buffer), value);
+    std::strcpy(static_cast<char*>(*buffer), value);  // NOLINT
    reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
  }
+
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          const char** value) {
    *value = static_cast<char const*>(*buffer);
@@ -73,39 +82,46 @@ struct Serializer<std::vector<T>,
  static size_t SerializedSize(std::vector<T> const& value) {
    return sizeof(value.size()) + value.size() * sizeof(T);
  }
+
  static void Serialize(void** buffer, std::vector<T> const& value) {
    SerializeValue(buffer, value.size());
    size_t nbyte = value.size() * sizeof(T);
    std::memcpy(*buffer, value.data(), nbyte);
    reinterpret_cast<char*&>(*buffer) += nbyte;
  }
+
  static void Deserialize(void const** buffer, size_t* buffer_size,
                          std::vector<T>* value) {
    size_t size;
    DeserializeValue(buffer, buffer_size, &size);
    value->resize(size);
    size_t nbyte = value->size() * sizeof(T);
-    assert(*buffer_size >= nbyte);
+    PADDLE_ENFORCE_GE(*buffer_size, nbyte);
    std::memcpy(value->data(), *buffer, nbyte);
    reinterpret_cast<char const*&>(*buffer) += nbyte;
    *buffer_size -= nbyte;
  }
 };

-}  // namespace
+}  // namespace details

 template <typename T>
 inline size_t SerializedSize(T const& value) {
-  return Serializer<T>::SerializedSize(value);
+  return details::Serializer<T>::SerializedSize(value);
 }

 template <typename T>
 inline void SerializeValue(void** buffer, T const& value) {
-  return Serializer<T>::Serialize(buffer, value);
+  return details::Serializer<T>::Serialize(buffer, value);
 }

 template <typename T>
 inline void DeserializeValue(void const** buffer, size_t* buffer_size,
                             T* value) {
-  return Serializer<T>::Deserialize(buffer, buffer_size, value);
+  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
 }
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -12,26 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <stdio.h>
-#include <cassert>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
+namespace plugin {

-nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
-                                                const nvinfer1::Dims* inputDims,
-                                                int nbInputs) {
-  assert(nbInputs == 1);
-  assert(index < this->getNbOutputs());
-  nvinfer1::Dims const& input_dims = inputDims[0];
-  nvinfer1::Dims output_dims = input_dims;
+nvinfer1::Dims SplitPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* input_dims, int num_inputs) {
+  PADDLE_ENFORCE_EQ(num_inputs, 1);
+  PADDLE_ENFORCE_LT(index, this->getNbOutputs());
+
+  nvinfer1::Dims output_dims = input_dims[0];
  output_dims.d[axis_] = output_length_.at(index);
  return output_dims;
 }

 int SplitPlugin::initialize() {
+  PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
+
  std::vector<int> segment_offsets(1, 0);
  for (int i = 0; i < this->getNbOutputs(); ++i) {
    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
@@ -76,6 +76,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
  return cudaGetLastError() != cudaSuccess;
 }

-}  // tensorrt
-}  // inference
-}  // paddle
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,61 +14,58 @@

 #pragma once

+#include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
+namespace plugin {

 class SplitPlugin : public PluginTensorRT {
-  int axis_;
-  std::vector<int> output_length_;
-  int nx_, ny_, nz_;
-  std::vector<int> segment_offsets_;
+ public:
+  SplitPlugin(int axis, std::vector<int> const &output_lengths)
+      : axis_(axis), output_length_(output_lengths) {}
+
+  SplitPlugin(void const *serial_data, size_t serial_length) {
+    deserializeBase(serial_data, serial_length);
+    DeserializeValue(&serial_data, &serial_length, &axis_);
+    DeserializeValue(&serial_data, &serial_length, &output_length_);
+  }
+
+  SplitPlugin *clone() const override {
+    return new SplitPlugin(axis_, output_length_);
+  }
+
+  const char *getPluginType() const override { return "split"; }
+  int getNbOutputs() const override { return output_length_.size(); }
+  nvinfer1::Dims getOutputDimensions(int index,
+                                     const nvinfer1::Dims *input_dims,
+                                     int num_inputs) override;
+
+  int initialize() override;
+  int enqueue(int batchSize, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream) override;

 protected:
-  virtual size_t getSerializationSize() override {
+  size_t getSerializationSize() override {
    return SerializedSize(axis_) + SerializedSize(output_length_) +
           getBaseSerializationSize();
  }

-  // TRT will call this func when we need to serialize the configuration of
-  // tensorrt.
-  // It should not be called by users.
-  virtual void serialize(void *buffer) override {
+  void serialize(void *buffer) override {
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
  }

- public:
-  SplitPlugin(int axis, std::vector<int> const &output_lengths)
-      : axis_(axis), output_length_(output_lengths) {
-    assert(axis <= nvinfer1::Dims::MAX_DIMS);
-  }
-
-  // It was used for tensorrt deserialization.
-  // It should not be called by users.
-  SplitPlugin(void const *serialData, size_t serialLength) {
-    deserializeBase(serialData, serialLength);
-    DeserializeValue(&serialData, &serialLength, &axis_);
-    DeserializeValue(&serialData, &serialLength, &output_length_);
-  }
-
-  SplitPlugin *clone() const override {
-    return new SplitPlugin(axis_, output_length_);
-  }
-
-  virtual const char *getPluginType() const override { return "split"; }
-  virtual int getNbOutputs() const override { return output_length_.size(); }
-  virtual nvinfer1::Dims getOutputDimensions(int index,
-                                             const nvinfer1::Dims *inputs,
-                                             int nbInputDims) override;
-  virtual int initialize() override;
-  virtual int enqueue(int batchSize, const void *const *inputs, void **outputs,
-                      void *workspace, cudaStream_t stream) override;
+  int axis_;
+  std::vector<int> output_length_;
+  int nx_, ny_, nz_;
+  std::vector<int> segment_offsets_;
 };

-}  // tensorrt
-}  // inference
-}  // paddle
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc
@@ -17,6 +17,7 @@
 namespace paddle {
 namespace inference {
 namespace tensorrt {
+namespace plugin {

 void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, input_dims_);
@@ -25,12 +26,12 @@ void PluginTensorRT::serializeBase(void*& buffer) {
  SerializeValue(&buffer, data_format_);
 }

-void PluginTensorRT::deserializeBase(void const*& serialData,
-                                     size_t& serialLength) {
-  DeserializeValue(&serialData, &serialLength, &input_dims_);
-  DeserializeValue(&serialData, &serialLength, &max_batch_size_);
-  DeserializeValue(&serialData, &serialLength, &data_type_);
-  DeserializeValue(&serialData, &serialLength, &data_format_);
+void PluginTensorRT::deserializeBase(void const*& serial_data,
+                                     size_t& serial_length) {
+  DeserializeValue(&serial_data, &serial_length, &input_dims_);
+  DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
+  DeserializeValue(&serial_data, &serial_length, &data_type_);
+  DeserializeValue(&serial_data, &serial_length, &data_format_);
 }

 size_t PluginTensorRT::getBaseSerializationSize() {
@@ -44,18 +45,17 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
          (format == nvinfer1::PluginFormat::kNCHW));
 }

-void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims,
-                                         int nbInputs,
-                                         const nvinfer1::Dims* outputDims,
-                                         int nbOutputs, nvinfer1::DataType type,
-                                         nvinfer1::PluginFormat format,
-                                         int maxBatchSize) {
+void PluginTensorRT::configureWithFormat(
+    const nvinfer1::Dims* input_dims, int num_inputs,
+    const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type,
+    nvinfer1::PluginFormat format, int max_batch_size) {
  data_type_ = type;
  data_format_ = format;
-  input_dims_.assign(inputDims, inputDims + nbInputs);
-  max_batch_size_ = maxBatchSize;
+  input_dims_.assign(input_dims, input_dims + num_inputs);
+  max_batch_size_ = max_batch_size;
 }

+}  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -14,23 +14,30 @@

 #pragma once

-#include <cassert>
+#include <NvInfer.h>
 #include <cstring>
-#include <iostream>
 #include <unordered_map>
 #include <vector>
-#include "NvInfer.h"

 #include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(profile);

 namespace paddle {
 namespace inference {
 namespace tensorrt {
+namespace plugin {

 class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
+  // It was used for TensorRT deserialization.
+  // It should not be called by users.
  PluginTensorRT(const void* serialized_data, size_t length) {}
+  virtual ~PluginTensorRT() {}
+
  nvinfer1::Dims const& getInputDims(int index) const {
    return input_dims_.at(index);
  }
@@ -38,43 +45,66 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
  nvinfer1::DataType getDataType() const { return data_type_; }
  nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
  virtual const char* getPluginVersion() const { return "1"; }
+
+  void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); }
+  std::vector<nvinfer1::ITensor*>& GetInputs() { return inputs_; }
+
+  virtual nvinfer1::IPluginExt* clone() const = 0;
+  virtual const char* getPluginType() const = 0;
+
+  // Following functions are inherit from nvinfer1::IPluginExt
+  // Get the number of outputs from the layer
+  int getNbOutputs() const { return 1; }
+  // Get the dimension of an output tensor
+  virtual nvinfer1::Dims getOutputDimensions(int index,
+                                             const nvinfer1::Dims* input_dims,
+                                             int num_inputs) = 0;
+  // Find the workspace size required by the layer
  size_t getWorkspaceSize(int) const override { return 0; }
+
+  // Initialize the layer for execution.
+  // This is called when the engine is created.
+  int initialize() override { return 0; }
+  // Shutdown the layer. This is called when the engine is destroyed
  void terminate() override {}
-  virtual ~PluginTensorRT() {}
+  // Execute the layer
+  virtual int enqueue(int batch_size, const void* const* inputs, void** outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+
+  // Find the size of the serialization buffer required
+  virtual size_t getSerializationSize() = 0;
+  // Serialize the layer config to buffer.
+  // TensorRT will call this func to serialize the configuration of TensorRT
+  // engine. It should not be called by users.
+  virtual void serialize(void* buffer) = 0;
+
  // Check format support. The default is FLOAT32 and NCHW.
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
-  void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
-                           const nvinfer1::Dims* outputDims, int nbOutputs,
+  // Configure the layer
+  void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs,
+                           const nvinfer1::Dims* output_dims, int num_outputs,
                           nvinfer1::DataType type,
                           nvinfer1::PluginFormat format,
-                           int maxBatchSize) override;
-
-  // *NOTE* The following functions need to be overrided in the subclass.
-  virtual nvinfer1::IPluginExt* clone() const = 0;
-  virtual const char* getPluginType() const = 0;
-  // Initialize the layer for execution. This is called when the engine is
-  // created.
-  int initialize() override { return 0; }
-  // Serialize the layer config to buffer.
-  virtual void serialize(void* buffer) = 0;
-  virtual size_t getSerializationSize() = 0;
-  virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
-                      void* workspace, cudaStream_t stream) = 0;
+                           int max_batch_size) override;

 protected:
  // Deserialize input_dims, max_batch_size, data_type, data_format
-  void deserializeBase(void const*& serialData, size_t& serialLength);
+  void deserializeBase(void const*& serial_data,  // NOLINT
+                       size_t& serial_length);    // NOLINT
  size_t getBaseSerializationSize();
  // Serialize input_dims, max_batch_size, data_type, data_format
-  void serializeBase(void*& buffer);
+  void serializeBase(void*& buffer);  // NOLINT

  std::vector<nvinfer1::Dims> input_dims_;
  size_t max_batch_size_;
  nvinfer1::DataType data_type_;
  nvinfer1::PluginFormat data_format_;
+
+  std::vector<nvinfer1::ITensor*> inputs_;
 };

+}  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)

+if(WITH_GPU AND TENSORRT_FOUND)
+    set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor)
+endif()
+
 function(download_model install_dir model_name)
    if (NOT EXISTS ${install_dir})
        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
@@ -27,14 +31,14 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
 endfunction()

 # RNN1
-if(NOT APPLE)
+if(NOT APPLE AND WITH_MKLML)
    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
 else()
-    # TODO: fix this test on MACOS, the reason is that
-    # fusion_seqexpand_concat_fc_op is not supported on MACOS
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
+    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
+    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
+    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1")
 endif()

 # RNN2
@@ -75,9 +79,13 @@ endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)

 # resnet50
-inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 
+inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")

+# mobilenet with depthwise_conv op
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
+  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
+
 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   # anakin rnn1
@@ -85,15 +93,15 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc 
-           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
+   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
           DEPS inference_anakin_api_shared SERIAL)
   # anakin mobilenet
   if(WITH_GPU)
       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc 
+       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
@@ -105,6 +113,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
   endif()
   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
+      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -51,7 +51,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
    return;
  }
-  LOG(INFO) << *config;
+  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
 }

 void CompareResult(const std::vector<PaddleTensor> &outputs,
@@ -222,19 +222,36 @@ void TestMultiThreadPrediction(
      // The inputs of each thread are all the same.
      std::vector<PaddleTensor> outputs_tid;
      auto &predictor = predictors[tid];
-      LOG(INFO) << "running thread " << tid;
-      Timer timer;
-      timer.tic();
-      for (int i = 0; i < num_times; i++) {
-        for (const auto &input : inputs) {
-          ASSERT_TRUE(predictor->Run(input, &outputs_tid));
+
+      // warmup run
+      LOG(INFO) << "Running thread " << tid << ", warm up run...";
+      {
+        Timer warmup_timer;
+        warmup_timer.tic();
+        predictor->Run(inputs[0], outputs, batch_size);
+        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+        if (FLAGS_profile) {
+          paddle::platform::ResetProfiler();
        }
+#endif
      }

-      auto time = timer.toc();
-      total_time += time;
-      PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
-                inputs.size());
+      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+      {
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          for (const auto &input : inputs) {
+            ASSERT_TRUE(predictor->Run(input, &outputs_tid));
+          }
+        }
+
+        auto time = timer.toc();
+        total_time += time;
+        PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
+                  inputs.size());
+      }
    });
  }
  for (int i = 0; i < num_threads; ++i) {

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -145,5 +145,3 @@ TEST(TensorRT_mobilenet, analysis) {

 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(tensorrt_subgraph_pass);
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
 add_subdirectory(detail)
-
-cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
+add_subdirectory(allocation)
+cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
 cc_library(memcpy SRCS memcpy.cc DEPS place)

 cc_library(memory
        DEPS
        malloc
        memcpy)
-
-cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
-
 #if (WITH_GPU)
 #   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
+cc_library(allocator SRCS allocator.cc DEPS place)
+cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
+cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
+cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
+cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
+cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
+cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
+
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
+
+cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
+
+if (WITH_GPU)
+    nv_test(best_fit_allocator_test
+            SRCS best_fit_allocator_test.cc
+                 best_fit_allocator_test.cu
+            DEPS best_fit_allocator
+                 locked_allocator
+                 cpu_allocator
+                 cuda_allocator
+                 device_context
+                 memcpy)
+else()
+    cc_test(best_fit_allocator_test
+            SRCS best_fit_allocator_test.cc
+            DEPS best_fit_allocator
+                 locked_allocator
+                 cpu_allocator)
+endif()
+
+nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
+if (WITH_GPU)
+    set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
+else ()
+    set(AllocatorFacadeDeps)
+endif()
+
+cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
+cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
+cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS
+        ${AllocatorFacadeDeps}
+        cpu_allocator
+        locked_allocator
+        best_fit_allocator
+        aligned_allocator
+        auto_increment_allocator
+        zero_size_allocator
+        conditional_allocator
+        retry_allocator
+        buffered_allocator
+        allocator_strategy
+        legacy_allocator
+        )
+
+nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
+
+cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
+
+cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+ThinAlignedAllocator::ThinAlignedAllocator(
+    std::shared_ptr<Allocator> underlyning_allocator)
+    : underlying_allocator_(std::move(underlyning_allocator)) {}
+
+bool ThinAlignedAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// The aligned allocation and allocator will wrap a managed allocator,
+// and returns the aligned pointer.
+//
+// NOTE(yy): For speed reason, I just use a template parameter to get
+// alignment, however, it can be an private member if necessary.
+//
+// NOTE(yy): kAlignment must be 2^N. a `static_assert` should be added.
+template <size_t kAlignment>
+class AlignedAllocation : public Allocation {
+  static_assert(kAlignment > 0 && (kAlignment & (kAlignment - 1)) == 0,
+                "kAlignment must be 2^N");
+
+ public:
+  AlignedAllocation(AllocationPtr&& underlying_allocation, size_t size)
+      : Allocation(AlignedPtr(underlying_allocation->ptr()),
+                   size + kAlignment - Offset(underlying_allocation->ptr()),
+                   underlying_allocation->place()),
+        underlying_allocation_(std::move(underlying_allocation)) {}
+
+ private:
+  static void* AlignedPtr(void* ptr) {
+    return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) +
+                                   Offset(ptr));
+  }
+
+  // Offset to aligned pointer.
+  // if ptr is already aligned, returns 0.
+  static size_t Offset(void* ptr) {
+    auto ptr_addr = reinterpret_cast<intptr_t>(ptr);
+    intptr_t aligned_addr = (ptr_addr & ~(kAlignment - 1));
+    intptr_t diff = aligned_addr - ptr_addr;
+    if (diff == 0) {
+      return 0;
+    } else {
+      return kAlignment + diff;
+    }
+  }
+
+  AllocationPtr underlying_allocation_;
+};
+
+// Thin aligned allocator is trivial and used to generate a small size binary.
+//
+// NOTE(yy): This is a trick to make a template class. This class extract the
+// common code into a `thin` class. So if there are multiple specification of
+// the template class, the binary size will not extended too much.
+//
+// NOTE(yy): This could be an over design. If it harms readability of code, it
+// could be removed later.
+class ThinAlignedAllocator : public Allocator {
+ public:
+  explicit ThinAlignedAllocator(
+      std::shared_ptr<Allocator> underlyning_allocator);
+
+  bool IsAllocThreadSafe() const;
+
+ protected:
+  std::shared_ptr<Allocator> underlying_allocator_;
+};
+
+// An aligned allocator will allocate `size+kAlignment` allocation and adjust
+// the pointer offset.
+template <size_t kAlignment>
+class AlignedAllocator : public ThinAlignedAllocator {
+ public:
+  using ThinAlignedAllocator::ThinAlignedAllocator;
+
+ protected:
+  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+    auto raw_allocation =
+        underlying_allocator_->Allocate(size + kAlignment, attr);
+    return new AlignedAllocation<kAlignment>(std::move(raw_allocation), size);
+  }
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+++ b/paddle/fluid/memory/allocation/allocation_and_eigen_test.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+// NOTE(yy): this unittest is not important. It just used for debugging.
+// It can be removed later.
+struct FillZero {
+ public:
+  float* ptr_;
+
+  __device__ void operator()(size_t i) { ptr_[i] = 0.0f; }
+};
+
+namespace paddle {
+TEST(Eigen, main) {
+  framework::Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  float* ptr = tensor.mutable_data<float>({10, 10}, gpu);
+  auto& dev_ctx = *reinterpret_cast<platform::CUDADeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(gpu));
+  PADDLE_ENFORCE(cudaMemset(ptr, 0, sizeof(float) * 100));
+
+  platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, 100);
+  for_range(FillZero{ptr});
+  dev_ctx.Wait();
+
+  auto eigen_vec = framework::EigenVector<float>::Flatten(tensor);
+  auto& eigen_dev = *dev_ctx.eigen_device();
+  eigen_vec.device(eigen_dev) = eigen_vec.constant(0.0f);
+}
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocation_with_underlying.h
+++ b/paddle/fluid/memory/allocation/allocation_with_underlying.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class AllocationWithUnderlying : public Allocation {
+ public:
+  explicit AllocationWithUnderlying(AllocationPtr allocation)
+      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
+        allocation_(std::move(allocation)) {}
+  AllocationPtr allocation_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator.cc
+++ b/paddle/fluid/memory/allocation/allocator.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+Allocation::~Allocation() {}
+
+Allocator::~Allocator() {}
+
+bool Allocator::IsAllocThreadSafe() const { return false; }
+
+AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
+  auto ptr = AllocateImpl(size, attr);
+  ptr->set_allocator(this);
+  return AllocationPtr(ptr);
+}
+
+void Allocator::Free(Allocation* allocation) { delete allocation; }
+
+const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
+
+void AllocationDeleter::operator()(Allocation* allocation) const {
+  auto* allocator = allocation->allocator();
+  allocator->Free(allocation);
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Exception when `Alloc`/`AllocShared` failed
+class BadAlloc : public std::exception {
+ public:
+  explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
+  const char* what() const noexcept override;
+
+ private:
+  std::string msg_;
+};
+
+class Allocation;
+class AllocationDeleter {
+ public:
+  void operator()(Allocation* allocation) const;
+};
+
+class Allocator;
+// Allocation is the object holding the actually pointer. Use
+// `Allocation::ptr()` will returns the pointer that allocated.
+//
+// NOTE: this is the base class of Allocation. Each allocator can use its own
+//       allocation object.
+// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0
+class Allocation {
+ public:
+  Allocation(void* ptr, size_t size, platform::Place place)
+      : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {}
+
+  Allocation(const Allocation& o) = delete;
+  Allocation& operator=(const Allocation& o) = delete;
+
+  // Returns the holding pointer.
+  // NOTE: For performance consideration, it is better not to make this method
+  // as a virtual method. If we want to implement a `defragmentation` later,
+  // we might need to make `ptr_` field as a protected field, and add a virtual
+  // method like `defragmentation` to change `ptr_`.
+  void* ptr() const { return ptr_; }
+
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocator might alloc more memory than request. The size
+  // could larger than its request. For example,
+  //    the AlignedAllocator will always allocate memory as size + kAlignment.
+  //    The raw pointer might not aligned, so an offset might be added to raw
+  //    the pointer. The size of this allocation will be
+  //    `size + kAlignemnt - offset`.
+  size_t size() const { return size_; }
+
+  const platform::Place& place() const { return place_; }
+
+  Allocator* allocator() { return allocator_; }
+
+  void set_allocator(Allocator* allocator) { allocator_ = allocator; }
+
+  virtual ~Allocation();
+
+ private:
+  Allocator* allocator_;
+  void* ptr_;
+  size_t size_;
+  platform::Place place_;
+};
+
+using AllocationPtr = std::unique_ptr<Allocation, AllocationDeleter>;
+
+// Base interface class of memory Allocator.
+// To allocate a memory, allocator needs two parameters:
+//    1. size of bytes.
+//    2. Attribute of memory.
+// NOTE: the attribute of memory might be ignored if the allocator does not
+// care it.
+class Allocator {
+ public:
+  enum Attr {
+    kDefault = 0,  // Default attribute. Uses the fast or stablest allocation
+                   // algorithm.
+
+    kFixedHuge = 1,  // The allocation may not be freed until the program
+                     // ends. e.g., `Parameters` and `Momentum`.
+
+    kFluxHuge = 2,  // The allocation may create and freed frequently and the
+                    // allocation is considerable huge. Like `activations`
+                    // and gradients.
+
+    kScratchpad =
+        3,  // The `Scratchpad` memory is allocated and freed very soon,
+            // usually within an operator or aux memory.
+            // Like CUDNN workspace, AUX memory in batch norm, etc.
+            //
+            // https://en.wikipedia.org/wiki/Scratchpad_memory
+
+    kCrossDevice =
+        4,  // The memory used cross-device memory copy/communication.
+            // For example:
+            // 1. it can use an `pinned` memory for CPU-GPU
+            //    communication.
+            // 2. it can use an `registered` memory for RDMA
+            //    communication.
+
+    NumOfAttrs = 5  // The number of all attributes. It is used internally.
+  };
+
+  virtual ~Allocator();
+
+  // Allocate an allocation.
+  AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault);
+
+  // True if the `Allocate` is thread safe.
+  virtual bool IsAllocThreadSafe() const;
+
+ protected:
+  virtual void Free(Allocation* allocation);
+  virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0;
+
+ private:
+  friend class AllocationDeleter;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include <gflags/gflags.h>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
+#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/conditional_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
+#include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include "paddle/fluid/memory/allocation/retry_allocator.h"
+#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#endif
+
+DEFINE_int64(
+    gpu_allocator_retry_time, 0,
+    "The retry time (milliseconds) when allocator fails "
+    "to allocate memory. No retry if this value is not greater than 0");
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// TODO(yy): Dirty code here. This class should be configurable in runtime.
+class CPUManagedAllocator : public Allocator {
+ public:
+  CPUManagedAllocator() : normal_allocator_(new CPUAllocator()) {}
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+    return normal_allocator_->Allocate(size, attr).release();
+  }
+
+ private:
+  std::shared_ptr<Allocator> normal_allocator_;
+};
+
+// TODO(yy): Dirty code here. This class should be configurable in runtime.
+class ChunkedAllocator : public Allocator {
+ public:
+  explicit ChunkedAllocator(std::unique_ptr<Allocator> system_allocator,
+                            size_t max_chunk_size, size_t capacity = 1,
+                            int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = std::move(system_allocator);
+
+    if (max_chunk_size_ == 0) {
+      default_allocator_ = raw_allocator_;
+    } else {
+      if (capacity == 1) {
+        VLOG(10) << "Create BestFitAllocator with chunk_size "
+                 << max_chunk_size_;
+        default_allocator_ = CreateAllocatorWithChunk();
+      } else {
+        VLOG(10) << "Create AutoIncrementAllocator with chunk_size "
+                 << max_chunk_size_ << " and capacity " << capacity;
+        default_allocator_ = std::make_shared<AutoIncrementAllocator>(
+            [this] { return std::move(CreateAllocatorWithChunk()); }, capacity);
+      }
+    }
+
+    auto* cond_allocator = new ConditionalAllocator();
+    cond_allocator
+        ->AddAllocator(
+            [this](size_t size, Attr attr) { return size < max_chunk_size_; },
+            default_allocator_)
+        .AddAllocator(
+            [](size_t size, Attr attr) {
+              return true;  // default case
+            },
+            raw_allocator_);
+    default_allocator_.reset(cond_allocator);
+  }
+
+  ~ChunkedAllocator() override {
+    // Specify destruct order.
+    default_allocator_.reset();
+    chunks_.clear();
+    raw_allocator_.reset();
+  }
+
+  std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
+    chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
+    auto* allocation = chunks_.back().get();
+    std::unique_ptr<Allocator> allocator(new LockedAllocator(
+        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+
+    if (retry_time_ > 0) {
+      auto* retry_allocator =
+          new RetryAllocator(std::move(allocator), retry_time_);
+      allocator.reset(retry_allocator);
+    }
+
+    return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
+  }
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+    return default_allocator_->Allocate(size, attr).release();
+  }
+
+ protected:
+  size_t max_chunk_size_;
+  int64_t retry_time_;
+  std::vector<AllocationPtr> chunks_;
+  std::shared_ptr<Allocator> raw_allocator_;
+  std::shared_ptr<Allocator> default_allocator_;
+};
+
+#ifdef PADDLE_WITH_CUDA
+
+class CUDAChunkedAllocator : public ChunkedAllocator {
+ public:
+  explicit CUDAChunkedAllocator(int dev_id)
+      : ChunkedAllocator(std::unique_ptr<Allocator>(
+                             new CUDAAllocator(platform::CUDAPlace(dev_id))),
+                         GetMaxChunkSize(dev_id), GetCapcity(dev_id),
+                         GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
+ public:
+  CUDAPinnedChunkedAllocator()
+      : ChunkedAllocator(std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+                         platform::CUDAPinnedMaxChunkSize(), GetCapacity(),
+                         -1) {}  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
+
+#endif
+
+class AllocatorFacadePrivate {
+ public:
+  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
+
+  ~AllocatorFacadePrivate() = default;
+
+  AllocatorFacadePrivate() {
+    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
+      InitLegacyAllocator();
+    } else {
+      InitCPUAllocator();
+      InitCUDAAllocator();
+      InitCUDAPinnedAllocator();
+      WrapZeroSizeAllocator();
+    }
+  }
+
+ private:
+  void InitLegacyAllocator() {
+    std::vector<platform::Place> places{platform::CPUPlace()};
+#ifdef PADDLE_WITH_CUDA
+    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+      places.emplace_back(platform::CUDAPlace(dev_id));
+    }
+    places.emplace_back(platform::CUDAPinnedPlace());
+#endif
+    for (auto& p : places) {
+      allocators_[p] = std::make_shared<LegacyAllocator>(p);
+    }
+  }
+
+  void InitCPUAllocator() {
+    allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
+  }
+
+  void InitCUDAAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      allocators_[platform::CUDAPlace(dev_id)] =
+          std::make_shared<CUDAChunkedAllocator>(dev_id);
+    }
+#endif
+  }
+
+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedChunkedAllocator>();
+#endif
+  }
+
+  void WrapZeroSizeAllocator() {
+    for (auto& pair : allocators_) {
+      pair.second =
+          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
+    }
+  }
+};
+
+// Pimpl. Make interface clean.
+AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
+AllocatorFacade::~AllocatorFacade() { delete m_; }
+
+AllocatorFacade& AllocatorFacade::Instance() {
+  static AllocatorFacade instance;
+  return instance;
+}
+
+std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
+    const platform::Place& place, size_t size, Allocator::Attr attr) {
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
+                                     AllocationDeleter());
+}
+
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     Allocator::Attr attr) {
+  auto it = m_->allocators_.find(place);
+  if (it == m_->allocators_.end()) {
+    throw BadAlloc(
+        string::Sprintf("No such allocator for the place, %s", place));
+  }
+  return m_->allocators_.at(place)->Allocate(size, attr);
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Allocator Facade is the interface exposed to other modules.
+// All the configuration or dirty code under development should
+// be hidden behind this facade.
+//
+// NOTE(yy): This class is a singleton class.
+// NOTE(yy): To create a stable ABI and make compilation faster. Here we use
+// a Pimpl trick;
+class AllocatorFacadePrivate;
+class AllocatorFacade {
+ public:
+  ~AllocatorFacade();
+  AllocatorFacade(const AllocatorFacade& o) = delete;
+  const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
+
+  static AllocatorFacade& Instance();
+
+  // Allocate a shared allocation.
+  std::shared_ptr<Allocation> AllocShared(
+      const platform::Place& place, size_t size,
+      Allocator::Attr attr = Allocator::kDefault);
+
+  // Allocate a unique allocation.
+  AllocationPtr Alloc(const platform::Place& place, size_t size,
+                      Allocator::Attr attr = Allocator::kDefault);
+
+  // TODO(yy): Allocate a Copy-On-Write allocation?
+ private:
+  AllocatorFacade();
+  AllocatorFacadePrivate* m_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.cc
--- a/paddle/fluid/memory/allocation/auto_increment_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
--- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc
--- a/paddle/fluid/memory/allocation/conditional_allocator.cc
+++ b/paddle/fluid/memory/allocation/conditional_allocator.cc
--- a/paddle/fluid/memory/allocation/conditional_allocator.h
+++ b/paddle/fluid/memory/allocation/conditional_allocator.h
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
--- a/paddle/fluid/memory/allocation/cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_allocator.h
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc
--- a/paddle/fluid/memory/allocation/zero_size_allocator.h
+++ b/paddle/fluid/memory/allocation/zero_size_allocator.h
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
--- a/paddle/fluid/memory/malloc_test.cc
+++ b/paddle/fluid/memory/malloc_test.cc
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
--- a/paddle/fluid/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
--- a/paddle/fluid/operators/compare_op.h
+++ b/paddle/fluid/operators/compare_op.h
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
--- a/paddle/fluid/operators/logical_op.cu
+++ b/paddle/fluid/operators/logical_op.cu
--- a/paddle/fluid/operators/logical_op.h
+++ b/paddle/fluid/operators/logical_op.h
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
--- a/paddle/fluid/operators/conv_fusion_op.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cc
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/csp/CMakeLists.txt
+++ b/paddle/fluid/operators/csp/CMakeLists.txt
--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
--- a/paddle/fluid/operators/fake_init_op.cc
+++ b/paddle/fluid/operators/fake_init_op.cc
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.cc
--- a/paddle/fluid/operators/ref_by_trainer_id_op.cu.cc
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.cu.cc
--- a/paddle/fluid/operators/ref_by_trainer_id_op.h
+++ b/paddle/fluid/operators/ref_by_trainer_id_op.h
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
--- a/paddle/fluid/operators/split_byref_op.cc
+++ b/paddle/fluid/operators/split_byref_op.cc
--- a/paddle/fluid/operators/split_byref_op.cu.cc
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
--- a/paddle/fluid/operators/split_byref_op.h
+++ b/paddle/fluid/operators/split_byref_op.h
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
--- a/paddle/fluid/operators/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
--- a/paddle/fluid/operators/elementwise/CMakeLists.txt
+++ b/paddle/fluid/operators/elementwise/CMakeLists.txt
--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
--- a/paddle/fluid/operators/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise_max_op.cu
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
--- a/paddle/fluid/operators/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise_min_op.cu
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
--- a/paddle/fluid/operators/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise_pow_op.cu
--- a/paddle/fluid/operators/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise_pow_op.h
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cu
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
--- a/paddle/fluid/operators/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.h
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
--- a/paddle/fluid/operators/fusion_gru_op.h
+++ b/paddle/fluid/operators/fusion_gru_op.h
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/fusion_lstm_op.h
+++ b/paddle/fluid/operators/fusion_lstm_op.h
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.cc
--- a/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
+++ b/paddle/fluid/operators/fusion_seqconv_eltadd_relu_op.h
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
--- a/paddle/fluid/operators/metrics/CMakeLists.txt
+++ b/paddle/fluid/operators/metrics/CMakeLists.txt
--- a/paddle/fluid/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
--- a/paddle/fluid/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
--- a/paddle/fluid/operators/accuracy_op.h
+++ b/paddle/fluid/operators/accuracy_op.h
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
--- a/paddle/fluid/operators/precision_recall_op.h
+++ b/paddle/fluid/operators/precision_recall_op.h
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
--- a/paddle/fluid/operators/optimizers/CMakeLists.txt
+++ b/paddle/fluid/operators/optimizers/CMakeLists.txt
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
--- a/paddle/fluid/operators/adadelta_op.cu
+++ b/paddle/fluid/operators/adadelta_op.cu
--- a/paddle/fluid/operators/adadelta_op.h
+++ b/paddle/fluid/operators/adadelta_op.h
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
--- a/paddle/fluid/operators/adagrad_op.h
+++ b/paddle/fluid/operators/adagrad_op.h
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
--- a/paddle/fluid/operators/adam_op.cu
+++ b/paddle/fluid/operators/adam_op.cu
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
--- a/paddle/fluid/operators/adamax_op.cu
+++ b/paddle/fluid/operators/adamax_op.cu
--- a/paddle/fluid/operators/adamax_op.h
+++ b/paddle/fluid/operators/adamax_op.h
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
--- a/paddle/fluid/operators/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/decayed_adagrad_op.cu
--- a/paddle/fluid/operators/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/decayed_adagrad_op.h
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
--- a/paddle/fluid/operators/ftrl_op.cu
+++ b/paddle/fluid/operators/ftrl_op.cu
--- a/paddle/fluid/operators/ftrl_op.h
+++ b/paddle/fluid/operators/ftrl_op.h
--- a/paddle/fluid/operators/lars_momentum_op.cc
+++ b/paddle/fluid/operators/lars_momentum_op.cc
--- a/paddle/fluid/operators/lars_momentum_op.cu
+++ b/paddle/fluid/operators/lars_momentum_op.cu
--- a/paddle/fluid/operators/lars_momentum_op.h
+++ b/paddle/fluid/operators/lars_momentum_op.h
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
--- a/paddle/fluid/operators/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/proximal_adagrad_op.cu
--- a/paddle/fluid/operators/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/proximal_adagrad_op.h
--- a/paddle/fluid/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
--- a/paddle/fluid/operators/proximal_gd_op.cu
+++ b/paddle/fluid/operators/proximal_gd_op.cu
--- a/paddle/fluid/operators/proximal_gd_op.h
+++ b/paddle/fluid/operators/proximal_gd_op.h
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
--- a/paddle/fluid/operators/rmsprop_op.cu
+++ b/paddle/fluid/operators/rmsprop_op.cu
--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
--- a/paddle/fluid/operators/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_max_op.cc
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
--- a/paddle/fluid/operators/reduce_max_op.part.cu
+++ b/paddle/fluid/operators/reduce_max_op.part.cu
--- a/paddle/fluid/operators/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_mean_op.cc
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
--- a/paddle/fluid/operators/reduce_mean_op.h
+++ b/paddle/fluid/operators/reduce_mean_op.h
--- a/paddle/fluid/operators/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_mean_op.part.cu
--- a/paddle/fluid/operators/reduce_min_max_op.h
+++ b/paddle/fluid/operators/reduce_min_max_op.h
--- a/paddle/fluid/operators/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_min_op.cc
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
--- a/paddle/fluid/operators/reduce_min_op.part.cu
+++ b/paddle/fluid/operators/reduce_min_op.part.cu
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
--- a/paddle/fluid/operators/reduce_op_function.h
+++ b/paddle/fluid/operators/reduce_op_function.h
--- a/paddle/fluid/operators/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_prod_op.cc
--- a/paddle/fluid/operators/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_prod_op.cu
--- a/paddle/fluid/operators/reduce_prod_op.h
+++ b/paddle/fluid/operators/reduce_prod_op.h
--- a/paddle/fluid/operators/reduce_prod_op.part.cu
+++ b/paddle/fluid/operators/reduce_prod_op.part.cu
--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
--- a/paddle/fluid/operators/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_sum_op.part.cu
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
--- a/paddle/fluid/operators/selu_op.cc
+++ b/paddle/fluid/operators/selu_op.cc
--- a/paddle/fluid/operators/selu_op.cu
+++ b/paddle/fluid/operators/selu_op.cu
--- a/paddle/fluid/operators/selu_op.h
+++ b/paddle/fluid/operators/selu_op.h
--- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
--- a/paddle/fluid/operators/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cu.cc
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
--- a/paddle/fluid/operators/sequence_conv_op.cu.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cu.cc
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
--- a/paddle/fluid/operators/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_enumerate_op.cc
--- a/paddle/fluid/operators/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_enumerate_op.cu
--- a/paddle/fluid/operators/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_enumerate_op.h
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
--- a/paddle/fluid/operators/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_erase_op.cu
--- a/paddle/fluid/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
--- a/paddle/fluid/operators/sequence_expand_as_op.cc
+++ b/paddle/fluid/operators/sequence_expand_as_op.cc
--- a/paddle/fluid/operators/sequence_expand_as_op.cu
+++ b/paddle/fluid/operators/sequence_expand_as_op.cu
--- a/paddle/fluid/operators/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_expand_as_op.h
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
--- a/paddle/fluid/operators/sequence_mask_op.cc
+++ b/paddle/fluid/operators/sequence_mask_op.cc
--- a/paddle/fluid/operators/sequence_mask_op.cu
+++ b/paddle/fluid/operators/sequence_mask_op.cu
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
--- a/paddle/fluid/operators/sequence_pad_op.cc
+++ b/paddle/fluid/operators/sequence_pad_op.cc
--- a/paddle/fluid/operators/sequence_pad_op.cu
+++ b/paddle/fluid/operators/sequence_pad_op.cu
--- a/paddle/fluid/operators/sequence_pad_op.h
+++ b/paddle/fluid/operators/sequence_pad_op.h
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
--- a/paddle/fluid/operators/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_pool_op.cu
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
--- a/paddle/fluid/operators/sequence_reshape_op.cu
+++ b/paddle/fluid/operators/sequence_reshape_op.cu
--- a/paddle/fluid/operators/sequence_reshape_op.h
+++ b/paddle/fluid/operators/sequence_reshape_op.h
--- a/paddle/fluid/operators/sequence_reverse_op.cc
+++ b/paddle/fluid/operators/sequence_reverse_op.cc
--- a/paddle/fluid/operators/sequence_reverse_op.cu
+++ b/paddle/fluid/operators/sequence_reverse_op.cu
--- a/paddle/fluid/operators/sequence_reverse_op.h
+++ b/paddle/fluid/operators/sequence_reverse_op.h
--- a/paddle/fluid/operators/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_scatter_op.cc
--- a/paddle/fluid/operators/sequence_scatter_op.h
+++ b/paddle/fluid/operators/sequence_scatter_op.h
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.cu
+++ b/paddle/fluid/operators/sequence_slice_op.cu
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
--- a/paddle/fluid/operators/sequence_softmax_op.cu
+++ b/paddle/fluid/operators/sequence_softmax_op.cu
--- a/paddle/fluid/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
--- a/paddle/fluid/operators/sequence_unpad_op.cc
+++ b/paddle/fluid/operators/sequence_unpad_op.cc
--- a/paddle/fluid/operators/sequence_unpad_op.cu
+++ b/paddle/fluid/operators/sequence_unpad_op.cu
--- a/paddle/fluid/operators/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_unpad_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
--- a/paddle/fluid/operators/stack_op.h
+++ b/paddle/fluid/operators/stack_op.h
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.cu.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cu.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
--- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/cuda_device_guard.cc
+++ b/paddle/fluid/platform/cuda_device_guard.cc
--- a/paddle/fluid/platform/cuda_device_guard.h
+++ b/paddle/fluid/platform/cuda_device_guard.h
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/lock_guard_ptr.h
+++ b/paddle/fluid/platform/lock_guard_ptr.h
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
--- a/python/paddle/fluid/tests/unittests/test_selu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_selu_op.py
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ b/python/paddle/fluid/transpiler/details/checkport.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/v2/dataset/wmt16.py
+++ b/python/paddle/v2/dataset/wmt16.py
--- a/python/setup.py.in
+++ b/python/setup.py.in