1. Add ANAKIN_ROOT compile option

2. refine trt code test=develop

1. Add ANAKIN_ROOT compile option
2. refine trt code test=develop
f3a2e4b3 · nhzlx · 4f4daa4b · f3a2e4b3 · f3a2e4b3 · f3a2e4b3
36 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,6 @@ option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
-option(WITH_ANAKIN_SUBGRAPH "Compile with Anakin subgraph library"      OFF)
 option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
 option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
@@ -192,6 +191,7 @@ include(configure)          # add paddle env configuration
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
+    include(anakin_subgraph)
 endif()
 if(WITH_MKL OR WITH_MKLML)
    include(external/anakin)

--- a/cmake/anakin_subgraph.cmake
+++ b/cmake/anakin_subgraph.cmake
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
+find_path(ANAKIN_INCLUDE_DIR anakin_config.h
+    PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
+    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
+    NO_DEFAULT_PATH
+)
+
+find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
+    PATHS ${ANAKIN_ROOT}
+    $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to ANAKIN library.")
+
+if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
+  if(WITH_DSO)
+    set(ANAKIN_FOUND ON)
+  endif(WITH_DSO)
+else()
+    set(ANAKIN_FOUND OFF)
+endif()
+
+if(ANAKIN_FOUND)
+    message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+    include_directories(${ANAKIN_ROOT}/include)
+    include_directories(${ANAKIN_ROOT}/include/saber)
+    link_directories(${ANAKIN_ROOT})
+    add_definitions(-DPADDLE_WITH_ANAKIN)
+endif()
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
    include_directories(${TENSORRT_INCLUDE_DIR})
+    link_directories(${TENSORRT_LIBRARY})
    add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -17,7 +17,7 @@ if (TENSORRT_FOUND)
  add_subdirectory(tensorrt)
 endif()

-if (WITH_ANAKIN_SUBGRAPH)
+if (ANAKIN_FOUND)
  add_subdirectory(anakin)
 endif()


--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
-cc_library(anakin_engine SRCS engine.cc)
+cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
 cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)

--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
 cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
 elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc  softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)

-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
-cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
-cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
-cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
-cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
-cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
-cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op)
-cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
-cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax)
-cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op)
-cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
-cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
-cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
-cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op)
-cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
-cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS  anakin_op_converter sum_op selected_rows_functor)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
+cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
+cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
+cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
+cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL SERIAL)
+cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
+cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
+cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
+cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
+cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
+cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
+#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col)
+cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS  anakin_op_converter sum_op selected_rows_functor SERIAL)
--- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc
@@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) {
  PADDLE_ENFORCE(converter != nullptr);
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("act-X", {10, 6, 1, 1});
  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
  framework::OpDesc desc;

--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
@@ -24,7 +24,7 @@ TEST(batch_norm_op, test) {
      {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
       "batch_norm_variance"});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  std::vector<int> param_shape{2};

  validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5});

--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
@@ -24,7 +24,7 @@ namespace anakin {
 TEST(concat_op, test) {
  std::unordered_set<std::string> parameters({""});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("concat_x1", {1, 2, 1, 1});
  validator.DeclInputVar("concat_x2", {1, 3, 1, 1});
  validator.DeclInputVar("concat_x3", {1, 1, 1, 1});
@@ -47,7 +47,7 @@ TEST(concat_op, test) {
 TEST(concat_op, test2) {
  std::unordered_set<std::string> parameters({""});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("concat_x1", {1, 4});
  validator.DeclInputVar("concat_x2", {3, 4});
  validator.DeclInputVar("concat_x3", {2, 4});

--- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc
@@ -27,7 +27,7 @@ TEST(conv2d_op, test) {
  ASSERT_TRUE(conv2d_converter != nullptr);
  std::unordered_set<std::string> parameters({"conv2d-Y"});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("conv2d-X", {1, 3, 3, 3});
  validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1});
  validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3});

--- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc
@@ -24,7 +24,7 @@ namespace anakin {
 TEST(dropout_op, native) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("x", {1, 1, 2, 2});
  validator.DeclOutputVar("out", {1, 1, 2, 2});
  validator.DeclOutputVar("mask", {1, 1, 2, 2});

--- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc
@@ -24,7 +24,7 @@ namespace anakin {
 static void test_elementwise_op(const std::string &op_type) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("x", {1, 1, 2, 2});
  validator.DeclInputVar("y", {1, 1, 2, 2});
  validator.DeclOutputVar("out", {1, 1, 2, 2});

--- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc
@@ -26,7 +26,7 @@ TEST(fc_op, test) {

  std::unordered_set<std::string> parameters({"mul_y"});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("mul_x", {1, 1, 2, 2});
  validator.DeclParamVar("mul_y", {4, 2});
  validator.DeclOutputVar("mul_out", {1, 2});

--- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
@@ -26,7 +26,7 @@ TEST(flatten_op, test) {

  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
  validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
  framework::OpDesc desc;

--- a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc
@@ -24,7 +24,7 @@ namespace anakin {
 TEST(im2sequence_op, native) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  std::vector<int> kernels = {6, 1};
  std::vector<int> strides = {1, 1};

--- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
@@ -27,7 +27,7 @@ void test_pool2d(bool global_pooling, bool ceil_mode,

  framework::Scope scope;
  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  // The ITensor's Dims should not contain the batch size.
  // So, the ITensor's Dims of input and output should be C * H * W.
@@ -72,7 +72,7 @@ void test_pool2d2(bool global_pooling, bool ceil_mode,

  framework::Scope scope;
  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  // The ITensor's Dims should not contain the batch size.
  // So, the ITensor's Dims of input and output should be C * H * W.

--- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc
@@ -26,7 +26,7 @@ static void test_activation_op(const std::string &op_type) {
  PADDLE_ENFORCE(converter != nullptr);
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("act-X", {10, 6, 1, 1});
  validator.DeclOutputVar("act-Out", {10, 6, 1, 1});
  framework::OpDesc desc;

--- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
@@ -25,7 +25,7 @@ TEST(reshape, test) {
  ASSERT_TRUE(converter);
  framework::Scope scope;
  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  // validator.DeclInputVar("reshape-X", {2, 3, 3, 1});
  // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3});
@@ -48,7 +48,7 @@ TEST(reshape, test) {
 TEST(reshape, test2) {
  framework::Scope scope;
  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  validator.DeclInputVar("reshape-X", {1, 2, 4});
  validator.DeclOutputVar("reshape-Out", {1, 4, 2});

--- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
@@ -25,10 +25,10 @@ TEST(softmax, test) {
  ASSERT_TRUE(converter);
  framework::Scope scope;
  std::unordered_set<std::string> parameters;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

-  validator.DeclInputVar("softmax-X", {1, 10});
-  validator.DeclOutputVar("softmax-Out", {1, 10});
+  validator.DeclInputVar("softmax-X", {1, 10, 2});
+  validator.DeclOutputVar("softmax-Out", {1, 10, 2});

  framework::OpDesc desc;
  desc.SetType("softmax");

--- a/paddle/fluid/inference/anakin/convert/test_split_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc
@@ -26,7 +26,7 @@ void AnakinSliceTest(const std::vector<int> &in_shape,
                     const std::vector<int> &sections) {
  std::unordered_set<std::string> parameters({""});
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);

  validator.DeclInputVar("split_input", in_shape);
  std::vector<std::string> output_vars;

--- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc
@@ -25,7 +25,7 @@ namespace anakin {
 TEST(sum, native) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("sum_x1", {1, 2, 1, 2});
  validator.DeclInputVar("sum_x2", {1, 2, 1, 2});
  validator.DeclOutputVar("sum_out", {1, 2, 1, 2});

--- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
@@ -25,7 +25,7 @@ TEST(transpose_op, test) {
  ASSERT_TRUE(converter != nullptr);
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("transpose-X", {2, 3, 4, 5});
  validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3});

@@ -47,7 +47,7 @@ TEST(transpose_op, test) {
 TEST(transpose_op, test2) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
-  AnakinConvertValidation validator(parameters, scope);
+  AnakinConvertValidation validator(parameters, &scope);
  validator.DeclInputVar("transpose-X", {3, 4, 5});
  validator.DeclOutputVar("transpose-Out", {3, 5, 4});


--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -84,7 +84,7 @@ class AnakinConvertValidation {
  AnakinConvertValidation() = delete;

  AnakinConvertValidation(const std::unordered_set<std::string>& parameters,
-                          framework::Scope& scope)
+                          framework::Scope* scope)
      : parameters_(parameters), scope_(scope), place_(0) {
    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
    engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
@@ -108,7 +108,7 @@ class AnakinConvertValidation {

  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
    platform::CUDADeviceContext ctx(place_);
-    auto* x = scope_.Var(name);
+    auto* x = scope_->Var(name);
    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dim_vec));
    RandomizeTensor(x_tensor, place_, ctx);
@@ -120,13 +120,13 @@ class AnakinConvertValidation {
    // should init anakin engine here.

    Singleton<AnakinOpConverter>::Global().ConvertOp(
-        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
+        desc, parameters_, *scope_, engine_.get(), true /*test_mode*/);
    engine_->Freeze();

    std::map<std::string, std::vector<int>> temp_max_input_shape;
    for (const auto& input : op_desc_->InputArgumentNames()) {
      if (parameters_.count(input)) continue;
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(*scope_,
                                                                        input);
      auto t_shape = framework::vectorize2int(t.dims());
      while (t_shape.size() < 4) {
@@ -147,14 +147,14 @@ class AnakinConvertValidation {
               std::unordered_set<std::string> neglected_output = {}) {
    // Execute Fluid Op
    platform::CUDADeviceContext ctx(place_);
-    op_->Run(scope_, place_);
+    op_->Run(*scope_, place_);

    // std::vector<framework::LoDTensor> input_vector;
    // std::vector<framework::LoDTensor> output_vector;
    std::map<std::string, framework::LoDTensor*> inputs;
    for (const auto& input : op_desc_->InputArgumentNames()) {
      if (parameters_.count(input)) continue;
-      auto* var = scope_.FindVar(input);
+      auto* var = scope_->FindVar(input);
      auto tensor = var->GetMutable<framework::LoDTensor>();
      inputs.insert({input, tensor});
    }
@@ -164,7 +164,7 @@ class AnakinConvertValidation {
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      std::vector<float> fluid_out;
-      auto* var = scope_.FindVar(output);
+      auto* var = scope_->FindVar(output);
      auto tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &fluid_out);
      fluid_outputs.push_back(fluid_out);
@@ -177,7 +177,7 @@ class AnakinConvertValidation {
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      std::vector<float> anakin_out;
-      auto* var = scope_.FindVar(output);
+      auto* var = scope_->FindVar(output);
      auto tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &anakin_out);

@@ -189,15 +189,13 @@ class AnakinConvertValidation {
    }
  }

-  framework::Scope& scope() { return scope_; }
-
 private:
  std::unique_ptr<AnakinNvEngineT> engine_{nullptr};
  cudaStream_t stream_;
  std::unique_ptr<framework::OperatorBase> op_;
  std::unique_ptr<framework::OpDesc> op_desc_;
  const std::unordered_set<std::string>& parameters_;
-  framework::Scope& scope_;
+  framework::Scope* scope_;
  platform::CUDAPlace place_;
 };


--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -97,7 +97,11 @@ void IRPassManager::CreatePasses(Argument *argument,

      bool use_static_engine = argument->tensorrt_use_static_engine();
      bool model_from_memory = argument->model_from_memory();
-      if ((!model_from_memory && use_static_engine)) {
+      bool int8_valid = !(model_from_memory && enable_int8);
+      PADDLE_ENFORCE(int8_valid,
+                     "TRT INT8 Now don't support model load from memory.");
+
+      if ((!model_from_memory && use_static_engine) || enable_int8) {
        std::string model_opt_cache_dir =
            argument->Has("model_dir")
                ? argument->model_dir()

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
-cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc)
 if(WITH_TESTING)
  add_dependencies(subgraph_detector gtest)
 endif()
@@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
  set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
 endif()

-if (WITH_ANAKIN_SUBGRAPH) 
+if (ANAKIN_FOUND) 
  cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller)

  set(analysis_deps ${analysis_deps}

--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -35,9 +35,6 @@ namespace analysis {

 using framework::ir::Node;

-std::vector<std::string> ExtractAnakinParameters(
-    const std::unordered_set<Node *> &nodes);
-
 std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get());
@@ -51,11 +48,10 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
  fuser();

  std::vector<std::string> graph_param_names =
-      ExtractAnakinParameters(graph->Nodes());
+      ExtractParameters(graph->Nodes());

  // those parameter already exist in anakin, and should not have another copy
-  // in
-  // fluid.
+  // in fluid.
  std::vector<std::string> repetitive_params;

  for (auto *node : graph->Nodes()) {
@@ -157,74 +153,13 @@ void AnakinSubgraphPass::CreateAnakinOp(
  op_desc->SetType("anakin_engine");

  std::unordered_map<std::string, std::string> output_name_map;
+  auto &subgraph_nodes = *Agent(node).subgraph();

  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
-  // Why we do this?
-  // During the transition from fluid OP to anakin OP, we map
-  // the input and output Tensor(fluid data structure) of fluid OP
-  // to the corresponding ITensor (trt data structure) through the
-  // Tensor name. When we set up ITensor for an variable, we must
-  // ensure that it has not been set before.
-  // If there is variable in the fluid graph, which is not only the
-  // input of a OP, but also the output of a Op, there will be problems.
-  // So we have to rename the variable in the subgraph to make sure
-  // it is either an OP's input or an OP's output.
-
-  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
+  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
+                      &output_names_with_id, &output_names, &output_name_map,
+                      false);

  // When anakin engine runs at the end of the operation,
  // output_mapping help us copy the data from the renamed ITensor
@@ -249,8 +184,7 @@ void AnakinSubgraphPass::CreateAnakinOp(
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
  // Set attrs
-  SetAttr(op_desc->Proto(), "parameters",
-          ExtractAnakinParameters(graph->Nodes()));
+  SetAttr(op_desc->Proto(), "parameters", params);
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
  int predictor_id = Get<int>("predictor_id");
  auto engine_key = GenerateAnakinEngineKey(
@@ -277,34 +211,6 @@ void AnakinSubgraphPass::CreateAnakinOp(
          param_set, output_mapping, anakin_engine);
 }

-std::vector<std::string> ExtractAnakinParameters(
-    const std::unordered_set<Node *> &nodes) {
-  // We can judge whether a variable is a parameter by
-  // its presistable property, but sometimes the presistable
-  // of the feed op output is true, so we have to identify it.
-  std::vector<std::string> feed_outputs;
-  for (const auto &node : nodes) {
-    if (!node->IsOp()) continue;
-    std::string op_type = node->Op()->Type();
-    if (op_type == "feed" || op_type == "fetch") {
-      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
-      std::copy(output_names.begin(), output_names.end(),
-                std::back_inserter(feed_outputs));
-    }
-  }
-
-  std::vector<std::string> parameters;
-  for (const auto &node : nodes) {
-    if (!node->IsVar()) continue;
-    if (node->Var()->Persistable() &&
-        std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
-            feed_outputs.end()) {
-      parameters.push_back(node->Name());
-    }
-  }
-  return parameters;
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/anakin/engine.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"

 using anakin::Precision;
 using anakin::saber::NV;

--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
+#include <algorithm>
+#include <string>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+using framework::ir::Node;
+
+std::vector<std::string> ExtractParameters(
+    const std::unordered_set<Node *> &nodes) {
+  // We can judge whether a variable is a parameter by
+  // its presistable property, but sometimes the presistable
+  // of the feed op output is true, so we have to identify it.
+  std::vector<std::string> feed_outputs;
+  for (const auto &node : nodes) {
+    if (!node->IsOp()) continue;
+    std::string op_type = node->Op()->Type();
+    if (op_type == "feed" || op_type == "fetch") {
+      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
+      std::copy(output_names.begin(), output_names.end(),
+                std::back_inserter(feed_outputs));
+    }
+  }
+
+  std::vector<std::string> parameters;
+  for (const auto &node : nodes) {
+    if (!node->IsVar()) continue;
+    if (node->Var()->Persistable() &&
+        std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
+            feed_outputs.end()) {
+      parameters.push_back(node->Name());
+    }
+  }
+  return parameters;
+}
+
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map,
+    bool is_trt) {
+  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
+  // When there are more than two convolutions of 1 * 1 with the same input, the
+  // paddle-tensorrt will do the merging optimization, which fuse those conv
+  // into one conv, and then trigger bug. So,  We should use strategy to avoid
+  // this optimization for the time being. This bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      same_hierarchy_conv2d_num_map;
+
+  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
+    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
+    framework::OpDesc op_desc(*op, nullptr);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    std::unordered_map<std::string, framework::ir::Node *> in_vars;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+      in_vars[in_var->Name()] = in_var;
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    if (op_desc.Type() == "conv2d" && is_trt) {
+      auto input_var_name = op_desc.Input("Input").front();
+      auto filter_var_name = op_desc.Input("Filter").front();
+      auto out_var_name = op_desc.Output("Output").front();
+      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
+      const std::vector<int> strides =
+          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+      const std::vector<int> paddings =
+          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
+        (*output_names_with_id)
+            .insert(out_var_name + std::to_string(var2id[out_var_name]));
+        (*output_names).insert(out_var_name);
+      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
+                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
+                 paddings[1] == 0) {
+        same_hierarchy_conv2d_num_map[input_var_name] += 1;
+      }
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id->count(arg_value_with_id)) {
+          (*output_name_map)[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#pragma once
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+using framework::ir::Node;
+
+std::vector<std::string> ExtractParameters(
+    const std::unordered_set<Node *> &nodes);
+
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map,
+    bool is_trt = true);
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -31,17 +31,6 @@ namespace analysis {

 using framework::ir::Node;

-std::vector<std::string> ExtractParameters(
-    const std::unordered_set<Node *> &nodes);
-
-void RenameAndGetOutputs(
-    const std::vector<framework::ir::Node *> &subgraph_nodes,
-    framework::BlockDesc *block_desc,
-    const std::set<std::string> &input_names_with_id,
-    std::set<std::string> *output_names_with_id,
-    std::set<std::string> *output_names,
-    std::unordered_map<std::string, std::string> *output_name_map);
-
 std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
@@ -217,7 +206,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  // Get "" when there is no cached calibration table data.
  bool load_from_memory = Get<bool>("model_from_memory");
  std::string calibration_data = "";
-  if (!load_from_memory && use_static_engine) {
+  if (enable_int8) {
    calibration_data = GetTrtCalibTableData(
        Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
  }
@@ -226,13 +215,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
  SetAttr(op_desc->Proto(), "engine_key", engine_key);
  std::string trt_engine_serialized_data = "";
-  if (load_from_memory) {
-    std::map<std::string, std::string> engine_opt_info =
-        Get<std::map<std::string, std::string>>("engine_opt_info");
-    if (engine_opt_info.count(engine_key)) {
-      trt_engine_serialized_data = engine_opt_info[engine_key];
-    }
-  }
+
  SetAttr(op_desc->Proto(), "engine_serialized_data",
          trt_engine_serialized_data);

@@ -240,176 +223,62 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  if (enable_int8 && calibration_data.size() != 0) {
    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
  }
-
  // When in int8 mode and calibration_mode, the program just produce the
  // calibration table data.
  bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
-  if (!calibration_mode && use_static_engine &&
-      trt_engine_serialized_data.empty()) {
-    std::copy(params.begin(), params.end(),
-              std::back_inserter(*repetitive_params));
-
-    if (use_static_engine && !load_from_memory) {
-      trt_engine_serialized_data = GetTrtEngineSerializedData(
-          Get<std::string>("model_opt_cache_dir"), engine_key);
-    }
+  if (calibration_mode) {
+    // calibraion mode means generate int8 calibration table data process.
+    return;
+  }

-    if (trt_engine_serialized_data.empty()) {
-      LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
-                   "kernel etc). This process may cost a lot of time.";
-      std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
-          new tensorrt::TensorRTEngine(
-              Get<int>("max_batch_size"), Get<int>("workspace_size"),
-              enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
-      auto *scope = param_scope();
-      framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
-      std::unordered_set<std::string> param_set(params.begin(), params.end());
-      inference::Singleton<inference::tensorrt::OpConverter>::Global()
-          .ConvertBlockToTRTEngine(
-              &block_desc_temp, *scope,
-              std::vector<std::string>(input_names.begin(), input_names.end()),
-              param_set, output_mapping, trt_engine.get());
-      nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
-      trt_engine_serialized_data =
-          std::string((const char *)serialized_engine_data->data(),
-                      serialized_engine_data->size());
-
-      if (use_static_engine && !load_from_memory) {
-        SaveTrtEngineSerializedDataToFile(
-            GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
-                                       engine_key),
-            trt_engine_serialized_data);
-      }
-    } else {
+  std::copy(params.begin(), params.end(),
+            std::back_inserter(*repetitive_params));
+  bool need_serialize = (use_static_engine && !load_from_memory);
+
+  if (need_serialize) {
+    trt_engine_serialized_data = GetTrtEngineSerializedData(
+        Get<std::string>("model_opt_cache_dir"), engine_key);
+    // we can load the engine info serialized before from the disk.
+    if (!trt_engine_serialized_data.empty()) {
+      SetAttr(op_desc->Proto(), "engine_serialized_data",
+              trt_engine_serialized_data);
      LOG(INFO) << "Load TRT Optimized Info from "
                << GetTrtEngineSerializedPath(
                       Get<std::string>("model_opt_cache_dir"), engine_key);
-    }
-
-    SetAttr(op_desc->Proto(), "engine_serialized_data",
-            trt_engine_serialized_data);
-  }
-}
-
-std::vector<std::string> ExtractParameters(
-    const std::unordered_set<Node *> &nodes) {
-  // We can judge whether a variable is a parameter by
-  // its presistable property, but sometimes the presistable
-  // of the feed op output is true, so we have to identify it.
-  std::vector<std::string> feed_outputs;
-  for (const auto &node : nodes) {
-    if (!node->IsOp()) continue;
-    std::string op_type = node->Op()->Type();
-    if (op_type == "feed" || op_type == "fetch") {
-      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
-      std::copy(output_names.begin(), output_names.end(),
-                std::back_inserter(feed_outputs));
+      return;
    }
  }

-  std::vector<std::string> parameters;
-  for (const auto &node : nodes) {
-    if (!node->IsVar()) continue;
-    if (node->Var()->Persistable() &&
-        std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) ==
-            feed_outputs.end()) {
-      parameters.push_back(node->Name());
-    }
-  }
-  return parameters;
-}
-
-void RenameAndGetOutputs(
-    const std::vector<framework::ir::Node *> &subgraph_nodes,
-    framework::BlockDesc *block_desc,
-    const std::set<std::string> &input_names_with_id,
-    std::set<std::string> *output_names_with_id,
-    std::set<std::string> *output_names,
-    std::unordered_map<std::string, std::string> *output_name_map) {
-  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
-  // When there are more than two convolutions of 1 * 1 with the same input, the
-  // paddle-tensorrt will do the merging optimization, which fuse those conv
-  // into one conv, and then trigger bug. So,  We should use strategy to avoid
-  // this optimization for the time being. This bug will be fixed in the future.
-  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-      same_hierarchy_conv2d_num_map;
-
-  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
-    framework::OpDesc op_desc(*op, nullptr);
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    std::unordered_map<std::string, framework::ir::Node *> in_vars;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-      in_vars[in_var->Name()] = in_var;
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    if (op_desc.Type() == "conv2d") {
-      auto input_var_name = op_desc.Input("Input").front();
-      auto filter_var_name = op_desc.Input("Filter").front();
-      auto out_var_name = op_desc.Output("Output").front();
-      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
-      const std::vector<int> strides =
-          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
-      const std::vector<int> paddings =
-          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
-      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
-        (*output_names_with_id)
-            .insert(out_var_name + std::to_string(var2id[out_var_name]));
-        (*output_names).insert(out_var_name);
-      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
-                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
-                 paddings[1] == 0) {
-        same_hierarchy_conv2d_num_map[input_var_name] += 1;
-      }
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id->count(arg_value_with_id)) {
-          (*output_name_map)[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
+  // the following code will NOT run in following situation:
+  // 1. calibraion mode (generate trt int8 calibraiton table data)
+  // 2. already load serialized trt engine info.
+  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+               "kernel etc). This process may cost a lot of time.";
+  std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
+      new tensorrt::TensorRTEngine(
+          Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
+          calibrator.get(), Get<int>("gpu_device_id")));
+  auto *scope = param_scope();
+  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+  std::unordered_set<std::string> param_set(params.begin(), params.end());
+  inference::Singleton<inference::tensorrt::OpConverter>::Global()
+      .ConvertBlockToTRTEngine(
+          &block_desc_temp, *scope,
+          std::vector<std::string>(input_names.begin(), input_names.end()),
+          param_set, output_mapping, trt_engine.get());
+  nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
+  trt_engine_serialized_data =
+      std::string((const char *)serialized_engine_data->data(),
+                  serialized_engine_data->size());
+
+  if (need_serialize) {
+    SaveTrtEngineSerializedDataToFile(
+        GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                   engine_key),
+        trt_engine_serialized_data);
  }
+  SetAttr(op_desc->Proto(), "engine_serialized_data",
+          trt_engine_serialized_data);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -20,6 +20,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"

 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -27,7 +27,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()

-if (WITH_ANAKIN_SUBGRAPH)
+if (ANAKIN_FOUND)
    set(inference_deps ${inference_deps} anakin_op_converter anakin_engine)
 endif()


--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -40,10 +40,11 @@
 #if PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
-
 #endif

+#if PADDLE_WITH_ANAKIN
 #include "paddle/fluid/inference/anakin/convert/op_converter.h"
+#endif

 DECLARE_bool(profile);

@@ -817,6 +818,7 @@ USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
 #endif

+#if PADDLE_WITH_ANAKIN
 USE_ANAKIN_CONVERTER(mul);
 USE_ANAKIN_CONVERTER(fc);
 USE_ANAKIN_CONVERTER(conv2d);
@@ -838,3 +840,4 @@ USE_ANAKIN_CONVERTER(detection_out);
 USE_ANAKIN_CONVERTER(density_prior_box);
 USE_ANAKIN_CONVERTER(dropout);
 USE_ANAKIN_CONVERTER(sum);
+#endif
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -81,6 +81,8 @@ PaddleDType ZeroCopyTensor::type() {
    return PaddleDType::FLOAT32;
  } else if (type == framework::proto::VarType::INT64) {
    return PaddleDType::INT64;
+  } else if (type == framework::proto::VarType::INT32) {
+    return PaddleDType::INT32;
  } else {
    LOG(ERROR) << "unknown type, only support float32 and int64 now.";
  }

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -34,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
    add_subdirectory(tensorrt)
 endif()

-if (WITH_ANAKIN_SUBGRAPH) 
+if (ANAKIN_FOUND) 
    add_subdirectory(anakin)
 endif()


--- a/paddle/fluid/operators/anakin/anakin_engine_op.cc
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc
@@ -39,8 +39,7 @@ class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker {

 class AnakinEngineInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };

 }  // namespace operators