From 69d37f81d7339c2f3ea1ba556fd1aa3e53b81117 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Wed, 20 Mar 2019 12:57:13 +0000
Subject: [PATCH] Cherry-pick from feature/anakin-engine: refine the Anakin
 subgraph (#16157) and support changing the input size

---
 .../framework/ir/graph_pattern_detector.cc    | 16 +++++-
 .../simplify_anakin_detection_pattern_pass.cc | 13 ++++-
 paddle/fluid/inference/anakin/CMakeLists.txt  |  2 +-
 .../inference/anakin/convert/batch_norm.cc    |  2 +
 .../anakin/convert/density_prior_box.cc       | 31 +++++++----
 .../inference/anakin/convert/op_converter.h   | 30 ++++++++++
 .../fluid/inference/anakin/convert/pool2d.cc  |  2 +-
 .../fluid/inference/anakin/convert/softmax.cc |  2 +-
 .../anakin/convert/test_batch_norm_op.cc      |  3 +-
 .../anakin/convert/test_pool2d_op.cc          | 41 ++++++++++++++
 .../inference/anakin/convert/ut_helper.h      |  2 +-
 paddle/fluid/inference/anakin/engine.cc       | 48 ++++++++++------
 paddle/fluid/inference/anakin/engine.h        | 45 ++++++++++++++-
 .../inference/anakin/test_anakin_engine.cc    |  7 +--
 paddle/fluid/inference/analysis/argument.h    |  6 ++
 .../inference/analysis/ir_pass_manager.cc     |  9 +++
 .../ir_passes/anakin_subgraph_pass.cc         | 55 +++++++++++++++----
 .../analysis/ir_passes/anakin_subgraph_pass.h | 10 +++-
 .../ir_passes/tensorrt_subgraph_pass.cc       | 18 +++++-
 paddle/fluid/inference/api/CMakeLists.txt     |  5 --
 paddle/fluid/inference/api/analysis_config.cc | 27 ++++++++-
 .../fluid/inference/api/analysis_predictor.cc |  9 ++-
 .../fluid/inference/api/analysis_predictor.h  |  5 +-
 .../inference/api/paddle_analysis_config.h    | 13 +++++
 .../inference/api/paddle_pass_builder.cc      | 12 ++++
 .../fluid/inference/api/paddle_pass_builder.h |  3 +
 .../fluid/operators/anakin/anakin_engine_op.h | 54 +++++++++++++-----
 27 files changed, 390 insertions(+), 80 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index f5d240f2c62..77c9a94df2f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1527,6 +1527,16 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
                            ->assert_is_op_output("box_coder")
                            ->AsIntermediate();
 
+  auto transpose_before_nms =
+      pattern->NewNode(GetNodeName("transpose_before_nms"))
+          ->assert_is_op("transpose2");
+
+  auto transpose_before_nms_out =
+      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
+          ->assert_is_op_output("transpose2")
+          ->assert_is_op_input("multiclass_nms", "Scores")
+          ->AsIntermediate();
+
   auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
                                ->assert_is_op("multiclass_nms")
                                ->assert_op_has_n_inputs("multiclass_nms", 2);
@@ -1577,8 +1587,10 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
       {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
   box_coder_out->LinksFrom({box_coder_op});
 
-  multiclass_nms_op
-      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
+  transpose_before_nms_out->LinksFrom({transpose_before_nms});
+
+  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
       .LinksTo({multiclass_nms_out});
 
   return multiclass_nms_out;
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
index 462f077eb7a..5ab10ba39fa 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
@@ -45,7 +45,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass::ApplyImpl(
   input_nodes.push_back(gpd.mutable_pattern()
                             ->NewNode("x" + std::to_string(times + 1))
-                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->assert_is_op_input("transpose2")
                             ->AsInput());
 
   patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
@@ -106,6 +106,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass::ApplyImpl(
     Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
 
     Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+    Node *transpose_before_nms =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms"));
+    Node *transpose_before_nms_out =
+        subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
+
     Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
     Node *multiclass_nms_out =
         subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
@@ -133,11 +138,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass::ApplyImpl(
           nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
     }
 
-    int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+    // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
     framework::OpDesc concat1_desc;
     concat1_desc.SetType("concat");
     concat1_desc.SetInput("X", concat1_input_names);
-    concat1_desc.SetAttr("axis", axis);
+    concat1_desc.SetAttr("axis", 2);
     concat1_desc.SetOutput("Out", {concat_out1->Name()});
 
     auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
@@ -184,6 +189,8 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass::ApplyImpl(
     delete_nodes.insert(concat_out2);
     delete_nodes.insert(box_coder_op);
     delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(transpose_before_nms);
+    delete_nodes.insert(transpose_before_nms_out);
     delete_nodes.insert(multiclass_nms);
 
     new_add_concat_op->outputs.push_back(concat_out1);
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
index 2da280ed29b..1646c7d16ec 100644
--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(anakin_engine SRCS engine.cc)
-nv_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
index ebe81dabcbc..94014802bdb 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -43,11 +43,13 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
+  // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));
 
   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
   engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
   engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));
 
   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
index 565a95f17b1..a55c153f99a 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -27,8 +27,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {
 
-void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
-                                            const framework::Scope &scope,
+void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
+                                            const framework::Scope& scope,
                                             bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
@@ -42,34 +42,45 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto fixed_ratios =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
   auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto& ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }
 
   // lack flip
-  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto& ele : variances) {
+    LOG(INFO) << ele;
+  }
 
   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
+  t_order.push_back("MAX");
+
   std::vector<float> temp_v = {};
 
   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
   engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
-  engine_->AddOpAttr(op_name, "is_flip", false);
-  engine_->AddOpAttr(op_name, "is_clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
   engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "step_h", step_h);
   engine_->AddOpAttr(op_name, "step_w", step_w);
   engine_->AddOpAttr(op_name, "offset", offset);
-  engine_->AddOpAttr<std::vector<std::string>>(op_name, "order", order);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
 }
 
 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index eb75f1f62ab..9155a7c300a 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -68,6 +69,35 @@ class AnakinOpConverter {
       ConvertOp(op, parameters, scope, engine);
     }
   }
+
+  // The scope here should be initialized with the parameter variables.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    engine->Freeze();
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      auto *var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+
+      auto var_shape = var->GetShape();
+      PADDLE_ENFORCE(var_shape.size() == 4);
+      std::vector<int> input_shape;
+      for (int i = 0; i < var_shape.size(); i++) {
+        input_shape.push_back(var_shape[i]);
+      }
+      input_shape[0] = 1;
+
+      engine->SetInputShape(input, input_shape);
+    }
+    engine->Optimize();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}
diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc
index 1b251149f3e..3dee589ac47 100644
--- a/paddle/fluid/inference/anakin/convert/pool2d.cc
+++ b/paddle/fluid/inference/anakin/convert/pool2d.cc
@@ -55,7 +55,7 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
   if (pool_type == "max") {
     anakin_pool_type = "MAX";
   } else if (pool_type == "avg") {
-    anakin_pool_type = "AVG";
+    anakin_pool_type = "AVGEXC";
   } else {
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }
diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc
index fd791ef98bc..d5cd8908ebf 100644
--- a/paddle/fluid/inference/anakin/convert/softmax.cc
+++ b/paddle/fluid/inference/anakin/convert/softmax.cc
@@ -33,7 +33,7 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
 
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "axis", 2);
 }
 
 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
index c6eebf5d0c4..2e438dd7241 100644
--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
@@ -52,8 +52,9 @@ TEST(batch_norm_op, test) {
   desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
 
   float eps = 1e-5f;
+  bool is_test = true;
   desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", true);
+  desc.SetAttr("is_test", is_test);
 
   validator.SetOp(*desc.Proto());
 
diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
index bb47c70a071..95cb4194957 100644
--- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
@@ -64,11 +64,52 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }
 
+void test_pool2d2(bool global_pooling, bool ceil_mode,
+                  std::string pool_type = "max") {
+  auto* pool2d_converter =
+      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
+  ASSERT_TRUE(pool2d_converter);
+
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+ validator.DeclInputVar("pool2d_x", {1, 1, 17, 17}); + validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d_x"}); + desc.SetOutput("Out", {"pool2d_out"}); + + std::vector ksize({3, 3}); + std::vector strides({1, 1}); + std::vector paddings({1, 1}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", true); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } +TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); } } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 815a88fb489..1b0ef8c7dbe 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -168,7 +168,7 @@ class AnakinConvertValidation { outputs.insert({output, tensor}); } - engine_->Execute(inputs, outputs); + engine_->Execute(inputs, outputs, stream_); int i_output = 0; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index fb95bd03e07..822627b8a26 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -33,9 +33,12 @@ namespace inference { namespace anakin { template -AnakinEngine::AnakinEngine(bool need_summary) +AnakinEngine::AnakinEngine(bool need_summary, + int device) : graph_(new AnakinGraphT()), - net_(new AnakinNetT(need_summary)) {} + net_(new AnakinNetT(need_summary)) { + device_ = device; +} template AnakinEngine::~AnakinEngine() {} @@ -63,33 +66,44 @@ void AnakinEngine::AddOp( template void AnakinEngine::Execute( const std::map &inputs, - const std::map &outputs) { + const std::map &outputs, + cudaStream_t stream) { for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); + auto fluid_input_shape = framework::vectorize2int(tensor->dims()); + auto *anakin_input = net_->get_in(input.first); - auto anakin_input_shape = anakin_input->valid_shape(); - PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(), - "the fluid input size should be equal to anakin"); + auto net_shape = anakin_input->shape(); + if (tensor->numel() > net_shape.count()) { + graph_->Reshape(input.first, fluid_input_shape); + net_.reset(new AnakinNetT(true)); + net_->init(*graph_); + anakin_input = net_->get_in(input.first); + } + + anakin_input->reshape(fluid_input_shape); + net_shape = anakin_input->shape(); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_input_shape); - anakin_input->copy_from(tmp_anakin_tensor); + net_shape); + anakin_input->share_from(tmp_anakin_tensor); } + net_->prediction(); for (const auto &output : outputs) { + platform::CUDAPlace gpu_place(device_); 
auto *tensor = output.second; - auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); auto *anakin_output = net_->get_out(output.first); + auto *anakin_data = anakin_output->data(); auto anakin_output_shape = anakin_output->valid_shape(); - PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(), - "the fluid output size should be equal to anakin"); - ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_output_shape); - anakin_output->share_from(tmp_anakin_tensor); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(gpu_place); + + memory::Copy(gpu_place, static_cast(fluid_data), gpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float), stream); } - net_->prediction(); + cudaDeviceSynchronize(); } diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 408ad3b9f62..2613fc7f9b5 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/engine.h" @@ -26,8 +27,12 @@ #include "framework/core/net/net.h" #include "framework/core/types.h" #include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" #include "saber/saber_types.h" +using anakin::Precision; +using anakin::saber::NV; + namespace anakin { template @@ -50,7 +55,7 @@ class AnakinEngine { using GraphT = ::anakin::graph::Graph; public: - explicit AnakinEngine(bool need_summary = false); + explicit AnakinEngine(bool need_summary = false, int device = 0); ~AnakinEngine(); void InitGraph(); void SetInputShape(const std::string &name, std::vector shape); @@ -69,14 +74,50 @@ class AnakinEngine { void Freeze(); void Optimize(); void Save(std::string path) { graph_->save(path); } + // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); + // } + // void LoadSerializedData(const std::string& data) { + // graph_->load_from_string(data); } void Execute(const std::map &inputs, - const std::map &outputs); + const std::map &outputs, + cudaStream_t stream); private: + int device_; std::unique_ptr graph_; std::unique_ptr net_; }; +class AnakinEngineManager { + using AnakinNvEngineT = AnakinEngine; + + public: + bool HasEngine(const std::string &name) const { + if (engines_.count(name) == 0) return false; + return engines_.at(name).get() != nullptr; + } + AnakinNvEngineT *Get(const std::string &name) const { + return engines_.at(name).get(); + } + + AnakinNvEngineT *Create(bool need_summary, int device, + std::string engine_name) { + std::unique_lock lk(mut_); + auto *p = new AnakinEngine(need_summary, device); + engines_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto &item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; + std::mutex mut_; +}; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 571294d3e22..8fd6b8bec9a 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -17,9 +17,6 @@ limitations under the License. 
*/ #include -#include "framework/core/net/net.h" -#include "framework/graph/graph.h" -#include "framework/graph/graph_global_mem.h" #include "paddle/fluid/inference/anakin/engine.h" using anakin::graph::GraphGlobalMem; @@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) { auto *y_data = y.mutable_data(platform::CUDAPlace()); std::map outputs = {{"y", &y}}; - engine_->Execute(inputs, outputs); + cudaStream_t stream; + + engine_->Execute(inputs, outputs, stream); auto *y_data_gpu = y_data; float y_data_cpu[2]; cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 321deccf867..43452906974 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,6 +23,7 @@ #pragma once +#include #include #include #include @@ -55,6 +56,7 @@ struct Argument { using unique_ptr_t = std::unique_ptr>; using fusion_statis_t = std::unordered_map; + using engine_opt_info_t = std::map; bool Has(const std::string& key) const { return valid_fields_.count(key); } @@ -107,12 +109,14 @@ struct Argument { private: \ unique_ptr_t field__##_; + DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); // Model path DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); // Model specified with program and parameters files. DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); + DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -146,6 +150,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); + DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); + // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 07423a14855..1327d69d909 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include #include #include #include @@ -71,6 +72,11 @@ void IRPassManager::CreatePasses(Argument *argument, if (pass_name == "anakin_subgraph_pass") { pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); + pass->Set("predictor_id", new int(argument->predictor_id())); } if (pass_name == "tensorrt_subgraph_pass") { @@ -95,6 +101,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("use_static_engine", new bool(argument->tensorrt_use_static_engine())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 1a4c24e800b..4b21bfe6bcf 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -21,6 +21,7 @@ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/op_teller.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h" @@ -45,12 +46,20 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; - SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */); + SubGraphFuser fuser(graph.get(), teller, 0 /* min_subgraph_size */); fuser(); + std::vector graph_param_names = + ExtractAnakinParameters(graph->Nodes()); + + // those parameter already exist in anakin, and should not have another copy + // in + // fluid. 
+ std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateAnakinOp(node, graph.get()); + CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); @@ -64,13 +73,15 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } -std::string GenerateAnakinEngineKey( - const std::set &engine_inputs, - const std::set &engine_outputs) { +std::string GenerateAnakinEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + std::string id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -78,12 +89,15 @@ std::string GenerateAnakinEngineKey( for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, - Graph *graph) const { +void AnakinSubgraphPass::CreateAnakinOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -117,10 +131,16 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); @@ -231,10 +251,25 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, SetAttr(op_desc->Proto(), "parameters", ExtractAnakinParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); - auto engine_key = - GenerateAnakinEngineKey(input_names_with_id, output_names_with_id); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateAnakinEngineKey( + input_names_with_id, output_names_with_id, std::to_string(predictor_id)); SetAttr(op_desc->Proto(), "engine_key", engine_key); + + auto *anakin_engine = + inference::Singleton::Global().Create( + true, Get("gpu_device_id"), engine_key); + + auto *scope = param_scope(); + std::unordered_set param_set(params.begin(), params.end()); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + + inference::Singleton::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); } std::vector ExtractAnakinParameters( @@ -246,7 +281,7 @@ std::vector ExtractAnakinParameters( for (const auto &node : nodes) { if (!node->IsOp()) continue; std::string op_type = node->Op()->Type(); - if (op_type == "feed") { + if (op_type == "feed" || op_type == "fetch") { std::vector output_names = node->Op()->OutputArgumentNames(); std::copy(output_names.begin(), output_names.end(), 
std::back_inserter(feed_outputs)); diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index 5f45ff579d5..a732cb55921 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -15,8 +15,13 @@ #pragma once #include #include +#include +#include #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/anakin/engine.h" +using anakin::Precision; +using anakin::saber::NV; namespace paddle { namespace inference { namespace analysis { @@ -27,8 +32,9 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateAnakinOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index d4e2da8957f..7407883d6cc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -219,7 +220,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); - SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + bool load_from_memory = Get("model_from_memory"); + std::string trt_engine_serialized_data = ""; + if (load_from_memory) { + std::map engine_opt_info = + Get>("engine_opt_info"); + if (engine_opt_info.count(engine_key)) { + trt_engine_serialized_data = engine_opt_info[engine_key]; + } + } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); std::unique_ptr calibrator; if (enable_int8 && calibration_data.size() != 0) { @@ -230,10 +241,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // When in int8 mode and calibration_mode, the program just produce the // calibration table data. 
bool calibration_mode = (enable_int8 && calibration_data.size() == 0); - if (!calibration_mode && use_static_engine) { + if (!calibration_mode && use_static_engine && + trt_engine_serialized_data.empty()) { std::copy(params.begin(), params.end(), std::back_inserter(*repetitive_params)); - std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + trt_engine_serialized_data = GetTrtEngineSerializedData( Get("model_opt_cache_dir"), engine_key); if (trt_engine_serialized_data.empty()) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 29d21dc171c..38313754ea9 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -64,8 +64,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI anakin_target(inference_anakin_api) anakin_target(inference_anakin_api_shared) endif() -if (WITH_ANAKIN_SUBGRAPH) - inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid) - inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid) - inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid) -endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 1be25de4973..59e8f48313c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { if (!pass_builder_.get()) { @@ -230,6 +231,20 @@ void AnalysisConfig::Update() { } } + if (use_anakin_) { + PADDLE_ENFORCE(!use_tensorrt_, + "Anakin sub-graph and TensorRT sub-graph are not allowed to " + "run at the same time!"); + PADDLE_ENFORCE( + use_gpu_, + "Anakin sub-graph engine need gpu, please use the EnableGpu API."); + + pass_builder()->ClearPasses(); + for (const auto &pass : kAnakinSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -266,7 +281,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; - + ss << use_anakin_; return ss.str(); } @@ -316,6 +331,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +void AnalysisConfig::SetEngineOptInfo( + std::map engine_opt_info) { + engine_opt_info_ = engine_opt_info; +} + NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig config; config.model_dir = model_dir_; @@ -332,5 +352,8 @@ void AnalysisConfig::SwitchIrDebug(int x) { ir_debug_ = x; Update(); } - +void AnalysisConfig::EnableAnakinEngine() { + use_anakin_ = true; + Update(); +} } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e2bb446d00c..fa6c6f500dc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -351,7 +351,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetStaticMemoryOptimForceUpdate( config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); + argument_.SetEngineOptInfo(config_.engine_opt_info_); // Analyze inference_program + argument_.SetUseAnakin(config_.anakin_engine_enabled()); + argument_.SetPredictorID(predictor_id_); if (!config_.model_dir().empty()) { 
argument_.SetModelDir(config_.model_dir()); } else { @@ -375,6 +378,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } + if (config_.use_gpu() && config_.anakin_engine_enabled()) { + LOG(INFO) << "Anakin subgraph engine is enabled"; + } + if (config_.use_mkldnn_) { LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); @@ -404,7 +411,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { // 1. GPU memory - PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); std::vector flags; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5c0535d63e0..087bfbd0026 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -45,7 +45,9 @@ using framework::NaiveExecutor; */ class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) { + predictor_id_ = inference::GetUniqueId(); + } ~AnalysisPredictor(); bool Init(const std::shared_ptr &parent_scope, @@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor { const size_t max_shape_collect_count_{1000}; int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. std::vector>> batch_var_shapes_; + int predictor_id_; private: // Some status here that help to determine the status inside the predictor. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9b05c335047..80f256513e1 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -14,9 +14,11 @@ #pragma once #include +#include #include #include #include +#include #include /*! \file */ @@ -140,6 +142,14 @@ struct AnalysisConfig { /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** + * \brief Turn on the usage of Anakin sub-graph engine. + */ + void EnableAnakinEngine(); + + /** A boolean state indicating whether the Anakin sub-graph engine is used. + */ + bool anakin_engine_enabled() const { return use_anakin_; } /** \brief Control whether to debug IR graph analysis phase. * @@ -185,6 +195,7 @@ struct AnalysisConfig { /** A boolean state telling whether the model is set from the CPU memory. */ bool model_from_memory() const { return model_from_memory_; } + void SetEngineOptInfo(std::map engine_opt_info); /** Turn on memory optimize * NOTE still in development, will release latter. 
@@ -258,6 +269,8 @@ struct AnalysisConfig { std::string serialized_info_cache_; mutable std::unique_ptr pass_builder_; + bool use_anakin_{false}; + std::map engine_opt_info_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 22c527cfc11..f6d82a57d29 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -68,6 +68,17 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +// The following passes works for Anakin sub-graph engine. +const std::vector kAnakinSubgraphPasses({ + "infer_clean_graph_pass", // + "simplify_anakin_detection_pattern_pass3", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "anakin_subgraph_pass", +}); + GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // @@ -120,4 +131,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { }); use_gpu_ = false; } +void PaddlePassBuilder::ClearPasses() { passes_.clear(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 2524d89fcd1..c93aec1a225 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -45,6 +45,7 @@ class PaddlePassBuilder { /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); + void ClearPasses(); /** Append an analysis pass. */ void AppendAnalysisPass(const std::string &pass); @@ -142,4 +143,6 @@ class GpuPassStrategy : public PassStrategy { virtual ~GpuPassStrategy() = default; }; +extern const std::vector kAnakinSubgraphPasses; + } // namespace paddle diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index 93569367ffd..7a70836652d 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -16,6 +16,7 @@ limitations under the License. 
 */
 
 #ifdef PADDLE_WITH_CUDA
 
+#include
 #include
 #include
 #include
 
@@ -52,8 +53,9 @@ class AnakinEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<AnakinNvEngineT> anakin_engine_;
+  mutable AnakinNvEngineT *anakin_engine_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
 
  public:
   AnakinEngineOp(const std::string &type,
@@ -67,6 +69,7 @@ class AnakinEngineOp : public framework::OperatorBase {
     for (const auto &param : params) {
       param_names_.insert(param);
     }
+    anakin_engine_ = nullptr;
   }
 
  protected:
@@ -77,12 +80,12 @@ class AnakinEngineOp : public framework::OperatorBase {
 
   void RunAnakin(const framework::Scope &scope,
                  const platform::Place &dev_place) const {
-    if (anakin_engine_.get() == nullptr) {
-      anakin_engine_.reset(new AnakinEngine(true));
-      Prepare(scope, dev_place, anakin_engine_.get());
-    }
+    auto *engine = GetEngine(scope, dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
 
-    auto *engine = anakin_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass at least one input");
 
     std::vector<std::string> output_maps =
@@ -95,24 +98,48 @@ class AnakinEngineOp : public framework::OperatorBase {
       auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
       auto t_shape = framework::vectorize(t.dims());
+      auto *anakin_input = engine->Net()->get_in(x);
+      auto net_shape = anakin_input->shape();
+      size_t anakin_net_input_size = net_shape.count() * sizeof(float);
+      size_t fluid_input_size = t.memory_size();
+
+      if (fluid_input_size < anakin_net_input_size) {
+        framework::LoDTensor temp_t;
+        auto t_dims = t.dims();
+        temp_t.Resize(t_dims);
+        TensorCopySync(t, dev_place, &temp_t);
+        t.Resize(framework::make_ddim(net_shape));
+        t.mutable_data<float>(dev_place);
+        TensorCopySync(temp_t, dev_place, &t);
+      }
       inputs.insert({x, &t});
     }
 
     std::map<std::string, framework::LoDTensor *> outputs;
     int output_index = 0;
     for (const auto &y : Outputs("Ys")) {
-      std::vector<int> ddim =
-          engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // std::vector<int> ddim =
+      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
      // we need to get the anakin output shape.
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
-      fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
+      // fluid_t->Resize(framework::make_ddim(ddim));
+      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
-    engine->Execute(inputs, outputs);
+    engine->Execute(inputs, outputs, stream);
+  }
+
+  AnakinNvEngineT *GetEngine(const framework::Scope &scope,
+                             const platform::Place &dev_place) const {
+    if (anakin_engine_ == nullptr) {
+      anakin_engine_ =
+          inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
+              .Get(engine_key_);
+    }
+    return anakin_engine_;
   }
 
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
@@ -128,8 +155,6 @@ class AnakinEngineOp : public framework::OperatorBase {
     inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
         .ConvertBlock(block_desc, param_names_, scope, engine);
     engine->Freeze();
-    engine->Optimize();
-
     for (const auto &x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       auto &t =
@@ -142,6 +167,9 @@ class AnakinEngineOp : public framework::OperatorBase {
       }
       engine->SetInputShape(x, t_shape);
     }
+
+    engine->Optimize();
+    engine->InitGraph();
   }
 };
 
-- 
GitLab
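
For reference, the user-facing switch this patch introduces can be driven as below. This is a minimal usage sketch, not part of the patch: the model directory, input name, and tensor shape are hypothetical placeholders, and it assumes the pre-existing AnalysisConfig / CreatePaddlePredictor / PaddleTensor inference APIs that the patch builds on.

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet");        // hypothetical model directory
  config.EnableUseGpu(100 /* MB */, 0);  // the Anakin engine requires GPU
  config.EnableAnakinEngine();           // added by this patch; Update() swaps
                                         // the pass list to kAnakinSubgraphPasses

  auto predictor = paddle::CreatePaddlePredictor(config);

  // One NCHW input. The subgraph is built with batch size fixed to 1 in
  // ConvertBlockToAnakinEngine(); AnakinEngine::Execute() reshapes the net
  // when a larger input arrives at run time.
  std::vector<float> data(1 * 3 * 224 * 224, 0.f);
  paddle::PaddleTensor in;
  in.name = "image";                     // hypothetical input name
  in.shape = {1, 3, 224, 224};
  in.dtype = paddle::PaddleDType::FLOAT32;
  in.data = paddle::PaddleBuf(data.data(), data.size() * sizeof(float));

  std::vector<paddle::PaddleTensor> out;
  predictor->Run({in}, &out);
  return 0;
}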