Commit 69d37f81
Authored Mar 20, 2019 by nhzlx

cherry-pick from feature/anakin-engine: refine anakin subgraph. #16157
support change input size

Parent: a1d200a5
Showing 27 changed files with 390 additions and 80 deletions (+390 -80)
paddle/fluid/framework/ir/graph_pattern_detector.cc                   +14  -2
paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc   +10  -3
paddle/fluid/inference/anakin/CMakeLists.txt                           +1  -1
paddle/fluid/inference/anakin/convert/batch_norm.cc                    +2  -0
paddle/fluid/inference/anakin/convert/density_prior_box.cc            +21 -10
paddle/fluid/inference/anakin/convert/op_converter.h                  +30  -0
paddle/fluid/inference/anakin/convert/pool2d.cc                        +1  -1
paddle/fluid/inference/anakin/convert/softmax.cc                       +1  -1
paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc            +2  -1
paddle/fluid/inference/anakin/convert/test_pool2d_op.cc               +41  -0
paddle/fluid/inference/anakin/convert/ut_helper.h                      +1  -1
paddle/fluid/inference/anakin/engine.cc                               +31 -17
paddle/fluid/inference/anakin/engine.h                                +43  -2
paddle/fluid/inference/anakin/test_anakin_engine.cc                    +3  -4
paddle/fluid/inference/analysis/argument.h                             +6  -0
paddle/fluid/inference/analysis/ir_pass_manager.cc                     +9  -0
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc     +45 -10
paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h       +8  -2
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc   +15  -3
paddle/fluid/inference/api/CMakeLists.txt                              +0  -5
paddle/fluid/inference/api/analysis_config.cc                         +25  -2
paddle/fluid/inference/api/analysis_predictor.cc                       +8  -1
paddle/fluid/inference/api/analysis_predictor.h                        +4  -1
paddle/fluid/inference/api/paddle_analysis_config.h                   +13  -0
paddle/fluid/inference/api/paddle_pass_builder.cc                     +12  -0
paddle/fluid/inference/api/paddle_pass_builder.h                       +3  -0
paddle/fluid/operators/anakin/anakin_engine_op.h                      +41 -13
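
Taken together, these changes wire a new Anakin sub-graph mode into the inference API. A minimal, hedged usage sketch of the new user-facing switch (not code from this commit; the model directory and GPU memory budget are hypothetical, the config calls follow the API on this branch):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v2");    // hypothetical model directory
  config.EnableUseGpu(100 /*MB*/, 0);   // Anakin sub-graph engine requires GPU
  config.EnableAnakinEngine();          // new API added by this commit
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}
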
paddle/fluid/framework/ir/graph_pattern_detector.cc

@@ -1527,6 +1527,16 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
           ->assert_is_op_output("box_coder")
           ->AsIntermediate();
+  auto transpose_before_nms =
+      pattern->NewNode(GetNodeName("transpose_before_nms"))
+          ->assert_is_op("transpose2");
+  auto transpose_before_nms_out =
+      pattern->NewNode(GetNodeName("transpose_before_nms_out"))
+          ->assert_is_op_output("transpose2")
+          ->assert_is_op_input("multiclass_nms", "Scores")
+          ->AsIntermediate();
   auto multiclass_nms_op =
       pattern->NewNode(GetNodeName("multiclass_nms"))
           ->assert_is_op("multiclass_nms")
           ->assert_op_has_n_inputs("multiclass_nms", 2);

@@ -1577,8 +1587,10 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
       {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
   box_coder_out->LinksFrom({box_coder_op});
-  multiclass_nms_op
-      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+  transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
+  transpose_before_nms_out->LinksFrom({transpose_before_nms});
+  multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
       .LinksTo({multiclass_nms_out});

   return multiclass_nms_out;

paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc

@@ -45,7 +45,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   input_nodes.push_back(gpd.mutable_pattern()
                             ->NewNode("x" + std::to_string(times + 1))
-                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->assert_is_op_input("transpose2")
                             ->AsInput());

   patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);

@@ -106,6 +106,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
   Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+  Node *transpose_before_nms =
+      subgraph.at(pattern.GetPDNode("transpose_before_nms"));
+  Node *transpose_before_nms_out =
+      subgraph.at(pattern.GetPDNode("transpose_before_nms_out"));
   Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
   Node *multiclass_nms_out = subgraph.at(pattern.GetPDNode("multiclass_nms_out"));

@@ -133,11 +138,11 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
         nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
   }
-  int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+  // int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
   framework::OpDesc concat1_desc;
   concat1_desc.SetType("concat");
   concat1_desc.SetInput("X", concat1_input_names);
-  concat1_desc.SetAttr("axis", axis);
+  concat1_desc.SetAttr("axis", 2);
   concat1_desc.SetOutput("Out", {concat_out1->Name()});

   auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);

@@ -184,6 +189,8 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
   delete_nodes.insert(concat_out2);
   delete_nodes.insert(box_coder_op);
   delete_nodes.insert(box_coder_out);
+  delete_nodes.insert(transpose_before_nms);
+  delete_nodes.insert(transpose_before_nms_out);
   delete_nodes.insert(multiclass_nms);

   new_add_concat_op->outputs.push_back(concat_out1);

paddle/fluid/inference/anakin/CMakeLists.txt

 cc_library(anakin_engine SRCS engine.cc)
-nv_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)

paddle/fluid/inference/anakin/convert/batch_norm.cc

@@ -43,11 +43,13 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
   // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));

   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
   engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
   engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));

   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,

paddle/fluid/inference/anakin/convert/density_prior_box.cc

@@ -27,8 +27,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

 void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
                                             const framework::Scope &scope,
                                             bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();

@@ -42,34 +42,45 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto fixed_ratios =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
   auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto &ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }

   // lack flip
-  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances =
       boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto &ele : variances) {
+    LOG(INFO) << ele;
+  }

   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
+  t_order.push_back("MAX");

   std::vector<float> temp_v = {};

   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
   engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
-  engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
-  engine_->AddOpAttr(op_name, "is_flip", false);
-  engine_->AddOpAttr(op_name, "is_clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
+  engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false));
+  engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false));
   engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "step_h", step_h);
   engine_->AddOpAttr(op_name, "step_w", step_w);
   engine_->AddOpAttr(op_name, "offset", offset);
-  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", order);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", t_order);
 }
 }  // namespace anakin

paddle/fluid/inference/anakin/convert/op_converter.h

@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "framework/core/types.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"

@@ -68,6 +69,35 @@ class AnakinOpConverter {
       ConvertOp(op, parameters, scope, engine);
     }
   }

+  // The scope here should be inited with the parameter vars.
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    engine->Freeze();
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      auto *var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      auto var_shape = var->GetShape();
+      PADDLE_ENFORCE(var_shape.size() == 4);
+      std::vector<int> input_shape;
+      for (int i = 0; i < var_shape.size(); i++) {
+        input_shape.push_back(var_shape[i]);
+      }
+      input_shape[0] = 1;
+      engine->SetInputShape(input, input_shape);
+    }
+    engine->Optimize();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}

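The new ConvertBlockToAnakinEngine entry point converts a block, freezes the graph, pins every non-parameter input to batch size 1, and initializes the net in one call. For context, the anakin_subgraph_pass.cc change later in this commit drives it roughly like this (condensed from that call site; block_desc, scope, input_names, param_set, output_mapping, and anakin_engine all come from the pass):

framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
    .ConvertBlockToAnakinEngine(
        &block_desc_temp, *scope,
        std::vector<std::string>(input_names.begin(), input_names.end()),
        param_set, output_mapping, anakin_engine);
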
paddle/fluid/inference/anakin/convert/pool2d.cc

@@ -55,7 +55,7 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
   if (pool_type == "max") {
     anakin_pool_type = "MAX";
   } else if (pool_type == "avg") {
-    anakin_pool_type = "AVG";
+    anakin_pool_type = "AVGEXC";
   } else {
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }

paddle/fluid/inference/anakin/convert/softmax.cc

@@ -33,7 +33,7 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();

   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "axis", 2);
 }
 }  // namespace anakin

paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc

@@ -52,8 +52,9 @@ TEST(batch_norm_op, test) {
   desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});

   float eps = 1e-5f;
+  bool is_test = true;
   desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", true);
+  desc.SetAttr("is_test", is_test);

   validator.SetOp(*desc.Proto());

paddle/fluid/inference/anakin/convert/test_pool2d_op.cc

@@ -64,11 +64,52 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }

+void test_pool2d2(bool global_pooling, bool ceil_mode,
+                  std::string pool_type = "max") {
+  auto *pool2d_converter =
+      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
+  ASSERT_TRUE(pool2d_converter);
+
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+  validator.DeclInputVar("pool2d_x", {1, 1, 17, 17});
+  validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d_x"});
+  desc.SetOutput("Out", {"pool2d_out"});
+
+  std::vector<int> ksize({3, 3});
+  std::vector<int> strides({1, 1});
+  std::vector<int> paddings({1, 1});
+  std::string pooling_t = pool_type;
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("ceil_mode", true);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(1);
+}
+
 TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
 TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }

 TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
 TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
+TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); }

 }  // namespace anakin
 }  // namespace inference

paddle/fluid/inference/anakin/convert/ut_helper.h

@@ -168,7 +168,7 @@ class AnakinConvertValidation {
       outputs.insert({output, tensor});
     }

-    engine_->Execute(inputs, outputs);
+    engine_->Execute(inputs, outputs, stream_);

     int i_output = 0;
     for (const auto &output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;

paddle/fluid/inference/anakin/engine.cc

@@ -33,9 +33,12 @@ namespace inference {
 namespace anakin {

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary)
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
+                                                            int device)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
-      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {}
+      net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
+  device_ = device;
+}

 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::~AnakinEngine() {}

@@ -63,33 +66,44 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::AddOp(
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     const std::map<std::string, framework::LoDTensor *> &inputs,
-    const std::map<std::string, framework::LoDTensor *> &outputs) {
+    const std::map<std::string, framework::LoDTensor *> &outputs,
+    cudaStream_t stream) {
   for (const auto &input : inputs) {
     auto *tensor = input.second;
    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
+    auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
-    auto anakin_input_shape = anakin_input->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(),
-                   "the fluid input size should be equal to anakin");
+    auto net_shape = anakin_input->shape();
+    if (tensor->numel() > net_shape.count()) {
+      graph_->Reshape(input.first, fluid_input_shape);
+      net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
+      net_->init(*graph_);
+      anakin_input = net_->get_in(input.first);
+    }
+
+    anakin_input->reshape(fluid_input_shape);
+    net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_input_shape);
-    anakin_input->copy_from(tmp_anakin_tensor);
+                                                       net_shape);
+    anakin_input->share_from(tmp_anakin_tensor);
   }
+  net_->prediction();

   for (const auto &output : outputs) {
+    platform::CUDAPlace gpu_place(device_);
     auto *tensor = output.second;
-    auto *data = tensor->data<float>();
-    auto shape = framework::vectorize2int(tensor->dims());
     auto *anakin_output = net_->get_out(output.first);
+    auto *anakin_data = anakin_output->data();
     auto anakin_output_shape = anakin_output->valid_shape();
-    PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(),
-                   "the fluid output size should be equal to anakin");
-    ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       anakin_output_shape);
-    anakin_output->share_from(tmp_anakin_tensor);
+    tensor->Resize(framework::make_ddim(anakin_output_shape));
+    auto *fluid_data = tensor->mutable_data<float>(gpu_place);
+    memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
+                 static_cast<void *>(anakin_data),
+                 tensor->numel() * sizeof(float), stream);
   }
-  net_->prediction();
+  cudaDeviceSynchronize();
 }

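This Execute rewrite is what the commit message's "support change input size" refers to: when an incoming tensor is larger than the net's current buffer, the engine reshapes the graph and rebuilds net_ on the fly. A hedged sketch of the resulting contract (not code from this commit; engine_, x_small, x_large, and y are assumed to be set up as in test_anakin_engine.cc):

// Reusing one engine across two input sizes.
cudaStream_t stream;
cudaStreamCreate(&stream);

std::map<std::string, framework::LoDTensor *> inputs = {{"x", &x_small}};
std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};
engine_->Execute(inputs, outputs, stream);  // runs at the original shape

inputs["x"] = &x_large;  // larger numel triggers graph_->Reshape and a net_ rebuild
engine_->Execute(inputs, outputs, stream);

cudaStreamDestroy(stream);
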
paddle/fluid/inference/anakin/engine.h

@@ -18,6 +18,7 @@
 #include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/engine.h"

@@ -26,8 +27,12 @@
 #include "framework/core/net/net.h"
 #include "framework/core/types.h"
 #include "framework/graph/graph.h"
+#include "framework/graph/graph_global_mem.h"
 #include "saber/saber_types.h"

+using anakin::Precision;
+using anakin::saber::NV;
+
 namespace anakin {

 template <typename, Precision, OpRunType>

@@ -50,7 +55,7 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;

  public:
-  explicit AnakinEngine(bool need_summary = false);
+  explicit AnakinEngine(bool need_summary = false, int device = 0);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);

@@ -69,14 +74,50 @@ class AnakinEngine {
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); }
+  // void LoadSerializedData(const std::string& data) { graph_->load_from_string(data); }
   void Execute(const std::map<std::string, framework::LoDTensor *> &inputs,
-               const std::map<std::string, framework::LoDTensor *> &outputs);
+               const std::map<std::string, framework::LoDTensor *> &outputs,
+               cudaStream_t stream);

  private:
+  int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
 };

+class AnakinEngineManager {
+  using AnakinNvEngineT = AnakinEngine<NV, Precision::FP32>;
+
+ public:
+  bool HasEngine(const std::string &name) const {
+    if (engines_.count(name) == 0) return false;
+    return engines_.at(name).get() != nullptr;
+  }
+  AnakinNvEngineT *Get(const std::string &name) const {
+    return engines_.at(name).get();
+  }
+
+  AnakinNvEngineT *Create(bool need_summary, int device,
+                          std::string engine_name) {
+    std::unique_lock<std::mutex> lk(mut_);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    engines_[engine_name].reset(p);
+    return p;
+  }
+
+  void DeleteALL() {
+    for (auto &item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<AnakinNvEngineT>> engines_;
+  std::mutex mut_;
+};
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

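The new AnakinEngineManager decouples engine construction (done once by the subgraph pass) from engine lookup (done at run time by AnakinEngineOp) through a shared string key. A sketch of that handshake, mirroring the two call sites added by this commit (engine_key is whatever GenerateAnakinEngineKey produced):

auto &manager =
    inference::Singleton<inference::anakin::AnakinEngineManager>::Global();

// Build time (anakin_subgraph_pass.cc): create an engine under the key.
auto *engine = manager.Create(true /*need_summary*/, 0 /*device*/, engine_key);

// Run time (anakin_engine_op.h): fetch the same engine by key.
if (manager.HasEngine(engine_key)) {
  auto *same_engine = manager.Get(engine_key);
}
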
paddle/fluid/inference/anakin/test_anakin_engine.cc

@@ -17,9 +17,6 @@ limitations under the License. */
 #include <map>

-#include "framework/core/net/net.h"
-#include "framework/graph/graph.h"
-#include "framework/graph/graph_global_mem.h"
 #include "paddle/fluid/inference/anakin/engine.h"

 using anakin::graph::GraphGlobalMem;

@@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) {
   auto *y_data = y.mutable_data<float>(platform::CUDAPlace());
   std::map<std::string, framework::LoDTensor *> outputs = {{"y", &y}};

-  engine_->Execute(inputs, outputs);
+  cudaStream_t stream;
+  engine_->Execute(inputs, outputs, stream);
   auto *y_data_gpu = y_data;
   float y_data_cpu[2];
   cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost);

paddle/fluid/inference/analysis/argument.h

@@ -23,6 +23,7 @@
 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -55,6 +56,7 @@ struct Argument {
   using unique_ptr_t = std::unique_ptr<void, std::function<void(void *)>>;
   using fusion_statis_t = std::unordered_map<std::string, int>;
+  using engine_opt_info_t = std::map<std::string, std::string>;

   bool Has(const std::string &key) const { return valid_fields_.count(key); }

@@ -107,12 +109,14 @@ struct Argument {
  private:                  \
   unique_ptr_t field__##_;

+  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
   DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
   DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
+  DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t);

   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);

@@ -146,6 +150,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);

   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
   DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);

paddle/fluid/inference/analysis/ir_pass_manager.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>

@@ -71,6 +72,11 @@ void IRPassManager::CreatePasses(Argument *argument,
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
+    }
     if (pass_name == "tensorrt_subgraph_pass") {

@@ -95,6 +101,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
       pass->Set("use_static_engine",
                 new bool(argument->tensorrt_use_static_engine()));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
     }
     pre_pass = pass_name;

paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc

@@ -21,6 +21,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
 #include "paddle/fluid/inference/anakin/op_teller.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h"

@@ -45,12 +46,20 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };

-  SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */);
+  SubGraphFuser fuser(graph.get(), teller, 0 /* min_subgraph_size */);
   fuser();

+  std::vector<std::string> graph_param_names =
+      ExtractAnakinParameters(graph->Nodes());
+  // those parameter already exist in anakin, and should not have another copy
+  // in fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateAnakinOp(node, graph.get());
+      CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
       framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);

@@ -64,13 +73,15 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     }
   }
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));

   return graph;
 }

-std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
-                                    const std::set<std::string> &engine_outputs) {
+std::string GenerateAnakinEngineKey(const std::set<std::string> &engine_inputs,
+                                    const std::set<std::string> &engine_outputs,
+                                    std::string id) {
   std::string engine_hash_key = "";
   for (auto name : engine_inputs) {
     engine_hash_key += name;

@@ -78,12 +89,15 @@ std::string GenerateAnakinEngineKey(
   for (auto name : engine_outputs) {
     engine_hash_key += name;
   }
+  engine_hash_key += id;
   auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
   return engine_key;
 }

-void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
-                                        Graph *graph) const {
+void AnakinSubgraphPass::CreateAnakinOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());

@@ -117,10 +131,16 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
+  std::copy(params.begin(), params.end(),
+            std::back_inserter(*repetitive_params));
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

@@ -231,10 +251,25 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node,
   SetAttr(op_desc->Proto(), "parameters",
           ExtractAnakinParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
-  auto engine_key =
-      GenerateAnakinEngineKey(input_names_with_id, output_names_with_id);
+  int predictor_id = Get<int>("predictor_id");
+  auto engine_key = GenerateAnakinEngineKey(
+      input_names_with_id, output_names_with_id, std::to_string(predictor_id));

   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+
+  auto *anakin_engine =
+      inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
+          true, Get<int>("gpu_device_id"), engine_key);
+
+  auto *scope = param_scope();
+  std::unordered_set<std::string> param_set(params.begin(), params.end());
+  framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+  inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
+      .ConvertBlockToAnakinEngine(
+          &block_desc_temp, *scope,
+          std::vector<std::string>(input_names.begin(), input_names.end()),
+          param_set, output_mapping, anakin_engine);
 }

 std::vector<std::string> ExtractAnakinParameters(

@@ -246,7 +281,7 @@ std::vector<std::string> ExtractAnakinParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));

paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h

@@ -15,8 +15,13 @@
 #pragma once

 #include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/anakin/engine.h"
+
+using anakin::Precision;
+using anakin::saber::NV;

 namespace paddle {
 namespace inference {
 namespace analysis {

@@ -27,8 +32,9 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;

  private:
-  void CreateAnakinOp(framework::ir::Node *x,
-                      framework::ir::Graph *graph) const;
+  void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                      const std::vector<std::string> &graph_params,
+                      std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };

paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include <algorithm>
+#include <map>
 #include <set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"

@@ -219,7 +220,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+  bool load_from_memory = Get<bool>("model_from_memory");
+  std::string trt_engine_serialized_data = "";
+  if (load_from_memory) {
+    std::map<std::string, std::string> engine_opt_info =
+        Get<std::map<std::string, std::string>>("engine_opt_info");
+    if (engine_opt_info.count(engine_key)) {
+      trt_engine_serialized_data = engine_opt_info[engine_key];
+    }
+  }
+  SetAttr(op_desc->Proto(), "engine_serialized_data",
+          trt_engine_serialized_data);

   std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
   if (enable_int8 && calibration_data.size() != 0) {

@@ -230,10 +241,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // When in int8 mode and calibration_mode, the program just produce the
   // calibration table data.
   bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
-  if (!calibration_mode && use_static_engine) {
+  if (!calibration_mode && use_static_engine &&
+      trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
-    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+    trt_engine_serialized_data = GetTrtEngineSerializedData(
         Get<std::string>("model_opt_cache_dir"), engine_key);

     if (trt_engine_serialized_data.empty()) {

paddle/fluid/inference/api/CMakeLists.txt

@@ -64,8 +64,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     anakin_target(inference_anakin_api)
     anakin_target(inference_anakin_api_shared)
 endif()
-
-if (WITH_ANAKIN_SUBGRAPH)
-inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid)
-inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid)
-inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid)
-endif()

paddle/fluid/inference/api/analysis_config.cc

@@ -21,6 +21,7 @@
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
+extern const std::vector<std::string> kAnakinSubgraphPasses;

 PassStrategy *AnalysisConfig::pass_builder() const {
   if (!pass_builder_.get()) {

@@ -230,6 +231,20 @@ void AnalysisConfig::Update() {
     }
   }

+  if (use_anakin_) {
+    PADDLE_ENFORCE(!use_tensorrt_,
+                   "Anakin sub-graph and TensorRT sub-graph are not allowed to "
+                   "run at the same time!");
+    PADDLE_ENFORCE(use_gpu_,
+                   "Anakin sub-graph engine need gpu, please use the EnableGpu API.");
+
+    pass_builder()->ClearPasses();
+    for (const auto &pass : kAnakinSubgraphPasses) {
+      pass_builder()->AppendPass(pass);
+    }
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }

@@ -266,7 +281,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << specify_input_name_;
   ss << cpu_math_library_num_threads_;
+  ss << use_anakin_;

   return ss.str();
 }

@@ -316,6 +331,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   Update();
 }

+void AnalysisConfig::SetEngineOptInfo(
+    std::map<std::string, std::string> engine_opt_info) {
+  engine_opt_info_ = engine_opt_info;
+}
+
 NativeConfig AnalysisConfig::ToNativeConfig() const {
   NativeConfig config;
   config.model_dir = model_dir_;

@@ -332,5 +352,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
+void AnalysisConfig::EnableAnakinEngine() {
+  use_anakin_ = true;
+  Update();
+}
 }  // namespace paddle

paddle/fluid/inference/api/analysis_predictor.cc

@@ -351,7 +351,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetStaticMemoryOptimForceUpdate(
       config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
+  argument_.SetEngineOptInfo(config_.engine_opt_info_);
   // Analyze inference_program
+  argument_.SetUseAnakin(config_.anakin_engine_enabled());
+  argument_.SetPredictorID(predictor_id_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {

@@ -375,6 +378,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }

+  if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    LOG(INFO) << "Anakin subgraph engine is enabled";
+  }
+
   if (config_.use_mkldnn_) {
     LOG(INFO) << "MKLDNN is enabled";
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);

@@ -404,7 +411,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create AnalysisConfig";
   if (config.use_gpu()) {
     // 1. GPU memory
-    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
+    PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
     PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                       config.gpu_device_id());
     std::vector<std::string> flags;

paddle/fluid/inference/api/analysis_predictor.h

@@ -45,7 +45,9 @@ using framework::NaiveExecutor;
 */
 class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+  }
   ~AnalysisPredictor();

   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,

@@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor {
   const size_t max_shape_collect_count_{1000};
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
+  int predictor_id_;

  private:
   // Some status here that help to determine the status inside the predictor.

paddle/fluid/inference/api/paddle_analysis_config.h

@@ -14,9 +14,11 @@
 #pragma once

 #include <cassert>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>

 /*! \file */

@@ -140,6 +142,14 @@ struct AnalysisConfig {
   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
+  /**
+   * \brief Turn on the usage of Anakin sub-graph engine.
+   */
+  void EnableAnakinEngine();
+
+  /** A boolean state indicating whether the Anakin sub-graph engine is used.
+   */
+  bool anakin_engine_enabled() const { return use_anakin_; }

   /** \brief Control whether to debug IR graph analysis phase.
    */

@@ -185,6 +195,7 @@ struct AnalysisConfig {
   /** A boolean state telling whether the model is set from the CPU memory.
    */
   bool model_from_memory() const { return model_from_memory_; }
+  void SetEngineOptInfo(std::map<std::string, std::string> engine_opt_info);

   /** Turn on memory optimize
    * NOTE still in development, will release latter.

@@ -258,6 +269,8 @@ struct AnalysisConfig {
   std::string serialized_info_cache_;

   mutable std::unique_ptr<PassStrategy> pass_builder_;
+  bool use_anakin_{false};
+  std::map<std::string, std::string> engine_opt_info_;
 };

 }  // namespace paddle

paddle/fluid/inference/api/paddle_pass_builder.cc

@@ -68,6 +68,17 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }

+// The following passes works for Anakin sub-graph engine.
+const std::vector<std::string> kAnakinSubgraphPasses({
+    "infer_clean_graph_pass",                   //
+    "simplify_anakin_detection_pattern_pass3",  //
+    "fc_fuse_pass",                             //
+    "conv_elementwise_add_fuse_pass",           //
+    "conv_bn_fuse_pass",                        //
+    "conv_elementwise_add_fuse_pass",           //
+    "anakin_subgraph_pass",
+});
+
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
       "infer_clean_graph_pass",  //

@@ -120,4 +131,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
   });
   use_gpu_ = false;
 }
+void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 }  // namespace paddle

paddle/fluid/inference/api/paddle_pass_builder.h

@@ -45,6 +45,7 @@ class PaddlePassBuilder {
   /** Delete all the passes that has type `pass_type`. */
   void DeletePass(const std::string &pass_type);
+  void ClearPasses();

   /** Append an analysis pass. */
   void AppendAnalysisPass(const std::string &pass);

@@ -142,4 +143,6 @@ class GpuPassStrategy : public PassStrategy {
   virtual ~GpuPassStrategy() = default;
 };

+extern const std::vector<std::string> kAnakinSubgraphPasses;
+
 }  // namespace paddle

paddle/fluid/operators/anakin/anakin_engine_op.h

@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA

 #include <fstream>
+#include <map>
 #include <memory>
 #include <string>

@@ -52,8 +53,9 @@ class AnakinEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<AnakinNvEngineT> anakin_engine_;
+  mutable AnakinNvEngineT *anakin_engine_;
   std::string engine_key_;
+  std::string engine_serialized_data_;

  public:
   AnakinEngineOp(const std::string &type,

@@ -67,6 +69,7 @@ class AnakinEngineOp : public framework::OperatorBase {
     for (const auto &param : params) {
       param_names_.insert(param);
     }
+    anakin_engine_ = nullptr;
   }

  protected:

@@ -77,12 +80,12 @@ class AnakinEngineOp : public framework::OperatorBase {
   void RunAnakin(const framework::Scope &scope,
                  const platform::Place &dev_place) const {
-    if (anakin_engine_.get() == nullptr) {
-      anakin_engine_.reset(new AnakinEngine<NV, Precision::FP32>(true));
-      Prepare(scope, dev_place, anakin_engine_.get());
-    }
+    auto *engine = GetEngine(scope, dev_place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();

-    auto *engine = anakin_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");

     std::vector<std::string> output_maps =

@@ -95,24 +98,48 @@ class AnakinEngineOp : public framework::OperatorBase {
       auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
       auto t_shape = framework::vectorize(t.dims());
+      auto *anakin_input = engine->Net()->get_in(x);
+      auto net_shape = anakin_input->shape();
+      size_t anakin_net_input_size = net_shape.count() * sizeof(float);
+      size_t fluid_input_size = t.memory_size();
+
+      if (fluid_input_size < anakin_net_input_size) {
+        framework::LoDTensor temp_t;
+        auto t_dims = t.dims();
+        temp_t.Resize(t_dims);
+        TensorCopySync(t, dev_place, &temp_t);
+        t.Resize(framework::make_ddim(net_shape));
+        t.mutable_data<float>(dev_place);
+        TensorCopySync(temp_t, dev_place, &t);
+      }
       inputs.insert({x, &t});
     }

     std::map<std::string, framework::LoDTensor *> outputs;
     int output_index = 0;
     for (const auto &y : Outputs("Ys")) {
-      std::vector<int> ddim =
-          engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // std::vector<int> ddim =
+      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
+      // we need get the output anakin output shape.
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
-      fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
+      // fluid_t->Resize(framework::make_ddim(ddim));
+      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
-    engine->Execute(inputs, outputs);
+    engine->Execute(inputs, outputs, stream);
   }

+  AnakinNvEngineT *GetEngine(const framework::Scope &scope,
+                             const platform::Place &dev_place) const {
+    if (anakin_engine_ == nullptr) {
+      anakin_engine_ =
+          inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
+              .Get(engine_key_);
+    }
+    return anakin_engine_;
+  }
+
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,

@@ -128,8 +155,6 @@ class AnakinEngineOp : public framework::OperatorBase {
     inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
         .ConvertBlock(block_desc, param_names_, scope, engine);
     engine->Freeze();
-    engine->Optimize();
     for (const auto &x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       auto &t =

@@ -142,6 +167,9 @@ class AnakinEngineOp : public framework::OperatorBase {
       }
       engine->SetInputShape(x, t_shape);
     }
+    engine->Optimize();
+    engine->InitGraph();
   }
 };
