Unverified · Commit 638965c5 authored by denglin-github, committed by GitHub

Update DlNNE engine (#45027)

* add config param for enable_dlnne and support calibration mode
* remove useless file
* refine code and add annotation
* refine code of warning tips
Parent d7d9807e
@@ -250,6 +250,22 @@ struct Argument {
  DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
  DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
  DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_use_static_batch, DlnneUseStaticBatch, bool);
DECL_ARGUMENT_FIELD(dlnne_weight_share_mode,
DlnneWeightShareMode,
std::string);
DECL_ARGUMENT_FIELD(dlnne_disable_nodes_by_outputs,
DlnneDisableNodesByOutputs,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(dlnne_use_calib_mode, DlnneUseCalibMode, bool);
DECL_ARGUMENT_FIELD(dlnne_precision_mode,
DlnnePrecisionMode,
AnalysisConfig::Precision);
using dlnne_input_shape_type = std::map<std::string, std::vector<int64_t>>;
DECL_ARGUMENT_FIELD(dlnne_input_shape_dict,
DlnneInputShapeDict,
dlnne_input_shape_type);
  DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(lite_passes_filter,
......
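Each DECL_ARGUMENT_FIELD(field, Field, type) line above declares a typed slot
on Argument; schematically (a sketch, not the macro's literal expansion), it
yields the accessor pair used elsewhere in this diff:

  std::string &dlnne_weight_share_mode();             // read by the pass
  void SetDlnneWeightShareMode(const std::string &);  // set by the predictor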
@@ -209,8 +209,23 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("disable_trt_plugin_fp16",
                new bool(argument->disable_trt_plugin_fp16()));
    } else if (pass_name == "dlnne_subgraph_pass") {
auto precision_mode = argument->dlnne_precision_mode();
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
new int(argument->dlnne_min_subgraph_size())); new int(argument->dlnne_min_subgraph_size()));
pass->Set("max_batch_size", new int(argument->dlnne_max_batch_size()));
pass->Set("use_static_batch",
new bool(argument->dlnne_use_static_batch()));
pass->Set("weight_share_mode",
new std::string(argument->dlnne_weight_share_mode()));
pass->Set("disable_nodes_by_outputs",
new std::unordered_set<std::string>(
argument->dlnne_disable_nodes_by_outputs()));
pass->Set("use_calib_mode", new bool(argument->dlnne_use_calib_mode()));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("input_shape_dict",
new std::map<std::string, std::vector<int64_t>>(
argument->dlnne_input_shape_dict()));
pass->Set("program", pass->Set("program",
new framework::ProgramDesc *(&argument->main_program())); new framework::ProgramDesc *(&argument->main_program()));
} }
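      // Note: the dlnne_subgraph_pass reads these attributes back with the
      // Pass Get<T> accessors, e.g. Get<bool>("use_static_batch") and
      // Get<AnalysisConfig::Precision>("precision_mode"), as the pass code
      // later in this diff shows.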
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace inference {
int RegisterPyFunc(const std::string& name, void* pfn);
} // namespace inference
} // namespace paddle
@@ -11,87 +11,339 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h"

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <set>

#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace inference {
namespace analysis {

using framework::ir::Node;

void analysis::DlnneSubgraphPass::InferShapeForDlnneMainGraph() const {
  // copied from paddle2onnx
  static std::unordered_set<std::string> OP_WITHOUT_KERNEL_SET = {
"feed",
"fetch",
"recurrent",
"go",
"rnn_memory_helper_grad",
"conditional_block",
"while",
"send",
"recv",
"listen_and_serv",
"fl_listen_and_serv",
"ncclInit",
"select",
"checkpoint_notify",
"gen_bkcl_id",
"c_gen_bkcl_id",
"gen_nccl_id",
"c_gen_nccl_id",
"c_comm_init",
"c_sync_calc_stream",
"c_sync_comm_stream",
"queue_generator",
"dequeue",
"enqueue",
"heter_listen_and_serv",
"c_wait_comm",
"c_wait_compute"};
std::string bilinear_interp_v2_type = "bilinear_interp_v2";
auto input_dict =
Get<std::map<std::string, std::vector<int64_t>>>("input_shape_dict");
framework::ProgramDesc *global_program =
Get<framework::ProgramDesc *>("program");
auto block = global_program->MutableBlock(framework::kRootBlockIndex);
for (auto kv : input_dict) {
auto var = block->FindVar(kv.first);
if (var != nullptr) {
var->SetShape(kv.second);
} else {
VLOG(4) << "input_name:" << kv.first << " not find in all input vars";
}
  }

  std::vector<framework::OpDesc *> all_ops = block->AllOps();

  for (size_t i = 0; i < block->OpSize(); i++) {
    // The output shape of bilinear_interp_v2 cannot be inferred from the
    // input shape alone; it also depends on the value of the input tensor,
    // so after OpDesc->InferShape runs, the output shape of
    // bilinear_interp_v2 is still dynamic. Here we try to infer it from the
    // shape->slice->bilinear_interp_v2 pattern.
    if (block->Op(i)->Type() == bilinear_interp_v2_type) {
framework::VariableNameMap input_name_map = block->Op(i)->Inputs();
std::vector<std::string> input_name_vec = input_name_map["OutSize"];
PADDLE_ENFORCE_EQ(
input_name_vec.size(),
1,
platform::errors::PreconditionNotMet(
"The 'bilinear_interp_v2 op' input 'OutSize' size must be 1 "));
// find shape->slice->bilinear_interp_v2 pattern
int start_id = 0;
int end_id = 0;
std::vector<std::string> slice_input_name_vec;
for (auto *i_op : all_ops) {
if (i_op->HasOutput("Out")) {
auto it = find(i_op->Output("Out").begin(),
i_op->Output("Out").end(),
input_name_vec[0]);
if (it != i_op->Output("Out").end()) {
slice_input_name_vec = i_op->Input("Input");
PADDLE_ENFORCE_EQ(
slice_input_name_vec.size(),
1,
platform::errors::PreconditionNotMet(
"The 'slice op' input 'Input' size must be 1 "));
auto start_vec = i_op->GetAttrIfExists<std::vector<int>>("starts");
start_id = start_vec[0];
auto end_vec = i_op->GetAttrIfExists<std::vector<int>>("ends");
end_id = end_vec[0];
break;
}
}
}
      std::vector<std::string> shape_input_name_vec;
for (auto *i_op : all_ops) {
if (i_op->HasOutput("Out")) {
auto it = find(i_op->Output("Out").begin(),
i_op->Output("Out").end(),
slice_input_name_vec[0]);
if (it != i_op->Output("Out").end()) {
shape_input_name_vec = i_op->Input("Input");
PADDLE_ENFORCE_EQ(
                shape_input_name_vec.size(),
1,
platform::errors::PreconditionNotMet(
"The 'shape op' input 'Input' size must be 1 "));
break;
}
}
}
auto target_var = block->FindVarRecursive(shape_input_name_vec[0]);
std::vector<int64_t> target_shape = target_var->GetShape();
size_t target_shape_len = target_shape.size();
if (start_id < 0) {
start_id = target_shape_len + start_id;
} else if (start_id > static_cast<int>(target_shape_len)) {
start_id = target_shape_len;
}
      if (end_id < 0) {
        end_id = target_shape_len + end_id;
} else if (end_id > static_cast<int>(target_shape_len)) {
end_id = target_shape_len;
}
      if (start_id < end_id) {
std::vector<int64_t> OutSize_dims(target_shape.begin() + start_id,
target_shape.begin() + end_id);
framework::VariableNameMap output_name_map = block->Op(i)->Outputs();
std::vector<std::string> output_name_vec = output_name_map["Out"];
auto out_var = block->FindVarRecursive(output_name_vec[0]);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"bilinear_interp_v2 op's output %s is not found in the block.",
output_name_vec[0]));
std::vector<int64_t> ori_shape = out_var->GetShape();
std::string data_layout =
block->Op(i)->GetAttrIfExists<std::string>("data_layout");
size_t start_dim = 0;
size_t end_dim = 0;
if (data_layout == "NCHW") {
start_dim = 2;
end_dim = ori_shape.size();
} else {
start_dim = 1;
end_dim = ori_shape.size() - 1;
}
for (size_t i_dim = start_dim; i_dim < end_dim; i_dim++) {
ori_shape[i_dim] = OutSize_dims[i_dim - start_dim];
}
        VLOG(4) << "Set bilinear_interp_v2 shape: " << ori_shape[2] << ", "
                << ori_shape[3];
out_var->SetShape(ori_shape);
}
} else {
if (OP_WITHOUT_KERNEL_SET.find(block->Op(i)->Type()) ==
OP_WITHOUT_KERNEL_SET.end())
block->Op(i)->InferShape(*block);
}
}
}
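// Schematically, the pattern handled above is (names illustrative):
//   target_var -> shape -> slice(starts=[s], ends=[e]) -> OutSize
//   OutSize -> bilinear_interp_v2 -> Out
// so Out's spatial dims are filled in from target_var's shape[s:e].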
bool analysis::DlnneSubgraphPass::IsDynamicOp(std::string var_name,
bool use_static_batch) const {
framework::ProgramDesc *global_program =
Get<framework::ProgramDesc *>("program");
auto block = global_program->MutableBlock(framework::kRootBlockIndex);
auto var = block->FindVar(var_name);
if (var != nullptr) {
std::vector<int64_t> var_shape = var->GetShape();
size_t start_idx = use_static_batch ? 1 : 0;
for (; start_idx < var_shape.size(); start_idx++) {
if (var_shape[start_idx] < 1) {
return false;
}
}
}
return true;
}
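// Note: IsDynamicOp returns true when every checked dim of var_name is
// static (>= 1); with use_static_batch == true the batch dim is skipped.
// E.g. a var of shape {-1, 3, 224, 224} passes under use_static_batch ==
// true, while {-1, 3, -1, -1} does not (illustrative shapes).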
void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const {
framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph);
InferShapeForDlnneMainGraph();
  static std::unordered_set<std::string> teller_set{
"nearest_interp_v2",
"mul", "mul",
"matmul", "matmul",
"matmul_v2",
"flatten_contiguous_range",
"conv2d", "conv2d",
"pool2d", "pool2d",
"relu", "relu",
"softmax", "softmax",
"sigmoid", "sigmoid",
"softplus",
"hard_swish", "hard_swish",
"hard_sigmoid",
"depthwise_conv2d", "depthwise_conv2d",
"batch_norm", "batch_norm",
"exp",
"concat", "concat",
"clip",
"cast",
"tanh", "tanh",
"pad", "pad",
"elementwise_add", "elementwise_add",
"elementwise_mul", "elementwise_mul",
"elementwise_sub",
"elementwise_div",
"elementwise_pow",
"dropout", "dropout",
// "deformable_conv",
"prelu", "prelu",
"conv2d_transpose", "conv2d_transpose",
"leaky_relu", "leaky_relu",
// "fc", "log",
"fc",
"shuffle_channel", "shuffle_channel",
"swish", "swish",
"split", "split",
// "instance_norm", "instance_norm",
"gelu", "gelu",
// "layer_norm", "layer_norm",
// "scale", "scale",
// "stack", "slice",
"stack",
"relu6", "relu6",
"reshape2", "reshape2",
"transpose2", "transpose2",
"concat", "concat",
"slice", "slice",
"fill_constant",
"fill_constant_batch_size_like",
"shape",
"unsqueeze2",
"pad3d",
"squeeze2",
"bilinear_interp_v2"
// "yolo_box"
  };

  // ops whose outputs need special processing
static std::unordered_set<std::string> special_output_op_set{
"transpose2",
"fill_constant_batch_size_like",
"flatten_contiguous_range",
"batch_norm",
"unsqueeze2",
};
  // ops that can still be fused by dlnne_engine_op even when their shape is
  // dynamic
static std::unordered_set<std::string> dynamic_pass_op_set{
"reshape2",
};
auto disable_nodes_by_outputs =
Get<std::unordered_set<std::string>>("disable_nodes_by_outputs");
bool use_static_batch = Get<bool>("use_static_batch");
  auto teller = [&](const framework::ir::Node *node) {
    if (!node->IsOp() || !node->Op()) {
      return false;
}
if (teller_set.find(node->Op()->Type()) == teller_set.end()) {
VLOG(4) << "don't support op:" << node->Op()->Type();
return false;
} else {
bool flag = true;
// check node output
if (dynamic_pass_op_set.find(node->Op()->Type()) !=
dynamic_pass_op_set.end()) {
flag = true;
} else if (special_output_op_set.find(node->Op()->Type()) ==
special_output_op_set.end()) {
for (auto *x : node->outputs) {
std::string var_name = x->Var()->Name();
flag = IsDynamicOp(var_name, use_static_batch);
if (!flag) break;
}
} else {
std::string var_name = node->outputs[0]->Var()->Name();
flag = IsDynamicOp(var_name, use_static_batch);
}
// check node input
if (flag) {
for (auto *x : node->inputs) {
std::string var_name = x->Var()->Name();
flag = IsDynamicOp(var_name, use_static_batch);
if (!flag) break;
}
}
if (!flag) {
VLOG(4) << "don't support dynamic shape:" << node->Op()->Type();
}
bool flag2 = true;
for (auto *x : node->outputs) {
if (disable_nodes_by_outputs.find(x->Name()) !=
disable_nodes_by_outputs.end()) {
flag2 = false;
}
}
if (!flag2) {
VLOG(4) << "user don't use " << node->Name() << "...";
}
return flag && flag2;
}
}; };
  framework::ir::SubGraphFuser fuser(
@@ -153,6 +405,45 @@ std::string replace_name(std::string name,
  return r_name;
}
auto fix_batch_as_one(
std::unordered_map<std::string, framework::VarDesc *> *name_var_desc,
std::set<std::string> *valid_input_names,
bool use_static_batch = false) {
std::unordered_map<std::string, std::vector<int64_t>> name_var_shape;
if (use_static_batch) {
std::set<std::string> names;
names.insert(valid_input_names->begin(), valid_input_names->end());
for (auto name : names) {
if (name_var_desc->find(name) != name_var_desc->end()) {
auto var_desc = (*name_var_desc)[name];
auto sp = var_desc->GetShape();
if (sp[0] == -1) {
sp[0] = 1;
name_var_shape[name] = sp;
std::stringstream sp_str;
copy(sp.begin(),
sp.end(),
std::ostream_iterator<int64_t>(sp_str, ","));
        LOG(INFO)
            << "Warning: fixing batch of var:" << name << ", shape is ["
            << sp_str.str()
            << "]; we assume the first dim of the subgraph's inputs/outputs "
            << "is batch. If the first dim does not mean batch, "
            << "we suggest using a fixed-shape model.";
}
}
}
}
return name_var_shape;
}
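// E.g. under use_static_batch == true, an input var of shape
// {-1, 3, 224, 224} is rewritten to {1, 3, 224, 224} and recorded in the
// returned table (illustrative shape).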
/*
 * There are two ProgramDescs in this function: global_program is used to
 * generate the Dlnne op, and dump_program is used to dump the subgraph to an
 * ONNX subgraph, which is loaded by the Dlnne op.
 */
void DlnneSubgraphPass::CreateDlnneOp(
    framework::ir::Node *node,
    framework::ir::Graph *graph,
@@ -172,22 +463,58 @@ void DlnneSubgraphPass::CreateDlnneOp(
  block_desc.Proto()->set_idx(0);
  LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes";
  // for debug
  framework::ProgramDesc *global_program =
      Get<framework::ProgramDesc *>("program");
  const framework::BlockDesc &main_block =
      global_program->Block(framework::kRootBlockIndex);

  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
  std::vector<std::string> params;
  std::set<std::string> valid_input_names;
  // If we delete the fluid copy of params shared by more than one op, there
  // will be a problem, so we filter them out.
// The node->inputs contains input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
if (std::find(graph_params.begin(), graph_params.end(), x->Name()) ==
graph_params.end()) {
valid_input_names.insert(x->Name());
}
}
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
std::vector<int> origin_output_dims;
std::set<std::string> valid_output_names;
for (auto *x : node->outputs) {
origin_output_dims.push_back(x->Var()->GetShape().size());
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::find(graph_params.begin(), graph_params.end(), x->Name()) ==
graph_params.end()) {
valid_output_names.insert(x->Name());
}
}
auto *child_block = global_program->AppendBlock(main_block);
framework::ProgramDesc dump_program;
auto *export_block = dump_program.MutableBlock(framework::kRootBlockIndex);
std::unordered_map<std::string, framework::VarDesc *> name_var_desc;
  for (auto *node : subgraph) {
    auto *op = block_desc.AppendOp();
    *op->Proto() = *node->Op()->Proto();
    auto *child_op = child_block->AppendOp();
    *child_op->Proto() = *node->Op()->Proto();
    // generate an op from the node and append it to the export block
    {
      auto *export_op = export_block->AppendOp();
      framework::OpDesc op_desc;
      op_desc.CopyFrom(*node->Op());
@@ -204,77 +531,69 @@ void DlnneSubgraphPass::CreateDlnneOp(
          op_desc.Rename(argument_name, replace_name(argument_name, "/", "."));
        }
      }
      *export_op->Proto() = *op_desc.Proto();

      for (auto *x : node->inputs) {
        if (x->IsVar()) {
          auto var_desc_infer = main_block.FindVarRecursive(x->Name());
          if (var_desc_infer != nullptr) {
            name_var_desc[x->Name()] = var_desc_infer;
          } else {
            name_var_desc[x->Name()] = x->Var();
          }
        }
      }

      for (auto *x : node->outputs) {
        if (x->IsVar()) {
          auto var_desc_infer = main_block.FindVarRecursive(x->Name());
          if (var_desc_infer != nullptr) {
            name_var_desc[x->Name()] = var_desc_infer;
          } else {
            name_var_desc[x->Name()] = x->Var();
          }
        }
      }
    }
  }

  // start fixing batch as one
  bool use_static_batch = Get<bool>("use_static_batch");
  auto name_shape_table =
      fix_batch_as_one(&name_var_desc, &valid_input_names, use_static_batch);
for (const auto &name_shape : name_shape_table) {
VLOG(4) << "Fix batch shape as one var name: " << name_shape.first;
} }
  // Then, we will use the input_names_with_id and output_names_with_id to
  // generate the engine key.
  // So, we use set instead of unordered_set here to ensure that the engine
  // key is unique.
  auto engine_key = GenerateEngineKey(
      input_names_with_id, output_names_with_id, std::to_string(0));

  auto precision_mode = Get<AnalysisConfig::Precision>("precision_mode");
  bool enable_int8 = false;
  if (precision_mode == AnalysisConfig::Precision::kInt8) {
    enable_int8 = true;
  }
  auto use_calib_mode = Get<bool>("use_calib_mode");

  std::string calibration_data_path =
      "./calibration/dlnne_calib_" + engine_key;
  bool calibration_mode = false;
  if (enable_int8 && use_calib_mode && !PathExists(calibration_data_path)) {
    calibration_mode = true;
    MKDIR("./calibration");
    MKDIR(calibration_data_path.c_str());
  }
  VLOG(4) << "calibration_mode: " << calibration_mode;

  std::stringstream ss;
  ss << "engine_key:" << engine_key << " outputs:[";
  for (auto name : valid_output_names) {
    ss << name << ",";
  }
  ss << "]";
  VLOG(4) << ss.str();
  // Set attrs
  op_desc->SetType("dlnne_engine");
@@ -285,70 +604,98 @@ void DlnneSubgraphPass::CreateDlnneOp(
  op_desc->SetOutput("Ys",
                     std::vector<std::string>(valid_output_names.begin(),
                                              valid_output_names.end()));
op_desc->SetBlockAttr("sub_block", child_block);
op_desc->SetAttr("parameters", params); op_desc->SetAttr("parameters", params);
auto engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0));
op_desc->SetAttr("engine_key", engine_key); op_desc->SetAttr("engine_key", engine_key);
auto *scope = param_scope(); op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
op_desc->SetAttr("use_static_batch", Get<bool>("use_static_batch"));
{ op_desc->SetAttr("weight_share_mode", Get<std::string>("weight_share_mode"));
std::set<std::string> input_names; op_desc->SetAttr("enable_int8", enable_int8);
op_desc->SetAttr("use_calib_mode", use_calib_mode);
op_desc->SetAttr("calibration_mode", calibration_mode);
op_desc->SetAttr("calibration_data_path", calibration_data_path);
std::string subgraph_root_path = "./dump/" + engine_key;
op_desc->SetAttr("subgraph_root_path", subgraph_root_path);
std::stringstream ins_stream;
for (auto name : valid_input_names) {
ins_stream << "," << name;
}
op_desc->SetAttr("valid_input_names", ins_stream.str().substr(1));
for (auto name : name_var_input_nodes) { std::stringstream outs_stream;
if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { for (auto name : valid_output_names) {
input_names.insert(name); outs_stream << "," << name;
} }
} op_desc->SetAttr("valid_output_names", outs_stream.str().substr(1));
auto *scope = param_scope();
{
    // add feed to subgraph:
    int input_idx = 0;
    for (auto input_name : valid_input_names) {
      auto *feed1 = export_block->AppendOp();
      feed1->SetType("feed");
      feed1->SetInput("X", {"feed"});
      feed1->SetOutput("Out", {input_name});
      feed1->SetAttr("col", input_idx);
      input_idx++;
    }
    // add fetch to subgraph:
    int output_idx = 0;
    for (auto output_name : valid_output_names) {
      auto *fetch1 = export_block->AppendOp();
      fetch1->SetType("fetch");
      fetch1->SetInput("X", {output_name});
      fetch1->SetOutput("Out", {"out"});
      fetch1->SetAttr("col", output_idx);
      output_idx++;
    }

    VLOG(4) << "name_var_desc size:" << name_var_desc.size();
    for (auto &kv : name_var_desc) {
      auto *new_add_var1 = export_block->Proto()->add_vars();
      paddle::framework::VarDesc copy_var_desc(*(kv.second->Proto()));
      if (name_shape_table.find(kv.first) != name_shape_table.end()) {
        copy_var_desc.SetShape(name_shape_table[kv.first]);
      }
      *new_add_var1 = *(copy_var_desc.Proto());

      auto *variable_tmp1 = scope->FindVar(kv.first);
      if (variable_tmp1 != nullptr) {
        *new_add_var1->mutable_name() = replace_name(kv.first, "/", ".");
        new_add_var1->set_persistable(true);
      } else {
        new_add_var1->set_persistable(false);
      }
    }
std::string model_str;
dump_program.Proto()->SerializeToString(&model_str);
op_desc->SetAttr("subgraph", model_str);
op_desc->Flush();
if (calibration_mode) {
return;
}
MKDIR("./dump");
MKDIR(subgraph_root_path.c_str());
std::ofstream m_stream;
m_stream.open(subgraph_root_path + "/__model__", std::ios::out);
    for (auto param_name : params) {
      auto *var = scope->FindVar(param_name);
      if (var != nullptr) {
        auto *var_t = var->GetMutable<framework::LoDTensor>();
        std::ofstream p_stream;
        p_stream.open(
            subgraph_root_path + "/" + replace_name(param_name, "/", "."),
            std::ios::out);
        platform::DeviceContextPool &pool =
            platform::DeviceContextPool::Instance();
        auto &dev_ctx = *pool.Get(var_t->place());
@@ -357,17 +704,8 @@ void DlnneSubgraphPass::CreateDlnneOp(
      }
    }

    m_stream << model_str;
    m_stream.close();
  }
}
......
@@ -34,9 +34,6 @@ class Node;
namespace paddle {
namespace inference {
namespace analysis {

class DlnneSubgraphPass : public framework::ir::FusePassBase {
@@ -44,6 +41,8 @@ class DlnneSubgraphPass : public framework::ir::FusePassBase {
  void ApplyImpl(framework::ir::Graph *graph) const override;

 private:
  void InferShapeForDlnneMainGraph() const;
  bool IsDynamicOp(std::string var_name, bool use_static_batch) const;
  void CleanIntermediateOutputs(framework::ir::Node *node);
  void CreateDlnneOp(framework::ir::Node *x,
                     framework::ir::Graph *graph,
......
@@ -283,6 +283,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  // Dlnne related
  CP_MEMBER(use_dlnne_);
  CP_MEMBER(dlnne_min_subgraph_size_);
CP_MEMBER(dlnne_max_batchsize_);
CP_MEMBER(dlnne_use_static_batch_);
CP_MEMBER(dlnne_weight_share_mode_);
CP_MEMBER(dlnne_use_calib_mode_);
CP_MEMBER(dlnne_precision_mode_);
CP_MEMBER(dlnne_disable_nodes_by_outputs_);
CP_MEMBER(dlnne_input_shape_dict_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
@@ -544,9 +551,24 @@ void AnalysisConfig::EnableTensorRtEngine(
#endif
}

void AnalysisConfig::EnableDlnne(
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
    std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
AnalysisConfig::Precision precision_mode) {
  use_dlnne_ = true;
  dlnne_min_subgraph_size_ = min_subgraph_size;
dlnne_max_batchsize_ = max_batch_size;
dlnne_use_static_batch_ = use_static_batch;
dlnne_weight_share_mode_ = weight_share_mode;
  dlnne_disable_nodes_by_outputs_ = disable_nodes_by_outputs;
dlnne_input_shape_dict_ = dlnne_input_shape_dict;
dlnne_use_calib_mode_ = use_calib_mode;
dlnne_precision_mode_ = precision_mode;
  Update();
}
......
@@ -1107,6 +1107,14 @@ void AnalysisPredictor::PrepareArgument() {
    LOG(INFO) << "Dlnne subgraph is enabled";
    argument_.SetUseDlnne(true);
    argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
argument_.SetDlnneMaxBatchSize(config_.dlnne_max_batchsize_);
argument_.SetDlnneUseStaticBatch(config_.dlnne_use_static_batch_);
argument_.SetDlnneWeightShareMode(config_.dlnne_weight_share_mode_);
argument_.SetDlnneDisableNodesByOutputs(
config_.dlnne_disable_nodes_by_outputs_);
argument_.SetDlnneInputShapeDict(config_.dlnne_input_shape_dict_);
argument_.SetDlnneUseCalibMode(config_.dlnne_use_calib_mode_);
argument_.SetDlnnePrecisionMode(config_.dlnne_precision_mode_);
  }

  if (config_.lite_engine_enabled()) {
......
@@ -663,7 +663,15 @@ struct PD_INFER_DECL AnalysisConfig {
  void EnableTensorRtInspector();
  bool tensorrt_inspector_enabled() { return trt_use_inspector_; }

  void EnableDlnne(
int min_subgraph_size = 3,
int max_batch_size = 1,
bool use_static_batch = false,
std::string weight_share_mode = "0",
std::unordered_set<std::string> disable_nodes_by_outputs = {},
std::map<std::string, std::vector<int64_t>> input_dict = {},
bool use_calib_mode = false,
AnalysisConfig::Precision precision_mode = Precision::kFloat32);
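  ///
  /// \brief Sketch of enabling DLNNE with the new options (values are
  /// illustrative only):
  ///   config.EnableDlnne(3, 1, false, "0", {}, {{"x", {1, 3, 224, 224}}},
  ///                      false, Precision::kFloat32);
  ///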
  bool dlnne_enabled() const { return use_dlnne_; }

  ///
@@ -1006,6 +1014,13 @@ struct PD_INFER_DECL AnalysisConfig {
  // dlnne related.
  bool use_dlnne_{false};
  int dlnne_min_subgraph_size_{3};
int dlnne_max_batchsize_{1};
std::unordered_set<std::string> dlnne_disable_nodes_by_outputs_;
bool dlnne_use_static_batch_{true};
std::string dlnne_weight_share_mode_;
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict_{};
bool dlnne_use_calib_mode_{false};
Precision dlnne_precision_mode_{Precision::kFloat32};
  // memory reuse related.
  bool enable_memory_optim_{false};
......
@@ -269,12 +269,28 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
  return config->config.tensorrt_engine_enabled();
}

void PD_EnableDlnne(
    PD_AnalysisConfig* config,
    int min_subgraph_size,
    int max_batch_size,
    bool use_static_batch,
    std::string weight_share_mode,
    std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
AnalysisConfig::Precision precision_mode) {
PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableDlnne(min_subgraph_size,
max_batch_size,
use_static_batch,
weight_share_mode,
                            disable_nodes_by_outputs,
dlnne_input_shape_dict,
use_calib_mode,
precision_mode);
}

bool PD_DlnneEnabled(const PD_AnalysisConfig* config) {
......
@@ -9,21 +9,19 @@ endforeach()

# add nne
find_path(
  DLNNE_INCLUDE_DIR dlnne.h
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/include/dlnne
  NO_DEFAULT_PATH)

find_library(
  DLNNE_LIB libdlnne.so
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
  NO_DEFAULT_PATH)

find_path(CUDA_INCLUDE_DIR cuda.h $ENV{DL_SDK_DIR}/include)
find_library(
  CURT_LIB libcurt.so
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
  NO_DEFAULT_PATH)

message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR})
......
@@ -28,6 +28,105 @@ void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) {
  cudaDeviceSynchronize();
}
std::string ConvertType(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return "float32";
}
case paddle::experimental::DataType::INT64: {
return "int64";
}
case paddle::experimental::DataType::INT32: {
return "int32";
}
case paddle::experimental::DataType::FLOAT16: {
return "float16";
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
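// E.g. ConvertType(paddle::experimental::DataType::FLOAT32) returns
// "float32".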
int GetDataByte(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return 4;
}
case paddle::experimental::DataType::INT64: {
return 8;
}
case paddle::experimental::DataType::INT32: {
return 4;
}
case paddle::experimental::DataType::FLOAT16: {
return 2;
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
std::string GenerateRandomKey() {
std::string str(
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::shuffle(str.begin(), str.end(), generator);
return str.substr(0, 32);
}
void ConvertPaddle2Onnx(std::string onnx_file_name,
std::string subgraph_root_path) {
if (!FileExists(onnx_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "paddle2onnx --model_dir " << subgraph_root_path
<< " --save_file " << onnx_file_name << " --opset_version 11";
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert paddle to onnx failed"));
}
}
void QuantizeOnnx(std::string onnx_file_name,
std::string rlym_file_name,
std::string quantized_rlym_file_name,
std::string dataset_path,
std::string dataset_plugin_path) {
if (!FileExists(rlym_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "python -m dl convert " << onnx_file_name
<< " --output-model " << rlym_file_name;
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert onnx to rlym failed"));
}
if (!FileExists(quantized_rlym_file_name.c_str())) {
std::stringstream quantize_cmd;
quantize_cmd << "python -m dl quantize "
<< "--dataset " << dataset_path << " --plugin "
<< dataset_plugin_path << " " << rlym_file_name;
LOG(INFO) << quantize_cmd.str();
int quantize_flag = system(quantize_cmd.str().c_str());
PADDLE_ENFORCE_EQ(quantize_flag,
0,
platform::errors::Unavailable("quantize model failed"));
}
}
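// Offline flow sketch for an engine keyed "abc" (paths illustrative; this
// mirrors the calls made by DlnneEngineOp below):
//   ConvertPaddle2Onnx("./dump/abc/abc.onnx", "./dump/abc");
//   QuantizeOnnx("./dump/abc/abc.onnx", "./dump/abc/abc.rlym",
//                "abc.quantized.rlym", "./calibration/dlnne_calib_abc",
//                "$DL_SDK_DIR/python/dleol/quantize/plugin.py");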
}  // namespace inference

namespace operators {
@@ -41,7 +140,23 @@ class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::string>(
        "engine_key",
        "The engine_key here is used to distinguish different DLNNE Engines");
AddAttr<framework::BlockDesc*>("sub_block", "the trt block"); AddAttr<int32_t>("max_batch_size", "engine max_batch_size");
AddAttr<bool>("use_static_batch", "static batch fix for [?,H,W,C]");
AddAttr<std::string>("weight_share_mode",
"dlnne weight_share_mode, can be '0', '1', '2', '3', "
"'01', '23', '0123' ");
    // When use_calib_mode is true and enable_int8 is true, the calibration
    // runtime starts. When calibration_mode is true, the calibration runtime
    // runs the first stage of calibration; once the first stage finishes,
    // calibration_mode is set to false and the calibration runtime runs the
    // second stage.
AddAttr<bool>("use_calib_mode", "dlnne use calib mode");
AddAttr<bool>("enable_int8", "dlnne enable int8");
AddAttr<bool>("calibration_mode", "dlnne calibration_mode");
AddAttr<std::string>("calibration_data_path", "calibration data path");
AddAttr<std::string>("subgraph_root_path", "subgraph root path");
AddAttr<framework::BlockDesc*>("sub_block", "the dlnne block");
AddComment("Dlnne engine operator."); AddComment("Dlnne engine operator.");
} }
}; };
......
@@ -13,25 +13,38 @@
// limitations under the License.

#pragma once

#include <assert.h>
#include <cuda.h>          // NOLINT
#include <cuda_runtime.h>  // NOLINT
#include <dlnne.h>         // NOLINT
#include <ctime>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <random>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_device_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
namespace dl {
namespace nne {
@@ -40,6 +53,31 @@ class Engine;
class Network;
class Parser;
class ExecutionContext;
inline unsigned int GetElementSize(DataType type) {
switch (type) {
case DataType::kINT64:
case DataType::kUINT64:
case DataType::kFLOAT64:
return 8;
case DataType::kINT32:
case DataType::kUINT32:
case DataType::kFLOAT32:
return 4;
case DataType::kINT16:
case DataType::kUINT16:
case DataType::kFLOAT16:
return 2;
case DataType::kINT8:
case DataType::kUINT8:
case DataType::kBOOL:
return 1;
case DataType::kUNKNOWN_TYPE:
return 0;
}
return 0;
}
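// E.g. GetElementSize(DataType::kFLOAT16) == 2 and
// GetElementSize(DataType::kFLOAT32) == 4.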
}  // namespace nne
}  // namespace dl
@@ -61,8 +99,45 @@ void CopyTensorDeviceToCpu(void *dst_ptr, void *src_ptr, int total_bytes);
void CopyTensorCpuToDevice(void *dst_ptr, void *src_ptr, int total_bytes);

std::string ConvertType(paddle::experimental::DataType type);

int GetDataByte(paddle::experimental::DataType type);

std::string GenerateRandomKey();
void ConvertPaddle2Onnx(std::string onnx_file_name,
std::string subgraph_root_path);
void QuantizeOnnx(std::string onnx_file_name,
std::string rlym_file_name,
std::string quantized_rlym_file_name,
std::string dataset_path,
std::string dataset_plugin_path);
static paddle::experimental::DataType DLNNE2FluidDataType(
dl::nne::DataType type) {
switch (type) {
case dl::nne::DataType::kFLOAT32:
return paddle::experimental::DataType::FLOAT32;
case dl::nne::DataType::kINT32:
return paddle::experimental::DataType::INT32;
case dl::nne::DataType::kINT64:
return paddle::experimental::DataType::INT64;
case dl::nne::DataType::kFLOAT16:
return paddle::experimental::DataType::FLOAT16;
case dl::nne::DataType::kUINT8:
return paddle::experimental::DataType::UINT8;
case dl::nne::DataType::kINT8:
return paddle::experimental::DataType::INT8;
case dl::nne::DataType::kBOOL:
return paddle::experimental::DataType::BOOL;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"unknown fluid datatype in Fluid op converter"));
return paddle::experimental::DataType::FLOAT32;
}
}
}  // namespace inference
}  // namespace paddle
@@ -70,15 +145,26 @@ namespace paddle {
namespace operators {

static std::mutex dlnne_create_lock;

class DlnneEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> input_names_;
  std::unordered_set<std::string> param_names_;
  std::string engine_key_;
bool use_static_batch_;
bool calibration_mode_;
std::string calibration_data_path_;
std::string subgraph_root_path_;
bool enable_int8_;
bool use_calib_mode_;
std::string weight_share_mode_;
int max_batch_size_;
  int num_inputs;
  int num_outputs;
  // std::vector<std::string> output_names;
  // std::vector<std::string> input_names;

  dl::nne::Builder *builder;
  dl::nne::Parser *parser;
@@ -89,6 +175,10 @@ class DlnneEngineOp : public framework::OperatorBase {
  unsigned int engine_input_size;
  std::vector<int> InputIndexToBindIndex_;
char *dump_flag_;
char *dlnne_log_flag_;
char *dl_sdk_dir_;
 public:
  DlnneEngineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
@@ -97,81 +187,214 @@ class DlnneEngineOp : public framework::OperatorBase {
      : framework::OperatorBase(type, inputs, outputs, attrs) {
    input_names_ = Inputs("Xs");
    engine_key_ = Attr<std::string>("engine_key");
use_static_batch_ = Attr<bool>("use_static_batch");
max_batch_size_ = Attr<int32_t>("max_batch_size");
weight_share_mode_ = Attr<std::string>("weight_share_mode");
calibration_mode_ = Attr<bool>("calibration_mode");
calibration_data_path_ = Attr<std::string>("calibration_data_path");
subgraph_root_path_ = Attr<std::string>("subgraph_root_path");
enable_int8_ = Attr<bool>("enable_int8");
use_calib_mode_ = Attr<bool>("use_calib_mode");
// dump input/output buffer of dlnne engine
dump_flag_ = getenv("PADDLE_DUMP_DLNNE_BUFFER");
dlnne_log_flag_ = getenv("PADDLE_DLNNE_LOG");
dl_sdk_dir_ = getenv("DL_SDK_DIR");
auto params = Attr<std::vector<std::string>>("parameters"); auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) { for (const auto &param : params) {
param_names_.insert(param); param_names_.insert(param);
} }
num_inputs = 0; std::vector<std::string> XsMap;
num_inputs = Inputs("Xs").size();
std::string valid_input_name_str = Attr<std::string>("valid_input_names");
for (const auto &x : Inputs("Xs")) { for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue; // input_names.push_back(x);
num_inputs += 1; XsMap.push_back(
input_names.push_back(x); valid_input_name_str.substr(0, valid_input_name_str.find(",")));
valid_input_name_str =
valid_input_name_str.substr(valid_input_name_str.find(",") + 1);
} }
    std::vector<std::string> YsMap;
    num_outputs = Outputs("Ys").size();
    std::string valid_output_name_str = Attr<std::string>("valid_output_names");
    for (const auto &y : Outputs("Ys")) {
      // output_names.push_back(y);
      YsMap.push_back(
          valid_output_name_str.substr(0, valid_output_name_str.find(",")));
      valid_output_name_str =
          valid_output_name_str.substr(valid_output_name_str.find(",") + 1);
    }
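    // E.g. a "valid_output_names" attribute of "out0,out1" yields
    // YsMap == {"out0", "out1"}; XsMap is built the same way from
    // "valid_input_names" (illustrative names).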
    // TODO(pei.jiang): add dlnne_engine manager to manage dlnne_engine
    if (!calibration_mode_) {
      std::map<std::string, dl::nne::WeightShareMode> weight_share_map;
      weight_share_map.insert(
          std::make_pair("0", dl::nne::WeightShareMode::kSingle));
      weight_share_map.insert(
          std::make_pair("1", dl::nne::WeightShareMode::kSingle));
      weight_share_map.insert(
          std::make_pair("2", dl::nne::WeightShareMode::kSingle));
      weight_share_map.insert(
          std::make_pair("3", dl::nne::WeightShareMode::kSingle));
      weight_share_map.insert(
          std::make_pair("01", dl::nne::WeightShareMode::kShare2));
      weight_share_map.insert(
          std::make_pair("23", dl::nne::WeightShareMode::kShare2));
      weight_share_map.insert(
          std::make_pair("0123", dl::nne::WeightShareMode::kShare4));

      std::map<std::string, dl::nne::ClusterConfig> cluster_config_map;
      cluster_config_map.insert(
          std::make_pair("0", dl::nne::ClusterConfig::kCluster0));
      cluster_config_map.insert(
          std::make_pair("1", dl::nne::ClusterConfig::kCluster1));
      cluster_config_map.insert(
          std::make_pair("2", dl::nne::ClusterConfig::kCluster2));
      cluster_config_map.insert(
          std::make_pair("3", dl::nne::ClusterConfig::kCluster3));
      cluster_config_map.insert(
          std::make_pair("01", dl::nne::ClusterConfig::kCluster01));
      cluster_config_map.insert(
          std::make_pair("23", dl::nne::ClusterConfig::kCluster23));
      cluster_config_map.insert(
          std::make_pair("0123", dl::nne::ClusterConfig::kCluster0123));

      dl::nne::WeightShareMode mode = weight_share_map[weight_share_mode_];
      dl::nne::ClusterConfig cluster_config =
          cluster_config_map[weight_share_mode_];
      if (dlnne_log_flag_) {
        LOG(INFO) << "weight_share_mode: " << mode
                  << " cluster_config: " << cluster_config;
      }

      std::string onnx_file_name =
          subgraph_root_path_ + "/" + engine_key_ + ".onnx";
      inference::ConvertPaddle2Onnx(onnx_file_name, subgraph_root_path_);
std::string rlym_file_name =
subgraph_root_path_ + "/" + engine_key_ + ".rlym";
      // quantize doesn't support setting the quantized output model path yet,
      // so the quantized model file is written to the current dir
std::string quantized_rlym_file_name = engine_key_ + ".quantized.rlym";
std::stringstream filename;
std::stringstream engine_file_name;
if (enable_int8_ && use_calib_mode_) {
std::string dataset_path = calibration_data_path_;
std::string cnt_dataset_path = dataset_path + "/" + input_names_[0];
std::stringstream dataset_plugin_path;
dataset_plugin_path << dl_sdk_dir_
<< "/python/dleol/quantize/plugin.py";
inference::QuantizeOnnx(onnx_file_name,
rlym_file_name,
quantized_rlym_file_name,
dataset_path,
dataset_plugin_path.str());
filename << quantized_rlym_file_name;
engine_file_name << subgraph_root_path_ << "/" << engine_key_
<< "_quantized"
<< "_ws_" << weight_share_mode_ << ".engine";
} else {
filename << onnx_file_name;
engine_file_name << subgraph_root_path_ << "/" << engine_key_ << "_ws_"
<< weight_share_mode_ << ".engine";
}
dlnne_create_lock.lock();
if (dlnne_log_flag_) {
LOG(INFO) << "EngineKey:" << engine_key_
<< " use_static_batch_:" << use_static_batch_
<< " max_batch_size_:" << max_batch_size_
<< " weight_share_mode_: " << weight_share_mode_;
}
builder = dl::nne::CreateInferBuilder();
PADDLE_ENFORCE_NE(
builder,
nullptr,
platform::errors::Unavailable("nne create builder failed"));
dl::nne::BuilderConfig builder_cfg;
builder_cfg.max_batch_size = max_batch_size_;
builder_cfg.ws_mode = weight_share_map[weight_share_mode_];
builder->SetBuilderConfig(builder_cfg);
network = builder->CreateNetwork();
parser = dl::nne::CreateParser();
PADDLE_ENFORCE_NE(
parser,
nullptr,
platform::errors::Unavailable("nne create parser failed"));
if (dlnne_log_flag_) {
LOG(INFO) << "set output for dlnne";
}
for (std::string &output_op_name : YsMap) {
parser->RegisterOutput(output_op_name.c_str());
if (dlnne_log_flag_) {
LOG(INFO) << output_op_name;
}
}
std::fstream engine_file;
engine_file.open(engine_file_name.str().c_str(), std::ios::in);
if (!engine_file) {
if (dlnne_log_flag_) {
LOG(INFO) << "parser model file for dlnne";
}
parser->Parse(filename.str().c_str(), *network);
if (dlnne_log_flag_) {
LOG(INFO) << "build network";
}
engine = builder->BuildEngine(*network);
auto memory = engine->Serialize();
std::ofstream out(engine_file_name.str().c_str(),
std::ofstream::binary);
out.write(reinterpret_cast<char *>(memory->Data()), memory->Size());
out.close();
memory->Destroy();
} else {
engine_file.seekg(0, std::ios::end);
uint64_t length = static_cast<uint64_t>(engine_file.tellg());
engine_file.seekg(0, std::ios::beg);
char *slz_data = new char[length];
engine_file.read(slz_data, static_cast<int64_t>(length));
engine = dl::nne::Deserialize(slz_data, length);
delete[] slz_data;
}
      engine_input_size = num_inputs + num_outputs;
      for (std::string &input_name : XsMap) {
int BindIndex = engine->GetBindingIndex(input_name.c_str());
InputIndexToBindIndex_.push_back(BindIndex);
}
for (std::string &output_name : YsMap) {
int BindIndex = engine->GetBindingIndex(output_name.c_str());
InputIndexToBindIndex_.push_back(BindIndex);
}
// context
context = engine->CreateExecutionContext(
cluster_config_map[weight_share_mode_]);
dlnne_create_lock.unlock();
}
  }

  ~DlnneEngineOp() {
    if (!calibration_mode_) {
      network->Destroy();
      context->Destroy();
      engine->Destroy();
      parser->Destroy();
      builder->Destroy();
    }
  }
 protected:
@@ -190,7 +413,42 @@ class DlnneEngineOp : public framework::OperatorBase {
    std::vector<int32_t> input_data_types(num_inputs);
    std::vector<int64_t> input_bytes(num_inputs);
dlnne_create_lock.lock();
    int index = 0;
int infer_batch = 1;
std::vector<int> vec_infer_batch;
// compute infer_batch
if (use_static_batch_) {
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
// convert input and copy to Dlnne engine's buffer
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = phi::vectorize<int64_t>(t.dims());
std::vector<int64_t> runtime_input_shape(t_shape.begin(),
t_shape.end());
const int bind_index = index;
index++;
dl::nne::Dims in_dim = engine->GetBindingDimensions(bind_index);
int compute_batch = runtime_input_shape[0] / in_dim.d[0];
VLOG(4) << "compute batch: " << compute_batch;
vec_infer_batch.push_back(compute_batch);
}
int first_batch = vec_infer_batch[0];
for (auto batch : vec_infer_batch) {
PADDLE_ENFORCE_EQ(
first_batch,
batch,
platform::errors::Unavailable(
"compute infer_batchs is different from each other"));
}
infer_batch = first_batch;
}
index = 0;
for (const auto &x : Inputs("Xs")) { for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue; if (param_names_.count(x)) continue;
// convert input and copy to Dlnne engine's buffer // convert input and copy to Dlnne engine's buffer
@@ -199,26 +457,33 @@ class DlnneEngineOp : public framework::OperatorBase {
const int bind_index = index;
index++;
int64_t data_bytes, ele_num;
int32_t dtype;
auto type = t.type();
data_bytes = 1;
ele_num = 1;
void *buffer = nullptr;
// TODO(pei.jiang): add more type
if (type == paddle::experimental::DataType::FLOAT32) {
buffer = static_cast<void *>(t.data<float>());
data_bytes = 4;
dtype = 0;
} else if (type == paddle::experimental::DataType::INT64) {
buffer = static_cast<void *>(t.data<int64_t>());
data_bytes = 8;
dtype = 1;
} else if (type == paddle::experimental::DataType::INT32) {
buffer = static_cast<void *>(t.data<int32_t>());
data_bytes = 4;
dtype = 2;
} else if (type == paddle::experimental::DataType::FLOAT16) {
buffer = static_cast<void *>(t.data<paddle::platform::float16>());
data_bytes = 2;
dtype = 3;
} else {
PADDLE_THROW(
    platform::errors::Fatal("The DLNNE Engine OP only supports "
                            "float/int32_t/int64_t/float16 input."));
}
input_buffers[bind_index] = buffer;
@@ -226,6 +491,7 @@ class DlnneEngineOp : public framework::OperatorBase {
std::vector<int64_t> runtime_input_shape(t_shape.begin(), t_shape.end());
for (auto &size : t_shape) {
data_bytes = data_bytes * size;
ele_num = ele_num * size;
}
VLOG(4) << "buffers_size:" << data_bytes;
@@ -234,35 +500,59 @@ class DlnneEngineOp : public framework::OperatorBase {
input_shapes[bind_index] = runtime_input_shape;
input_data_types[bind_index] = dtype;
input_bytes[bind_index] = data_bytes;
if (dump_flag_) {
std::stringstream dump_input_name;
dump_input_name << engine_key_ << "_input_" << bind_index << ".txt";
std::ofstream dump_input_file;
dump_input_file.open(dump_input_name.str());
for (int64_t i = 0; i < ele_num; i++) {
dump_input_file << static_cast<float *>(
cpu_input_buffers[bind_index])[i]
<< "\n";
}
dump_input_file << "\b";
dump_input_file.close();
}
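// When dump_flag_ is set, each input is spilled to a text file named
// "<engine_key_>_input_<bind_index>.txt", one value per line, which is
// handy for diffing against a reference run. (The buffer is printed via
// a float cast regardless of dtype, so non-float inputs dump raw
// reinterpreted bytes.)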
}
// output shape
std::vector<std::vector<int64_t>> out_shapes;
std::vector<dl::nne::DataType> out_types;
std::vector<int64_t> out_ele_nums;
std::vector<int32_t> output_bytes;
for (int i = 0; i < num_outputs; i++) {
int index = InputIndexToBindIndex_[i + num_inputs];
dl::nne::DataType out_type = engine->GetBindingDataType(index);
out_types.push_back(out_type);
dl::nne::Dims out_dim = engine->GetBindingDimensions(index);
std::vector<int64_t> shape(out_dim.nbDims);
for (int dim = 0; dim < out_dim.nbDims; dim++) {
if (use_static_batch_ && dim == 0) {
shape[dim] = (out_dim.d[dim]) * infer_batch;
} else {
shape[dim] = (out_dim.d[dim]);
}
}
out_shapes.push_back(shape);
int64_t data_bytes, out_ele_num;
out_ele_num = 1;
// element size depends on the binding's data type
data_bytes = dl::nne::GetElementSize(out_type);
for (auto &size : shape) {
data_bytes = data_bytes * size;
out_ele_num = out_ele_num * size;
}
VLOG(4) << "data_bytes: " << data_bytes;
output_bytes.push_back(data_bytes);
out_ele_nums.push_back(out_ele_num);
}
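// Per-output byte size = element size of the bound dtype times the product
// of the (batch-scaled) dims, mirroring the input-side computation above.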
int bind_index = 0;
std::vector<void *> cpu_output_buffers(num_outputs);
std::vector<void *> output_buffers(num_outputs);
for (const auto &y : Outputs("Ys")) {
auto *fluid_v = scope.FindVar(y);
@@ -273,15 +563,19 @@ class DlnneEngineOp : public framework::OperatorBase {
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
VLOG(4) << bind_index << ": out_shapes[bind_index] dim:"
        << out_shapes[bind_index].size();
fluid_t->Resize(phi::make_ddim(out_shapes[bind_index]));
dl::nne::DataType dl_type = out_types[bind_index];
if (dlnne_log_flag_) {
LOG(INFO) << "output type: " << dl_type;
}
output_buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data(
    dev_place, inference::DLNNE2FluidDataType(dl_type)));
cpu_output_buffers[bind_index] =
    output_buffers[bind_index];  // malloc(data_bytes);
bind_index++;
}
@@ -289,7 +583,9 @@ class DlnneEngineOp : public framework::OperatorBase {
// set input_ptr
for (unsigned int i = 0; i < engine_input_size; i++) {
if (InputIndexToBindIndex_[i] < 0) {
continue;
}
if (engine->BindingIsInput(InputIndexToBindIndex_[i])) {
// copy cpu buffer to gpu buffer
@@ -308,7 +604,7 @@ class DlnneEngineOp : public framework::OperatorBase {
} else {
int64_t total_size;
total_size = output_bytes[i - input_names_.size()];
VLOG(4) << "output_bytes: " << total_size;
void *gpu_ptr;
cudaMalloc(&gpu_ptr, total_size);
@@ -318,36 +614,142 @@ class DlnneEngineOp : public framework::OperatorBase {
clock_t startTime, endTime;
startTime = clock();
context->Execute(infer_batch, engine_input_ptr.data());
endTime = clock();
if (dlnne_log_flag_) {
double during_ms =
    static_cast<double>(endTime - startTime) / CLOCKS_PER_SEC * 1000;
LOG(INFO) << "dlNNE execute time: " << during_ms << " ms";
}
bind_index = 0;
for (unsigned int i = 0; i < engine_input_size; i++) {
if (InputIndexToBindIndex_[i] < 0) continue;
if (i >= input_names_.size()) {
void *cpu_ptr = cpu_output_buffers[i - input_names_.size()];
int64_t size;
size = output_bytes[i - input_names_.size()];
paddle::inference::CopyTensorDeviceToCpu(
    cpu_ptr, engine_input_ptr[InputIndexToBindIndex_[i]], size);
cpu_output_buffers[bind_index] = cpu_ptr;
if (dump_flag_) {
std::stringstream dump_output_name;
dump_output_name << engine_key_ << "_output_" << bind_index << ".txt";
std::ofstream dump_output_file;
dump_output_file.open(dump_output_name.str());
for (int64_t i = 0; i < out_ele_nums[bind_index]; i++) {
dump_output_file
<< static_cast<float *>(cpu_output_buffers[bind_index])[i]
<< "\n";
}
dump_output_file << "\b";
dump_output_file.close();
}
bind_index++;
}
cudaFree(engine_input_ptr[InputIndexToBindIndex_[i]]);
}
dlnne_create_lock.unlock();
}
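// Summary of RunDlnneOnCreateEngine: stage host inputs into freshly
// cudaMalloc'ed device buffers, run context->Execute(infer_batch, ...),
// copy output bindings back into the fluid tensors, then free the device
// buffers; the whole section is serialized by dlnne_create_lock.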
void RunNativeImpl(const framework::Scope &scope,
const platform::Place &dev_place) const {
VLOG(4) << "RunNativeImpl";
framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>("sub_block");
auto *program = block->Program();
auto &current_scope = scope.NewScope();
auto ctx = executor.Prepare(*program, block->ID());
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
}
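// RunNativeImpl falls back to executing the original sub-block with the
// plain framework::Executor; calibration mode uses it so the model still
// produces real outputs while calibration data is being collected.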
void RunCalibration(const framework::Scope &scope,
const platform::Place &dev_place) const {
std::unordered_map<std::string, void *> calib_data_map;
std::unordered_map<std::string, std::vector<int64_t>> calib_data_shape_map;
std::unordered_map<std::string, std::string> calib_data_type_map;
std::unordered_map<std::string, int64_t> calib_buffer_size_map;
for (auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
calib_data_map.emplace(x, t.data());
// TODO(pei.jiang): refine this code, because when run dlnne create
// engine, there is same code
auto t_shape = phi::vectorize<int64_t>(t.dims());
std::vector<int64_t> input_shape(t_shape.begin(), t_shape.end());
calib_data_shape_map.emplace(x, input_shape);
std::string data_type = inference::ConvertType(t.type());
calib_data_type_map.emplace(x, data_type);
int data_bytes = inference::GetDataByte(t.type());
VLOG(4) << "input name: " << x << ", data_type: " << data_type;
VLOG(4) << "data shape: ";
int64_t buffer_size = data_bytes;
for (auto dim : input_shape) {
buffer_size *= dim;
VLOG(4) << dim;
}
VLOG(4) << "buffer_size: " << buffer_size;
calib_buffer_size_map.emplace(x, buffer_size);
}
std::string random_key = inference::GenerateRandomKey();
for (auto calib_data : calib_data_map) {
std::string input_name = calib_data.first;
std::string input_data_path = calibration_data_path_ + "/" + input_name;
MKDIR(input_data_path.c_str());
std::string input_data_item_path =
input_data_path + "/" + random_key + ".binary";
auto outfile = std::fstream(input_data_item_path.c_str(),
std::ios::out | std::ios::binary);
int64_t buffer_size = calib_buffer_size_map[input_name];
outfile.write(reinterpret_cast<char *>(calib_data.second), buffer_size);
outfile.close();
}
std::stringstream calib_config_ss;
calib_config_ss << "shape message: " << std::endl;
for (auto const &shape_item : calib_data_shape_map) {
calib_config_ss << shape_item.first << ":";
for (auto const &dim : shape_item.second) {
calib_config_ss << dim << " ";
}
calib_config_ss << std::endl;
}
calib_config_ss << "dtype message: " << std::endl;
for (auto const &dtype_item : calib_data_type_map) {
calib_config_ss << dtype_item.first << ":" << dtype_item.second
<< std::endl;
}
std::ofstream calib_config_file;
std::string calib_config_path =
calibration_data_path_ + "/calib_config.txt";
calib_config_file.open(calib_config_path);
calib_config_file << calib_config_ss.str();
calib_config_file.close();
RunNativeImpl(scope, dev_place);
}
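// The calibration artifacts written above end up laid out as follows
// (values illustrative):
//   <calibration_data_path_>/<input_name>/<random_key>.binary
//   <calibration_data_path_>/calib_config.txt, e.g.
//     shape message:
//     x:1 3 224 224
//     dtype message:
//     x:float32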
void RunImpl(const framework::Scope &scope,
             const platform::Place &dev_place) const override {
VLOG(4) << "calibration_mode_: " << calibration_mode_;
if (calibration_mode_ == true) {
VLOG(4) << "RunCalibration";
RunCalibration(scope, dev_place);
return;
}
RunDlnneOnCreateEngine(scope, dev_place);
}
};
...
@@ -730,7 +730,16 @@ void BindAnalysisConfig(py::module *m) {
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne", .def("enable_dlnne",
&AnalysisConfig::EnableDlnne, &AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3) py::arg("min_subgraph_size") = 3,
py::arg("max_batch_size") = 1,
py::arg("use_static_batch") = false,
py::arg("weight_share_mode") = "0",
py::arg("disable_nodes_by_outputs") =
std::unordered_set<std::string>(),
py::arg("input_shape_dict") =
std::map<std::string, std::vector<int64_t>>(),
py::arg("use_calib_mode") = false,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("enable_lite_engine", .def("enable_lite_engine",
&AnalysisConfig::EnableLiteEngine, &AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
...
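For reference, a minimal C++ sketch of driving the extended interface through
AnalysisConfig (argument order taken from the pybind signature above; the
model path, input name, and shapes are placeholders, not part of this patch):

#include <cstdint>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config("./model_dir");  // hypothetical model dir
  std::map<std::string, std::vector<int64_t>> input_shape_dict{
      {"x", {1, 3, 224, 224}}};  // hypothetical input name/shape
  config.EnableDlnne(/*min_subgraph_size=*/3,
                     /*max_batch_size=*/1,
                     /*use_static_batch=*/false,
                     /*weight_share_mode=*/"0",
                     /*disable_nodes_by_outputs=*/{},
                     /*input_shape_dict=*/input_shape_dict,
                     /*use_calib_mode=*/false,
                     paddle::AnalysisConfig::Precision::kFloat32);
  return 0;
}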