diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
index 6139ccb31378874f74fafcc4822993d89b218111..21d2a602001ad834943ac67eb4c81bd039f15b6c 100644
--- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -249,7 +249,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
     subgraphes_[i] = graph->GetSubGraph(i);
     all_op_nodes_[i] = TopologySortOperations(*subgraphes_[i]);
     VLOG(4) << "subgraph " << i << " has " << all_op_nodes_[i].size()
-            << "op nodes";
+            << " op nodes";
     for (auto* var_node : subgraphes_[i]->Nodes()) {
       if (!var_node->IsVar()) continue;
 
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 8492db828a4edb96dffb94061b7fadafff98b696..2289543dd44f5b71c36434bd590c7951ae397157 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -64,10 +64,6 @@ void NaiveExecutor::Run() {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
-                                platform::NvtxRangeColor::Green);
-#endif
 
     for (auto &func : input_hookfuncs_) {
       func(op.get(), scope_);
@@ -77,7 +73,14 @@
       op->SetOutputHooks(output_hookfuncs_);
     }
 
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
+                                platform::NvtxRangeColor::Green);
+#endif
     op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_NVTX
+    platform::CudaNvtxRangePop();
+#endif
 
     // Update the shared_holder so that only records the max one.
     if (reuse_cache_.count(op.get())) {
@@ -105,9 +108,6 @@
       }
     }
 
-#ifdef PADDLE_WITH_NVTX
-    platform::CudaNvtxRangePop();
-#endif
     for (auto &func : output_hookfuncs_) {
       func(op.get(), scope_);
     }
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 2b827f87ada821623f9b1f0f348bec1aad6a921e..e5224b9fa5b415aaf3c1382de3ceafc8ee457f15 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,8 +14,10 @@
 // limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" + #include #include +#include #include #include @@ -32,6 +34,7 @@ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -124,11 +127,6 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); - static std::once_flag trt_plugin_registered; - std::call_once(trt_plugin_registered, []() { - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); - }); - auto model_precision = static_cast(Get("model_precision")); if (model_precision == phi::DataType::BFLOAT16) { @@ -291,7 +289,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // Add new block for TensorRTEngineOP const framework::BlockDesc &main_block = program_desc->Block(framework::kRootBlockIndex); - // const framework::BlockDesc& main_block = program_desc->Block(0); framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); // A fake block desc. @@ -319,9 +316,9 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // is unique. std::set input_names; std::set input_names_with_id; - std::vector params; - // if we delete fluid copy of params shared by more than 1 ops, there will be - // problem, so we filter them out. + std::vector parameters; + // if we delete fluid copy of parameters shared by more than 1 ops, there will + // be problem, so we filter them out. std::vector params_not_shared; auto *scope = param_scope(); @@ -330,7 +327,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { - params.push_back(x->Name()); + parameters.push_back(x->Name()); } if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 && x->outputs.size() <= 1) { @@ -340,33 +337,15 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( // So we reserved a name for later use when casting INT64 -> INT32 or // FP64->FP32. We must check whether scope has had the same name var! 
if (x->Var()->GetDataType() == framework::proto::VarType::INT64) { - std::string tmp_name = x->Name() + "_cast_to_INT32"; LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name() << " having int64 dtype in pdmodel description, we will cast them to " "int32 dtype to feed them into paddle-trt."; - /* - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", - tmp_name)); - */ - scope->Var(tmp_name); } else if (x->Var()->GetDataType() == framework::proto::VarType::FP64) { - std::string tmp_name = x->Name() + "_cast_to_FP32"; LOG(WARNING) << "tensorrt_subgraph's input named " << x->Name() << " having float64 dtype in pdmodel description, we will " "cast them to " "float32 dtype to feed them into paddle-trt."; - /* - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", - tmp_name)); - */ - scope->Var(tmp_name); } } @@ -412,10 +391,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } - auto precision_mode = Get("trt_precision_mode"); + auto precision_mode = + static_cast(Get("trt_precision_mode")); bool enable_fp16 = false; - if (precision_mode == static_cast(phi::DataType::FLOAT16)) - enable_fp16 = true; + if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true; auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *framework::ir::Agent(node).subgraph(); @@ -423,14 +402,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( Get>>("min_input_shape"); auto max_input_shape = Get>>("max_input_shape"); - auto opt_input_shape = + auto optim_input_shape = Get>>("optim_input_shape"); auto min_shape_tensor = Get>>("min_shape_tensor"); auto max_shape_tensor = Get>>("max_shape_tensor"); - auto opt_shape_tensor = + auto optim_shape_tensor = Get>>("optim_shape_tensor"); auto allow_build_at_runtime = Get("trt_allow_build_at_runtime"); @@ -444,10 +423,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, - &opt_input_shape, + &optim_input_shape, &min_shape_tensor, &max_shape_tensor, - &opt_shape_tensor); + &optim_shape_tensor); } else { shape_range_info_path = Get("model_opt_cache_dir") + "shape_range_info.pbtxt"; @@ -457,10 +436,10 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, - &opt_input_shape, + &optim_input_shape, &min_shape_tensor, &max_shape_tensor, - &opt_shape_tensor); + &optim_shape_tensor); } else { int fd = open(shape_range_info_path.c_str(), O_WRONLY | O_CREAT, 0644); close(fd); @@ -509,32 +488,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::INT64) { - std::string tmp_name = name + "_cast_to_INT64"; LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having int64 dtype in pdmodel description, but in fact " "it is int32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into int64."; - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", tmp_name)); - scope->Var(tmp_name); } else if (static_cast( map_origin_outputs_dtype[name]) == framework::proto::VarType::FP64) { - std::string tmp_name = name + "_cast_to_FP64"; 
LOG(WARNING) << "tensorrt_subgraph's output named " << name << " having float64 dtype in pdmodel description, but in fact " "it is float32 " "dtype after executing this tensorrt_subgraph, so we " "need cast them into float64."; - PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), - nullptr, - platform::errors::InvalidArgument( - "The var name %s has exists in scope.", tmp_name)); - scope->Var(tmp_name); } } PADDLE_ENFORCE_EQ(output_mapping.empty(), @@ -546,30 +513,73 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( true, platform::errors::PreconditionNotMet("the block has no var-desc")); - // Set attrs + // Get pass attrs. + auto use_varseqlen = Get("use_varseqlen"); + auto with_interleaved = Get("with_interleaved"); + auto tensorrt_transformer_posid = + Get("tensorrt_transformer_posid"); + auto tensorrt_transformer_maskid = + Get("tensorrt_transformer_maskid"); + auto use_dla = Get("trt_use_dla"); + auto dla_core = Get("trt_dla_core"); + auto use_inspector = Get("use_inspector"); + auto disable_trt_plugin_fp16 = Get("disable_trt_plugin_fp16"); + auto context_memory_sharing = Get("context_memory_sharing"); + auto enable_low_precision_io = Get("enable_low_precision_io"); + auto workspace_size = Get("workspace_size"); + auto gpu_device_id = Get("gpu_device_id"); + + // Set op's attrs. op_desc->SetType("tensorrt_engine"); op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - op_desc->SetOutput( "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype); op_desc->SetAttr("max_batch_size", max_batch_size); - op_desc->SetAttr("workspace_size", Get("workspace_size")); - op_desc->SetAttr("gpu_id", Get("gpu_device_id")); + op_desc->SetAttr("workspace_size", workspace_size); + op_desc->SetAttr("gpu_device_id", gpu_device_id); op_desc->SetAttr("output_name_mapping", output_mapping); op_desc->SetAttr("origin_output_rank", renamed_output_rank); - op_desc->SetAttr("parameters", params); + op_desc->SetAttr("parameters", parameters); op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); - op_desc->SetAttr("use_inspector", Get("use_inspector")); - op_desc->SetAttr("model_precision", Get("model_precision")); + op_desc->SetAttr("use_inspector", use_inspector); op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape); - op_desc->SetAttr("enable_low_precision_io", - Get("enable_low_precision_io")); + op_desc->SetAttr("enable_low_precision_io", enable_low_precision_io); + + if (!trt_tuned_dynamic_shape) { + std::vector dynamic_shape_names; + std::vector dynamic_shape_lens; + std::vector min_input_shape_vector; + std::vector max_input_shape_vector; + std::vector opt_input_shape_vector; + for (const auto &it : min_input_shape) { + dynamic_shape_names.push_back(it.first); + dynamic_shape_lens.push_back(it.second.size()); + for (const auto &value : it.second) { + min_input_shape_vector.push_back(value); + } + } + for (const auto &it : max_input_shape) { + for (const auto &value : it.second) { + max_input_shape_vector.push_back(value); + } + } + for (const auto &it : optim_input_shape) { + for (const auto &value : it.second) { + opt_input_shape_vector.push_back(value); + } + } + + op_desc->SetAttr("dynamic_shape_names", dynamic_shape_names); + op_desc->SetAttr("dynamic_shape_lens", dynamic_shape_lens); + 
op_desc->SetAttr("min_input_shape_vector", min_input_shape_vector); + op_desc->SetAttr("max_input_shape_vector", max_input_shape_vector); + op_desc->SetAttr("opt_input_shape_vector", opt_input_shape_vector); + } // we record all inputs' shapes in attr to check if they are consistent // with the real inputs' shapes retrieved from scope when trt runs. @@ -624,14 +634,20 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("engine_key", engine_key); op_desc->SetAttr("calibration_engine_key", calibration_engine_key); op_desc->SetAttr("predictor_id", predictor_id); - - std::string trt_engine_serialized_data = ""; + op_desc->SetAttr("use_varseqlen", use_varseqlen); + op_desc->SetAttr("with_interleaved", with_interleaved); + op_desc->SetAttr("use_dla", use_dla); + op_desc->SetAttr("dla_core", dla_core); + op_desc->SetAttr("disable_trt_plugin_fp16", disable_trt_plugin_fp16); + op_desc->SetAttr("context_memory_sharing", context_memory_sharing); + std::string trt_engine_serialized_data; op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data); op_desc->Flush(); std::unique_ptr calibrator; if (enable_int8 && !calibration_data.empty()) { - calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + calibrator = + std::make_unique(calibration_data); LOG(INFO) << "RUN Paddle TRT int8 calibration mode..."; } // When in int8 mode and calibration_mode, the program just produce the @@ -656,7 +672,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( "static shape mode instead."; min_input_shape = {}; max_input_shape = {}; - opt_input_shape = {}; + optim_input_shape = {}; } const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION); @@ -677,42 +693,33 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( framework::ir::Agent(node).subgraph()->end()); framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); - // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not - // run fp16. - // When running fp16, the output accuracy of the model will be affected, - // closing the plugin fp16 may bring some improvement on accuracy. 
- bool disable_trt_plugin_fp16 = Get("disable_trt_plugin_fp16"); + tensorrt::TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size; + params.max_workspace_size = workspace_size; + params.calibrator = calibrator.get(); + params.device_id = gpu_device_id; + params.with_dynamic_shape = with_dynamic_shape; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; + params.min_shape_tensor = min_shape_tensor; + params.max_shape_tensor = max_shape_tensor; + params.optim_shape_tensor = optim_shape_tensor; + params.disable_trt_plugin_fp16 = disable_trt_plugin_fp16; + params.precision = precision_mode; + params.use_varseqlen = use_varseqlen; + params.use_dla = use_dla; + params.dla_core = dla_core; + params.with_interleaved = with_interleaved; + params.tensorrt_transformer_posid = tensorrt_transformer_posid; + params.tensorrt_transformer_maskid = tensorrt_transformer_maskid; + params.context_memory_sharing = context_memory_sharing; + params.use_inspector = use_inspector; + params.enable_low_precision_io = enable_low_precision_io; + tensorrt::TensorRTEngine *trt_engine = inference::Singleton::Global() - .Create(engine_key + std::to_string(predictor_id), - max_batch_size, - Get("workspace_size"), - static_cast(precision_mode), - calibrator.get(), - Get("gpu_device_id"), - with_dynamic_shape, - min_input_shape, - max_input_shape, - opt_input_shape, - min_shape_tensor, - max_shape_tensor, - opt_shape_tensor, - disable_trt_plugin_fp16, - static_cast(Get("model_precision"))); - trt_engine->SetUseOSS(Get("use_varseqlen")); - trt_engine->SetWithInterleaved(Get("with_interleaved")); - trt_engine->SetTransformerPosid( - Get("tensorrt_transformer_posid")); - trt_engine->SetTransformerMaskid( - Get("tensorrt_transformer_maskid")); - trt_engine->SetUseDLA(Get("trt_use_dla")); - trt_engine->SetDLACore(Get("trt_dla_core")); - trt_engine->SetUseInspector(Get("use_inspector")); - trt_engine->SetWithErnie( - graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)); - trt_engine->SetContextMemorySharing(Get("context_memory_sharing")); - trt_engine->SetLowPrecisionIO(Get("enable_low_precision_io")); + .Create(engine_key + std::to_string(predictor_id), params); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( @@ -749,13 +756,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp( "kernel etc). 
This process may cost a lot of time."; framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); - std::unordered_set param_set(params.begin(), params.end()); + std::unordered_set parameters_set(parameters.begin(), + parameters.end()); inference::Singleton::Global() .ConvertBlockToTRTEngine( &block_desc_temp, *scope, std::vector(input_names.begin(), input_names.end()), - param_set, + parameters_set, output_mapping, trt_engine); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 60d94eae7a6e03395d43bb648076bc5e3c2f6a37..8106dfbb9e6aae1e3319cb9f7ee3a01d64d52c1f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include #include #include "paddle/fluid/framework/executor.h" @@ -63,8 +64,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { "set.")); } - auto graph = std::unique_ptr( - new framework::ir::Graph(argument->main_program())); + auto graph = std::make_unique(argument->main_program()); argument->SetMainGraph(graph.release()); auto *scope_ptr = argument->scope_ptr(); PADDLE_ENFORCE_NOT_NULL(scope_ptr, diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index 2cfba89a91c359e725e8d52c0e3263835de7c465..80326b243ad461d84cfd16abc8b198ebbf60560c 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -24,16 +24,6 @@ namespace inference { namespace analysis { void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { - if (!argument->save_optimized_model()) { - LOG(WARNING) << "save_optim_cache_model is turned off, skip " - "save_optimized_model_pass"; - return; - } - if (!argument->enable_ir_optim()) { - LOG(WARNING) << "ir_optim is turned off, skip save_optimized_model_pass"; - return; - } - std::string model_opt_cache_dir = argument->optim_cache_dir(); if (!model_opt_cache_dir.empty()) { if (!PathExists(model_opt_cache_dir)) { @@ -55,9 +45,11 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { auto* graph = argument->main_graph_ptr(); framework::ProgramDesc optimized_program_desc; + // NOTE(liuyuanle): If the following line of code is not added, an error // [SegmentFault] may occur! optimized_program_desc.CopyFrom(*argument->main_program().Proto()); + framework::ir::GraphToProgram(*graph, &optimized_program_desc); auto IsPersistable = [](const framework::VarDesc* var) { @@ -133,11 +125,10 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } void SaveOptimizedModelPass::RunImpl(Argument* argument) { - // TODO(inference): Support trt. 
- if (argument->use_xpu() || - (argument->use_gpu() && !argument->use_tensorrt())) { - SaveOptimizedModel(argument); + if (!argument->save_optimized_model() || !argument->enable_ir_optim()) { + return; } + SaveOptimizedModel(argument); } std::string SaveOptimizedModelPass::repr() const { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 037b4a32211a718b888f34f90b8aaa84fcc0b0db..78fb47c4b80532bae9d0469adf14d1bb8c794ea5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -641,7 +641,7 @@ bool AnalysisPredictor::PrepareProgram( } bool AnalysisPredictor::CreateExecutor() { - executor_.reset(new paddle::framework::NaiveExecutor(place_)); + executor_ = std::make_unique(place_); return true; } @@ -1341,7 +1341,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::PrepareArgument() { VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. - argument_.reset(new Argument); + argument_ = std::make_unique(); argument_->SetUseGPU(config_.use_gpu()); argument_->SetUseCutlass(config_.use_cutlass_); argument_->SetUseFcPadding(config_.use_fc_padding()); @@ -1570,7 +1570,8 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.ir_optim()) { argument_->SetEnableIrOptim(false); - if (config_.enable_gpu_mixed_) { + if (config_.enable_gpu_mixed_ && + model_precision_ == phi::DataType::FLOAT32) { argument_->SetEnableIrOptim(true); pass_builder->ClearPasses(); pass_builder->AppendPass("auto_mixed_precision_pass"); @@ -1886,6 +1887,10 @@ AnalysisPredictor::GetInputTypes() { input_type[name] = paddle_infer::DataType::UINT8; } else if (dtype == paddle::framework::proto::VarType::INT8) { input_type[name] = paddle_infer::DataType::INT8; + } else if (dtype == paddle::framework::proto::VarType::FP64) { + input_type[name] = paddle_infer::DataType::FLOAT64; + } else if (dtype == paddle::framework::proto::VarType::BOOL) { + input_type[name] = paddle_infer::DataType::BOOL; } else { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Unsupported data type `%s` when get input dtype ", dtype)); @@ -2609,7 +2614,7 @@ AnalysisPredictor::~AnalysisPredictor() { #ifdef PADDLE_WITH_TENSORRT if (config_.trt_engine_memory_sharing()) { inference::Singleton::Global() - .releaseContextMemory(predictor_id_); + .ReleaseContextMemory(predictor_id_); } #endif } diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index b07b20bb82121260a25ecd49b2287eca8cc37b6f..429bc89f0d90ea42366696dc26f9aad48f9158ca 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -167,7 +167,7 @@ class OpConverter { op_desc.Type())); it->SetEngine(engine); - engine->SetScope(scope); + engine->SetScope(&scope); it->SetBlockDesc(block); (*it)(op, scope, test_mode); @@ -301,7 +301,7 @@ class OpConverter { nvinfer1::DataType in_dtype = FluidDataType2TRT(var->GetDataType()); if (engine->precision() == phi::DataType::FLOAT16 && in_dtype == nvinfer1::DataType::kFLOAT && - engine->EnableLowPrecisionIO()) { + engine->LowPrecisionIOEnabled()) { in_dtype = nvinfer1::DataType::kHALF; } @@ -360,7 +360,7 @@ class OpConverter { nvinfer1::DataType out_dtype = FluidDataType2TRT(var->GetDataType()); if (engine->precision() == phi::DataType::FLOAT16 && out_dtype == nvinfer1::DataType::kFLOAT && - engine->EnableLowPrecisionIO()) { + 
engine->LowPrecisionIOEnabled()) { out_dtype = nvinfer1::DataType::kHALF; } engine->DeclareOutput(output, out_dtype); @@ -470,7 +470,7 @@ class OpConverter { auto shape = newShape->getDimensions(); shuffle->setReshapeDimensions(shape); } - if (name != "") { + if (!name.empty()) { shuffle->setName(name.c_str()); } return shuffle->getOutput(0); @@ -481,7 +481,7 @@ class OpConverter { const std::string& name = "") { auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); shuffle->setReshapeDimensions(shape); - if (name != "") { + if (!name.empty()) { shuffle->setName(name.c_str()); } return shuffle->getOutput(0); @@ -774,11 +774,6 @@ class OpConverter { bool test_mode_; private: - // registered op converter map, whose key is the fluid op type, and value is - // the pointer position of corresponding OpConverter class. - std::unordered_map converters_; - // fluid inference scope - framework::Scope* scope_{nullptr}; std::mutex mut_; }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc index 278c647ca582c02e0b5c75e6d9fbb74acf7a97bb..4ab1f7ca3f2baeb65c0e5129d6db549bfbbde73c 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include // NOLINT +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -95,7 +96,11 @@ TEST(CustomPluginCreater, StaticShapePlugin) { // init trt engine std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15)); + + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); engine_->DeclareInput( @@ -173,15 +178,10 @@ TEST(CustomPluginCreater, DynamicShapePlugin) { std::map> optim_input_shape = { {"x", {1, 2, 5, 5}}}; - engine_.reset(new TensorRTEngine(5, - 1 << 15, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); LOG(INFO) << "with_dynamic_shape " << engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 3a15af255e5bce4118a23120b1c5c9293817ed65..b653df0bca83bccb8c6460e506b17985c3240b88 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include // NOLINT +#include + #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -28,7 +30,10 @@ TEST(OpConverter, ConvertBlock) { // init trt engine std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 5; + params.max_workspace_size = 1 << 15; + engine_ = std::make_unique(params); engine_->InitNetwork(); engine_->DeclareInput( diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index ee6b1393c8aae59a3c291f6816ae0b7dd939a92a..8a41d564a09da3e4c73ebf2a6aff2b2c44f12690 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -88,7 +88,10 @@ class TRTConvertValidation { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0, platform::errors::External("cudaStreamCreate error.")); - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size; + params.max_workspace_size = workspace_size; + engine_ = std::make_unique(params); engine_->InitNetwork(); } @@ -155,7 +158,7 @@ class TRTConvertValidation { engine_->FreezeNetwork(); // Declare outputs. - op_desc_.reset(new framework::OpDesc(desc, nullptr)); + op_desc_ = std::make_unique(desc, nullptr); } // We use the set 'neglected_output' here, because some Ops like batch norm, diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 6e9beb810f0bd067bd324d56ba35c9d6a9c4f27e..7803989aa38c5f00f8eb1071e6b0b11efe9a1bb8 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -21,17 +21,15 @@ limitations under the License. */ #include "NvInferRuntimeCommon.h" #include "cuda_runtime_api.h" // NOLINT + #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/enforce.h" namespace paddle { namespace inference { namespace tensorrt { -int TensorRTEngine::runtime_batch_ = 1; thread_local int TensorRTEngine::predictor_id_per_thread = -1; void TensorRTEngine::Weight::SetDataType(phi::DataType type) { @@ -64,10 +62,10 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) { } void TensorRTEngine::InitNetwork() { - freshDeviceId(); + FreshDeviceId(); infer_builder_.reset(createInferBuilder(&logger_)); - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { infer_network_.reset(infer_builder_->createNetworkV2( 1U << static_cast( nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); @@ -92,7 +90,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { // IExecutionContext... // It's ok. We will set it later. 
nvinfer1::IExecutionContext *infer_context{nullptr}; - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { infer_context = infer_engine_->createExecutionContextWithoutDeviceMemory(); } else { @@ -102,7 +100,7 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { infer_context, platform::errors::InvalidArgument( "TensorRT engine can not build execution context.")); - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { // need new profile if it's not the first if (cur_profile_num_ > 0) { infer_context->setOptimizationProfile(cur_profile_num_); @@ -118,15 +116,15 @@ nvinfer1::IExecutionContext *TensorRTEngine::context() { void TensorRTEngine::Execute(int batch_size, std::vector *buffers, cudaStream_t stream) { - freshDeviceId(); + FreshDeviceId(); auto infer_context = context(); - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { void *context_memory{nullptr}; context_memory = inference::Singleton::Global() - .getContextMemory( + .GetContextMemory( predictor_id_per_thread, - phi::GPUPlace(device_id_), + phi::GPUPlace(device_id()), phi::Stream(reinterpret_cast(stream))); infer_context->setDeviceMemory(context_memory); } @@ -182,12 +180,11 @@ bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context, } else { ret = context->enqueueV2(buffers->data(), stream, nullptr); } - SetRuntimeBatch(batch_size); return ret; } void TensorRTEngine::FreezeNetwork() { - freshDeviceId(); + FreshDeviceId(); VLOG(3) << "TRT to freeze network"; PADDLE_ENFORCE_NOT_NULL(infer_builder_, platform::errors::InvalidArgument( @@ -197,17 +194,17 @@ void TensorRTEngine::FreezeNetwork() { platform::errors::InvalidArgument( "Call InitNetwork first to initialize network.")); // build engine. - if (!with_dynamic_shape_) { - infer_builder_->setMaxBatchSize(max_batch_); + if (!with_dynamic_shape()) { + infer_builder_->setMaxBatchSize(params_.max_batch_size); } #if IS_TRT_VERSION_GE(8300) infer_builder_config_->setMemoryPoolLimit( - nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_); + nvinfer1::MemoryPoolType::kWORKSPACE, params_.max_workspace_size); #else - infer_builder_config_->setMaxWorkspaceSize(max_workspace_); + infer_builder_config_->setMaxWorkspaceSize(params_.max_workspace_size); #endif - bool enable_fp16 = (precision_ == phi::DataType::FLOAT16); + bool enable_fp16 = (precision() == phi::DataType::FLOAT16); if (enable_fp16) { bool support_fp16 = infer_builder_->platformHasFastFp16(); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); @@ -219,15 +216,15 @@ void TensorRTEngine::FreezeNetwork() { } } - bool enable_int8 = (precision_ == phi::DataType::INT8); + bool enable_int8 = (precision() == phi::DataType::INT8); if (enable_int8) { - if (!use_dla_) { + if (!use_dla()) { infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); } infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - if (calibrator_) { - infer_builder_config_->setInt8Calibrator(calibrator_); + if (params_.calibrator) { + infer_builder_config_->setInt8Calibrator(params_.calibrator); } else { infer_builder_config_->setInt8Calibrator(nullptr); @@ -259,7 +256,7 @@ void TensorRTEngine::FreezeNetwork() { } } - if (use_dla_) { + if (use_dla()) { if (!enable_int8 && !enable_fp16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " "set float32, so DLA is not used."; @@ -268,42 +265,43 @@ void TensorRTEngine::FreezeNetwork() { << "TensorRT DLA is set by config, but your device does not have " "DLA, so DLA is not used."; } else { - if (dla_core_ < 0 || 
dla_core_ >= infer_builder_->getNbDLACores()) { - dla_core_ = 0; + if (params_.dla_core < 0 || + params_.dla_core >= infer_builder_->getNbDLACores()) { + params_.dla_core = 0; LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " << infer_builder_->getNbDLACores() << ", but got " - << dla_core_ << ", so use use 0 as default."; + << params_.dla_core << ", so use use 0 as default."; } infer_builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - infer_builder_config_->setDLACore(dla_core_); + infer_builder_config_->setDLACore(params_.dla_core); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore " - << dla_core_; + << params_.dla_core; } } - if (with_dynamic_shape_) { + if (with_dynamic_shape()) { LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (int i = 0; i < max_profile_num_; i++) { - for (auto &input : min_input_shape_) { + for (auto &input : min_input_shape()) { #if IS_TRT_VERSION_LT(7100) // trt6/trt7011 will check all_of input > 0 if (!(std::all_of(input.second.begin(), input.second.end(), [](int x) { return x > 0; }) && - std::all_of(max_input_shape_[input.first].begin(), - max_input_shape_[input.first].end(), + std::all_of(max_input_shape()[input.first].begin(), + max_input_shape()[input.first].end(), [](int x) { return x > 0; }) && - std::all_of(optim_input_shape_[input.first].begin(), - optim_input_shape_[input.first].end(), + std::all_of(optim_input_shape()[input.first].begin(), + optim_input_shape()[input.first].end(), [](int x) { return x > 0; }))) { continue; } #endif VLOG(4) << "TRT dynamic_shape set " << input.first << " min: " << Vec2Str(input.second) - << ", max: " << Vec2Str(max_input_shape_[input.first]) - << ", opt: " << Vec2Str(optim_input_shape_[input.first]); + << ", max: " << Vec2Str(max_input_shape()[input.first]) + << ", opt: " << Vec2Str(optim_input_shape()[input.first]); optim_profiles_[i]->setDimensions( input.first.c_str(), @@ -312,38 +310,39 @@ void TensorRTEngine::FreezeNetwork() { optim_profiles_[i]->setDimensions( input.first.c_str(), nvinfer1::OptProfileSelector::kMAX, - Vec2TRT_Dims(max_input_shape_[input.first], input.first, true)); + Vec2TRT_Dims(max_input_shape()[input.first], input.first, true)); optim_profiles_[i]->setDimensions( input.first.c_str(), nvinfer1::OptProfileSelector::kOPT, - Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); + Vec2TRT_Dims(optim_input_shape()[input.first], input.first, true)); } for (int input_id = 0; input_id < network()->getNbInputs(); input_id++) { auto input_name = network()->getInput(input_id)->getName(); if (!itensor_map_.count(input_name)) continue; if (!GetITensor(input_name)->isShapeTensor()) continue; - PADDLE_ENFORCE_EQ(min_shape_tensor_.count(input_name) && - max_shape_tensor_.count(input_name) && - optim_shape_tensor_.count(input_name), + PADDLE_ENFORCE_EQ(min_shape_tensor().count(input_name) > 0 && + max_shape_tensor().count(input_name) > 0 && + optim_shape_tensor().count(input_name) > 0, true, platform::errors::InvalidArgument( "Fail to find min/max/optim shape value for TRT " "network's shape tensor input named %s.", input_name)); - auto min_vec = min_shape_tensor_.at(input_name); + auto min_vec = min_shape_tensor().at(input_name); optim_profiles_[i]->setShapeValues(input_name, nvinfer1::OptProfileSelector::kMIN, min_vec.data(), min_vec.size()); - optim_profiles_[i]->setShapeValues(input_name, - nvinfer1::OptProfileSelector::kMAX, - max_shape_tensor_[input_name].data(), - 
min_vec.size()); + optim_profiles_[i]->setShapeValues( + input_name, + nvinfer1::OptProfileSelector::kMAX, + max_shape_tensor()[input_name].data(), + min_vec.size()); optim_profiles_[i]->setShapeValues( input_name, nvinfer1::OptProfileSelector::kOPT, - optim_shape_tensor_[input_name].data(), + optim_shape_tensor()[input_name].data(), min_vec.size()); } @@ -358,7 +357,7 @@ void TensorRTEngine::FreezeNetwork() { } } #if IS_TRT_VERSION_GE(8200) - if (use_inspector_) { + if (params_.use_inspector) { infer_builder_config_->setProfilingVerbosity( nvinfer1::ProfilingVerbosity::kDETAILED); } @@ -388,12 +387,12 @@ void TensorRTEngine::FreezeNetwork() { cur_profile_num_ = 0; } // for engine context memory sharing - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { inference::Singleton::Global() - .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(), predictor_id_per_thread); } - if (use_inspector_) { + if (params_.use_inspector) { GetEngineInfo(); } } @@ -539,7 +538,7 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( } // In fact , this is not always right, because we can't determine if the 0th // dimension is batch. Just for run chenqu's model - if (!this->with_dynamic_shape()) { + if (!with_dynamic_shape()) { trt_in_shape.nbDims--; for (int i = 0; i < trt_in_shape.nbDims; i++) { trt_in_shape.d[i] = trt_in_shape.d[i + 1]; @@ -563,12 +562,12 @@ std::unordered_map } void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { - freshDeviceId(); + FreshDeviceId(); infer_runtime_.reset(createInferRuntime(&logger_)); - if (use_dla_) { - if (precision_ != phi::DataType::INT8 && - precision_ != phi::DataType::FLOAT16) { + if (use_dla()) { + if (precision() != phi::DataType::INT8 && + precision() != phi::DataType::FLOAT16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " "set float32, so DLA is not used."; } else if (infer_runtime_->getNbDLACores() == 0) { @@ -576,15 +575,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { << "TensorRT DLA is set by config, but your device does not have " "DLA, so DLA is not used."; } else { - if (dla_core_ < 0 || dla_core_ >= infer_runtime_->getNbDLACores()) { - dla_core_ = 0; + if (params_.dla_core < 0 || + params_.dla_core >= infer_runtime_->getNbDLACores()) { + params_.dla_core = 0; LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " << infer_runtime_->getNbDLACores() << ", but got " - << dla_core_ << ", so use use 0 as default."; + << params_.dla_core << ", so use use 0 as default."; } - infer_runtime_->setDLACore(dla_core_); + infer_runtime_->setDLACore(params_.dla_core); LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " - << dla_core_; + << params_.dla_core; } } @@ -602,20 +602,16 @@ void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { binding_num_ = infer_engine_->getNbBindings(); // for engine context memory sharing - if (context_memory_sharing_) { + if (params_.context_memory_sharing) { inference::Singleton::Global() - .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + .UpdateContextMemorySize(infer_engine_->getDeviceMemorySize(), predictor_id_per_thread); } - if (use_inspector_) { + if (params_.use_inspector) { GetEngineInfo(); } } -void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { - runtime_batch_ = batch_size; -} - // Note: Only for support plugin. 
TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( const std::string &name, const phi::DenseTensor &weight_tensor) { @@ -830,8 +826,6 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( return weight; } -int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } - nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, @@ -856,16 +850,16 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( return network()->addPluginV2(inputs, num_inputs, *plugin); } -void TensorRTEngine::freshDeviceId() { +void TensorRTEngine::FreshDeviceId() { int count; cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_id_, + PADDLE_ENFORCE_LT(device_id(), count, platform::errors::OutOfRange( "Device id %d exceeds the current device count: %d.", - device_id_, + device_id(), count)); - platform::SetDeviceId(device_id_); + platform::SetDeviceId(device_id()); } void TensorRTEngine::GetEngineInfo() { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index bd678be2c1c0ededfb286272792436173178b40e..3ad2ca30a4cff99998c0f220f949b2e08d37ab99 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#include - #include #include #include @@ -25,21 +23,21 @@ limitations under the License. */ #include #include #include + +#include #include "NvInferRuntimeCommon.h" -#include "paddle/fluid/framework/lod_tensor.h" + #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/stream.h" -#include "paddle/utils/any.h" PHI_DECLARE_bool(trt_ibuilder_cache); @@ -47,6 +45,12 @@ namespace paddle { namespace inference { namespace tensorrt { +namespace plugin { +class PluginTensorRT; +} // namespace plugin + +class TRTInt8Calibrator; + // The code is mainly from TensorRT, thanks to the project. class TrtCudaGraph { public: @@ -105,140 +109,6 @@ class TrtCudaGraph { cudaGraphExec_t cuda_graph_exec_{}; }; -namespace plugin { -class PluginTensorRT; -} // namespace plugin - -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - case FluidDT::VarType_Type_FP64: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - case FluidDT::VarType_Type_INT64: - return TRT_DT::kINT32; - case FluidDT::VarType_Type_FP16: - return TRT_DT::kHALF; -#if IS_TRT_VERSION_GE(8400) - case FluidDT::VarType_Type_BOOL: - return TRT_DT::kBOOL; - -#endif - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported datatype in TRT op converter, type: %s. " - "Boolean type is supported as TRT input/output " - "using TensorRT v8.4+.", - VarType_Type_Name(type))); - } - return TRT_DT::kINT32; -} - -// The T can be int32 or int64 type. 
-template -nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, - std::string input, - bool with_dynamic_shape = false) { - PADDLE_ENFORCE_GE(shape.size(), - 0UL, - platform::errors::InvalidArgument( - "TensorRT's tensor input requires at least 0 " - "dimensions, but input %s has %d dims.", - input, - shape.size())); - - auto ShapeStr = [](const std::vector& shape) { - std::ostringstream os; - os << "["; - for (size_t i = 0; i < shape.size(); ++i) { - if (i == shape.size() - 1) { - os << shape[i]; - } else { - os << shape[i] << ","; - } - } - os << "]"; - return os.str(); - }; - if (!with_dynamic_shape) { - if (shape.size() == 4UL) { - if (shape[2] == -1 || shape[3] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims3(shape[1], shape[2], shape[3]); - } else if (shape.size() == 5UL) { - if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]); - } else if (shape.size() == 3UL) { - if (shape[1] == -1 || shape[2] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - return nvinfer1::Dims2(shape[1], shape[2]); - } else if (shape.size() == 2UL) { - if (shape[1] == -1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s, please enable " - "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", - input, - ShapeStr(shape))); - } - nvinfer1::Dims dims; - dims.nbDims = 1; - dims.d[0] = shape[1]; - return dims; - } - // static shape doesn't support 1D op so far. - PADDLE_ENFORCE_NE(shape.size(), - 1UL, - platform::errors::InvalidArgument( - "The input [%s] shape of trt subgraph is %s." - "it's not supported by trt so far", - input, - ShapeStr(shape))); - - nvinfer1::Dims dims; - dims.nbDims = shape.size() - 1; - for (size_t i = 1; i < shape.size(); i++) { - dims.d[i - 1] = shape[i]; - } - return dims; - } else { - if (shape.size() == 4UL) { - return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); - } else if (shape.size() == 3UL) { - return nvinfer1::Dims3(shape[0], shape[1], shape[2]); - } - nvinfer1::Dims dims; - dims.nbDims = shape.size(); - for (size_t i = 0; i < shape.size(); i++) { - dims.d[i] = shape[i]; - } - return dims; - } -} -} // namespace - -class TRTInt8Calibrator; - /* * TensorRT Engine. * @@ -251,6 +121,54 @@ class TensorRTEngine { using PredictorID = int; public: + /* + * Construction parameters of TensorRTEngine. + */ + struct ConstructionParams { + // The max batch size. + int32_t max_batch_size; + + // The max memory size the engine uses. + int64_t max_workspace_size; + + // The precision of engine. + phi::DataType precision{phi::DataType::FLOAT32}; + + TRTInt8Calibrator* calibrator{nullptr}; + + // Use for engine context memory sharing. 
+ bool context_memory_sharing{false}; + + int device_id{0}; + + bool with_dynamic_shape{false}; + + bool use_dla{false}; + int dla_core{0}; + + ShapeMapType min_input_shape; + ShapeMapType max_input_shape; + ShapeMapType optim_input_shape; + ShapeMapType min_shape_tensor; + ShapeMapType max_shape_tensor; + ShapeMapType optim_shape_tensor; + + bool use_inspector{false}; + + // + // From tensorrt_subgraph_pass, only used for OpConverter. + // + bool use_varseqlen{false}; + bool with_interleaved{false}; + std::string tensorrt_transformer_posid; + std::string tensorrt_transformer_maskid; + bool enable_low_precision_io{false}; + // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will + // not run fp16. When running fp16, the output accuracy of the model will be + // affected, closing the plugin fp16 may bring some improvement on accuracy. + bool disable_trt_plugin_fp16{false}; + }; + // Weight is model parameter. class Weight { public: @@ -276,47 +194,14 @@ class TensorRTEngine { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, - int64_t max_workspace, - phi::DataType precision = phi::DataType::FLOAT32, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - bool with_dynamic_shape = false, - const ShapeMapType& min_input_shape = {}, - const ShapeMapType& max_input_shape = {}, - const ShapeMapType& optim_input_shape = {}, - const ShapeMapType& min_shape_tensor = {}, - const ShapeMapType& max_shape_tensor = {}, - const ShapeMapType& optim_shape_tensor = {}, - bool disable_trt_plugin_fp16 = false, - phi::DataType model_precision = phi::DataType::FLOAT32, + TensorRTEngine(const ConstructionParams& params, nvinfer1::ILogger& logger = NaiveLogger::Global()) - : max_batch_(max_batch), - max_workspace_(max_workspace), - precision_(precision), - calibrator_(calibrator), - device_id_(device_id), - with_dynamic_shape_(with_dynamic_shape), - min_input_shape_(min_input_shape), - max_input_shape_(max_input_shape), - optim_input_shape_(optim_input_shape), - min_shape_tensor_(min_shape_tensor), - max_shape_tensor_(max_shape_tensor), - optim_shape_tensor_(optim_shape_tensor), - disable_trt_plugin_fp16_(disable_trt_plugin_fp16), - model_precision_(model_precision), - logger_(logger) { - dy::initLibNvInferPlugins(&logger, ""); - } - - ~TensorRTEngine() { - for (auto& attr : attrs_) { - if (attr_dels_.find(attr.first) != attr_dels_.end()) { - attr_dels_[attr.first](); - } - } - attrs_.clear(); - attr_dels_.clear(); + : params_(params), logger_(logger) { + dy::initLibNvInferPlugins(&logger_, ""); + static std::once_flag trt_plugin_registered; + std::call_once(trt_plugin_registered, []() { + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + }); } // Add an input and set its name, data type and dimension. 
@@ -346,15 +231,6 @@ class TensorRTEngine { nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::IExecutionContext* context(); - int GetProfileIndex() { - if (max_profile_num_ > 1) { - std::unique_lock lock(mutex_); - return profile_index_[predictor_id_per_thread]; - } else { - return 0; - } - } - int GetBindingsOffset() { return (binding_num_ / max_profile_num_) * GetProfileIndex(); } @@ -390,25 +266,20 @@ class TensorRTEngine { void Deserialize(const std::string& engine_serialized_data); - void SetRuntimeBatch(size_t batch_size); - int GetRuntimeBatch(); - bool WithFp16() { - bool enable_fp16 = (precision_ == phi::DataType::FLOAT16); + bool enable_fp16 = (precision() == phi::DataType::FLOAT16); bool support_fp16 = infer_builder_->platformHasFastFp16(); // below is consistent with setFlag in engine.cc - bool fall_back_fp16 = WithInt8() && !use_dla_; + bool fall_back_fp16 = WithInt8() && !use_dla(); return (enable_fp16 || fall_back_fp16) && support_fp16; } bool WithInt8() { - bool enable_int8 = (precision_ == phi::DataType::INT8); + bool enable_int8 = (precision() == phi::DataType::INT8); bool support_int8 = infer_builder_->platformHasFastInt8(); return enable_int8 && support_int8; } - int GetDeviceId() { return device_id_; } - nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); @@ -471,19 +342,6 @@ class TensorRTEngine { suffix_counter += 1; } - void SetUseOSS(bool use_varseqlen) { use_varseqlen_ = use_varseqlen; } - void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } - void SetDLACore(int dla_core) { dla_core_ = dla_core; } - void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } - void SetWithInterleaved(bool with_interleaved) { - with_interleaved_ = with_interleaved; - } - void SetTransformerPosid(std::string tensorrt_transformer_posid) { - tensorrt_transformer_posid_ = tensorrt_transformer_posid; - } - void SetTransformerMaskid(std::string tensorrt_transformer_maskid) { - tensorrt_transformer_maskid_ = tensorrt_transformer_maskid; - } void ClearWeights() { for (auto& weight_pair : weight_map) { weight_pair.second.reset(nullptr); @@ -508,12 +366,12 @@ class TensorRTEngine { nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } - ShapeMapType& min_input_shape() { return min_input_shape_; } - ShapeMapType& max_input_shape() { return max_input_shape_; } - ShapeMapType& optim_input_shape() { return optim_input_shape_; } - ShapeMapType& min_shape_tensor() { return min_shape_tensor_; } - ShapeMapType& max_shape_tensor() { return max_shape_tensor_; } - ShapeMapType& optim_shape_tensor() { return optim_shape_tensor_; } + ShapeMapType& min_input_shape() { return params_.min_input_shape; } + ShapeMapType& max_input_shape() { return params_.max_input_shape; } + ShapeMapType& optim_input_shape() { return params_.optim_input_shape; } + ShapeMapType& min_shape_tensor() { return params_.min_shape_tensor; } + ShapeMapType& max_shape_tensor() { return params_.max_shape_tensor; } + ShapeMapType& optim_shape_tensor() { return params_.optim_shape_tensor; } bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape, const ShapeMapType& runtime_shape_tensor, @@ -526,22 +384,22 @@ class TensorRTEngine { auto name = it.first; auto input_shape = it.second; // Make 0-D tensor to 1-D tensor. 
- if (input_shape.size() == 0) { + if (input_shape.empty()) { input_shape.push_back(1); } bool min_change = false; bool max_change = false; std::vector bak_min_shape; std::vector bak_max_shape; - if (!min_input_shape_.count(name)) { - min_input_shape_[name] = input_shape; - max_input_shape_[name] = input_shape; - optim_input_shape_[name] = input_shape; + if (!params_.min_input_shape.count(name)) { + params_.min_input_shape[name] = input_shape; + params_.max_input_shape[name] = input_shape; + params_.optim_input_shape[name] = input_shape; min_change = true; max_change = true; ret = true; } else { - PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), + PADDLE_ENFORCE_EQ(params_.min_input_shape[name].size(), input_shape.size(), platform::errors::InvalidArgument( "TRT dynamic_shape min_input_shape %s size not " @@ -549,33 +407,33 @@ class TensorRTEngine { ", but the runtime_input_shape[%s].size()=%d.", name, name, - min_input_shape_[name].size(), + params_.min_input_shape[name].size(), name, input_shape.size())); - bak_min_shape = min_input_shape_[name]; - bak_max_shape = max_input_shape_[name]; + bak_min_shape = params_.min_input_shape[name]; + bak_max_shape = params_.max_input_shape[name]; for (size_t d = 0; d < input_shape.size(); ++d) { - if (input_shape[d] < min_input_shape_[name][d]) { + if (input_shape[d] < params_.min_input_shape[name][d]) { ret = true; min_change = true; - min_input_shape_[name][d] = input_shape[d]; + params_.min_input_shape[name][d] = input_shape[d]; } - if (input_shape[d] > max_input_shape_[name][d]) { + if (input_shape[d] > params_.max_input_shape[name][d]) { ret = true; max_change = true; - max_input_shape_[name][d] = input_shape[d]; + params_.max_input_shape[name][d] = input_shape[d]; } } } if (min_change) LOG(INFO) << "refactor tensor shape range: " << name << ", min_shape from " << Vec2Str(bak_min_shape) << " to " - << Vec2Str(min_input_shape_[name]); + << Vec2Str(params_.min_input_shape[name]); if (max_change) LOG(INFO) << "refactor tensor shape range: " << name << ", max_shape from " << Vec2Str(bak_max_shape) << " to " - << Vec2Str(max_input_shape_[name]); + << Vec2Str(params_.max_input_shape[name]); if (min_change || max_change) changed->push_back(name); } for (const auto& it : runtime_shape_tensor) { @@ -585,15 +443,15 @@ class TensorRTEngine { bool max_change = false; std::vector bak_min_shape; std::vector bak_max_shape; - if (!min_shape_tensor_.count(name)) { - min_shape_tensor_[name] = shape_tensor; - max_shape_tensor_[name] = shape_tensor; - optim_shape_tensor_[name] = shape_tensor; + if (!params_.min_shape_tensor.count(name)) { + params_.min_shape_tensor[name] = shape_tensor; + params_.max_shape_tensor[name] = shape_tensor; + params_.optim_shape_tensor[name] = shape_tensor; min_change = true; max_change = true; ret = true; } else { - PADDLE_ENFORCE_EQ(min_shape_tensor_[name].size(), + PADDLE_ENFORCE_EQ(params_.min_shape_tensor[name].size(), shape_tensor.size(), platform::errors::InvalidArgument( "TRT dynamic_shape min_shape_tensor %s size not " @@ -601,50 +459,50 @@ class TensorRTEngine { ", but the runtime_shape_tensor[%s].size()=%d.", name, name, - min_shape_tensor_[name].size(), + params_.min_shape_tensor[name].size(), name, shape_tensor.size())); - bak_min_shape = min_shape_tensor_[name]; - bak_max_shape = max_shape_tensor_[name]; + bak_min_shape = params_.min_shape_tensor[name]; + bak_max_shape = params_.max_shape_tensor[name]; for (size_t d = 0; d < shape_tensor.size(); ++d) { - if (shape_tensor[d] < min_shape_tensor_[name][d]) { + if 
(shape_tensor[d] < params_.min_shape_tensor[name][d]) { ret = true; min_change = true; - min_shape_tensor_[name][d] = shape_tensor[d]; + params_.min_shape_tensor[name][d] = shape_tensor[d]; } - if (shape_tensor[d] > max_shape_tensor_[name][d]) { + if (shape_tensor[d] > params_.max_shape_tensor[name][d]) { ret = true; max_change = true; - max_shape_tensor_[name][d] = shape_tensor[d]; + params_.max_shape_tensor[name][d] = shape_tensor[d]; } } } if (min_change) LOG(INFO) << "refactor shape tensor range: " << name << ", min_shape from " << Vec2Str(bak_min_shape) << " to " - << Vec2Str(min_shape_tensor_[name]); + << Vec2Str(params_.min_shape_tensor[name]); if (max_change) LOG(INFO) << "refactor shape tensor range: " << name << ", max_shape from " << Vec2Str(bak_max_shape) << " to " - << Vec2Str(max_shape_tensor_[name]); + << Vec2Str(params_.max_shape_tensor[name]); if (min_change || max_change) tensor_changed->push_back(name); } return ret; } - bool use_varseqlen() { return use_varseqlen_; } - bool with_ernie() { return with_ernie_; } - bool with_interleaved() { return with_interleaved_; } - std::string tensorrt_transformer_posid() { - return tensorrt_transformer_posid_; + bool use_varseqlen() { return params_.use_varseqlen; } + bool use_dla() { return params_.use_dla; } + bool with_interleaved() { return params_.with_interleaved; } + const std::string& tensorrt_transformer_posid() { + return params_.tensorrt_transformer_posid; } - std::string tensorrt_transformer_maskid() { - return tensorrt_transformer_maskid_; + const std::string& tensorrt_transformer_maskid() { + return params_.tensorrt_transformer_maskid; } - bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } - bool with_dynamic_shape() { return with_dynamic_shape_; } - phi::DataType precision() { return precision_; } + bool disable_trt_plugin_fp16() { return params_.disable_trt_plugin_fp16; } + bool with_dynamic_shape() { return params_.with_dynamic_shape; } + phi::DataType precision() { return params_.precision; } #if IS_TRT_VERSION_GE(6000) nvinfer1::IPluginV2Layer* AddDynamicPlugin( @@ -656,153 +514,50 @@ class TensorRTEngine { } #endif - bool Has(const std::string& attr_name) const { - return attrs_.count(attr_name) > 0; - } - - void Erase(const std::string& attr_name) { - if (!Has(attr_name)) { - return; - } - if (attr_dels_.find(attr_name) != attr_dels_.end()) { - attr_dels_[attr_name](); - attr_dels_.erase(attr_name); - } - attrs_.erase(attr_name); - } - - // Set a pointer to the attribute. Engine takes ownership of the attribute. - template - void Set(const std::string& attr_name, AttrType* attr) { - if (attrs_.count(attr_name) == 0) { - PADDLE_ENFORCE_EQ( - attrs_.count(attr_name), - 0, - platform::errors::AlreadyExists( - "Attribute %s already set in trt engine.", attr_name)); - } else { - VLOG(3) << "Setting the attribute " << attr_name << " for trt engine " - << this; - } - attrs_[attr_name] = attr; - attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(3) << "deleting " << attr_name; - delete attr; - }; - } - - // Set a pointer to the attribute. Engine doesn't take ownership. Caller - // should delete the attribute. - template - void SetNotOwned(const std::string& attr_name, AttrType* attr) { - PADDLE_ENFORCE_EQ( - attrs_.count(attr_name), - 0, - platform::errors::AlreadyExists( - "Attribute %s already set in trt engine.", attr_name)); - attrs_[attr_name] = attr; - } - - // Get a reference to the attributed previously set. 
- template - AttrType& Get(const std::string& attr_name) const { - PADDLE_ENFORCE_NE(attrs_.find(attr_name), - attrs_.end(), - platform::errors::InvalidArgument( - "Attribute %s not found in trt engine.", attr_name)); - try { - return *paddle::any_cast(attrs_.at(attr_name)); - } catch (paddle::bad_any_cast&) { - auto TypeToString = [](const std::type_info& info) -> std::string { - if (std::type_index(info) == std::type_index(typeid(bool*))) { - return "bool"; - } else if (std::type_index(info) == std::type_index(typeid(int*))) { - return "int"; - } else if (std::type_index(info) == - std::type_index(typeid(const int*))) { - return "const int"; - } else if (std::type_index(info) == - std::type_index(typeid(std::string*))) { - return "std::string"; - } - return info.name(); - }; - - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid type for attritube %s, expected: %s, actual: %s.", - attr_name, - TypeToString(typeid(AttrType*)), - TypeToString(attrs_.at(attr_name).type()))); - } - } - void SetProfileNum(int num) { max_profile_num_ = num; } - void GetEngineInfo(); - - void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; } - void SetScope(const framework::Scope& scope) { scope_ = &scope; } - - void SetContextMemorySharing(bool context_memory_sharing) { - context_memory_sharing_ = context_memory_sharing; - } - - void SetLowPrecisionIO(bool low_precision_io) { - low_precision_io_ = low_precision_io; - } - - bool EnableLowPrecisionIO() const { return low_precision_io_; } + void SetScope(const framework::Scope* scope) { scope_ = scope; } void SetAllNodesLowerToTrt(bool all_nodes_offload_to_trt) { // all nodes are in trt, so we can use cudaGraph to optimize runtime. startup_with_cudagraph_ = all_nodes_offload_to_trt; } + bool LowPrecisionIOEnabled() const { return params_.enable_low_precision_io; } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling - // freshDeviceId(). - void freshDeviceId(); - // Used for convert weight into Itensor - const framework::Scope* scope_; + // FreshDeviceId(). + void FreshDeviceId(); - // the max batch size - int max_batch_; - // the runtime batch size - static int runtime_batch_; - // the max memory size the engine uses - int64_t max_workspace_; + void GetEngineInfo(); - phi::DataType precision_; - TRTInt8Calibrator* calibrator_; - // batch size of the current data, will be updated each Executation. - int batch_size_{-1}; + int device_id() { return params_.device_id; } - // use for engine context memory sharing - bool context_memory_sharing_{false}; + int GetProfileIndex() { + if (max_profile_num_ > 1) { + std::unique_lock lock(mutex_); + return profile_index_[predictor_id_per_thread]; + } else { + return 0; + } + } - bool low_precision_io_{false}; + private: + // + // Construction parameters. + // + ConstructionParams params_; + + // + // The following are runtime parameters. 
+ // - int device_id_; int max_profile_num_{1}; int cur_profile_num_{0}; std::unordered_map profile_index_; - bool with_dynamic_shape_{false}; - ShapeMapType min_input_shape_; - ShapeMapType max_input_shape_; - ShapeMapType optim_input_shape_; - ShapeMapType min_shape_tensor_; - ShapeMapType max_shape_tensor_; - ShapeMapType optim_shape_tensor_; - bool disable_trt_plugin_fp16_{false}; - phi::DataType model_precision_{phi::DataType::FLOAT32}; - bool use_varseqlen_{false}; - bool use_dla_{false}; - int dla_core_{0}; - bool with_ernie_{false}; - bool with_interleaved_{false}; - std::string tensorrt_transformer_posid_; - std::string tensorrt_transformer_maskid_; + nvinfer1::ILogger& logger_; // max data size for the buffers. @@ -828,8 +583,9 @@ class TensorRTEngine { bool cudagraph_inited_{false}; bool startup_with_cudagraph_{false}; - std::unordered_map attrs_; - std::unordered_map> attr_dels_; + // Used for convert weight into Itensor + const framework::Scope* scope_{nullptr}; + #if IS_TRT_VERSION_GE(6000) int binding_num_; infer_ptr infer_builder_config_; @@ -837,7 +593,6 @@ class TensorRTEngine { std::vector> owned_pluginv2_; #endif std::mutex mutex_; - bool use_inspector_; public: thread_local static int predictor_id_per_thread; @@ -872,7 +627,7 @@ class TRTEngineManager { bool Empty() const { std::lock_guard lock(mutex_); - return engines_.size() == 0; + return engines_.empty(); } bool Has(const std::string& name) const { @@ -886,41 +641,13 @@ class TRTEngineManager { return engines_.at(name).get(); } - TensorRTEngine* Create( - std::string name, - int max_batch, - int64_t max_workspace, - phi::DataType precision = phi::DataType::FLOAT32, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - bool with_dynamic_shape = false, - const std::map> min_input_shape = {}, - const std::map> max_input_shape = {}, - const std::map> optim_input_shape = {}, - const std::map> min_shape_tensor = {}, - const std::map> max_shape_tensor = {}, - const std::map> optim_shape_tensor = {}, - bool disable_trt_plugin_fp16 = false, - phi::DataType model_precision = phi::DataType::FLOAT32, - nvinfer1::ILogger& logger = NaiveLogger::Global()) { - auto* p = new TensorRTEngine(max_batch, - max_workspace, - precision, - calibrator, - device_id, - with_dynamic_shape, - min_input_shape, - max_input_shape, - optim_input_shape, - min_shape_tensor, - max_shape_tensor, - optim_shape_tensor, - disable_trt_plugin_fp16, - model_precision, - logger); + TensorRTEngine* Create(const std::string& name, + const TensorRTEngine::ConstructionParams& params, + nvinfer1::ILogger& logger = NaiveLogger::Global()) { + auto engine = std::make_unique(params, logger); std::lock_guard lock(mutex_); - engines_[name].reset(p); - return p; + engines_[name].reset(engine.release()); + return engines_[name].get(); } void DeleteAll() { @@ -940,7 +667,7 @@ class TRTEngineManager { } } - void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) { + void UpdateContextMemorySize(size_t mem_size, PredictorID predictor_id) { VLOG(3) << "TensorRT engine context memory size is " << mem_size / 1024.0 / 1024.0 << "MiB in predictor id " << predictor_id; @@ -955,24 +682,24 @@ class TRTEngineManager { } if (size_updated) { - releaseContextMemory(predictor_id); + ReleaseContextMemory(predictor_id); } } - void* getContextMemory(PredictorID predictor_id, + void* GetContextMemory(PredictorID predictor_id, const phi::GPUPlace& place, const phi::Stream& stream) { std::lock_guard lock(mutex_); - static auto alignment = getAlignmentSize(place); + 
static auto alignment = GetAlignmentSize(place); if (context_memorys_.count(predictor_id) == 0) { auto context_memory = memory::Alloc(place, max_ctx_mem_size_ + alignment, stream); context_memorys_[predictor_id] = std::move(context_memory); } - return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); + return GetAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); } - void releaseContextMemory(PredictorID predictor_id) { + void ReleaseContextMemory(PredictorID predictor_id) { std::lock_guard lock(mutex_); if (context_memorys_.count(predictor_id)) { context_memorys_[predictor_id].reset(nullptr); @@ -981,12 +708,12 @@ class TRTEngineManager { } private: - size_t getAlignmentSize(const phi::GPUPlace& place) { + size_t GetAlignmentSize(const phi::GPUPlace& place) { const auto& prop = platform::GetDeviceProperties(place.GetDeviceId()); return prop.textureAlignment; } - void* getAlignedMemory(void* addr, size_t alignment) { + void* GetAlignedMemory(void* addr, size_t alignment) { return reinterpret_cast(uintptr_t(addr) & (~(alignment - 1))); } diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index e744b261cbcfada9d4321ba4c412f8938be190f9..3f0152b2909c0826d3c41daddd5af96ab1fd3b95 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -148,7 +148,7 @@ class NaiveProfiler : public nvinfer1::IProfiler { typedef std::pair Record; std::vector mProfile; - virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT { + void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT override { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r) { return r.first == layerName; @@ -235,6 +235,130 @@ static inline nvinfer1::DataType PhiType2NvType(phi::DataType type) { return nv_type; } +using FluidDT = paddle::framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; +static TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + case FluidDT::VarType_Type_FP64: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + case FluidDT::VarType_Type_INT64: + return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; +#if IS_TRT_VERSION_GE(8400) + case FluidDT::VarType_Type_BOOL: + return TRT_DT::kBOOL; + +#endif + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "unsupported datatype in TRT op converter, type: %s. " + "Boolean type is supported as TRT input/output " + "using TensorRT v8.4+.", + VarType_Type_Name(type))); + } + return TRT_DT::kINT32; +} + +// The T can be int32 or int64 type. 
+template +static nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, + std::string input, + bool with_dynamic_shape = false) { + PADDLE_ENFORCE_GE(shape.size(), + 0UL, + paddle::platform::errors::InvalidArgument( + "TensorRT's tensor input requires at least 0 " + "dimensions, but input %s has %d dims.", + input, + shape.size())); + + auto ShapeStr = [](const std::vector& shape) { + std::ostringstream os; + os << "["; + for (size_t i = 0; i < shape.size(); ++i) { + if (i == shape.size() - 1) { + os << shape[i]; + } else { + os << shape[i] << ","; + } + } + os << "]"; + return os.str(); + }; + if (!with_dynamic_shape) { + if (shape.size() == 4UL) { + if (shape[2] == -1 || shape[3] == -1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims3(shape[1], shape[2], shape[3]); + } else if (shape.size() == 5UL) { + if (shape[2] == -1 || shape[3] == -1 || shape[4] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]); + } else if (shape.size() == 3UL) { + if (shape[1] == -1 || shape[2] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + return nvinfer1::Dims2(shape[1], shape[2]); + } else if (shape.size() == 2UL) { + if (shape[1] == -1) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, + ShapeStr(shape))); + } + nvinfer1::Dims dims; + dims.nbDims = 1; + dims.d[0] = shape[1]; + return dims; + } + // static shape doesn't support 1D op so far. + PADDLE_ENFORCE_NE(shape.size(), + 1UL, + paddle::platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s." + "it's not supported by trt so far", + input, + ShapeStr(shape))); + + nvinfer1::Dims dims; + dims.nbDims = shape.size() - 1; + for (size_t i = 1; i < shape.size(); i++) { + dims.d[i - 1] = shape[i]; + } + return dims; + } else { + if (shape.size() == 4UL) { + return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); + } else if (shape.size() == 3UL) { + return nvinfer1::Dims3(shape[0], shape[1], shape[2]); + } + nvinfer1::Dims dims; + dims.nbDims = shape.size(); + for (size_t i = 0; i < shape.size(); i++) { + dims.d[i] = shape[i]; + } + return dims; + } +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index fcc4f2bfcf7e245dccb6883e7dd834e6459271ea..8d2eb4cb4919b4372be3787fcddb05524afe7ac9 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/layout.h" #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif @@ -35,7 +36,7 @@ namespace tensorrt { class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -65,29 +66,21 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { {"shape", {18, 8, 4}}}; std::map> optim_input_value = { {"shape", {18, 8, 4}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - min_input_value, - max_input_value, - optim_input_value, - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; + params.min_shape_tensor = min_input_value; + params.max_shape_tensor = max_input_value; + params.optim_shape_tensor = optim_input_value; + + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -106,8 +99,8 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { phi::DenseTensor input_; phi::DenseTensor shape_; phi::DenseTensor output_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { @@ -167,7 +160,7 @@ TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -192,29 +185,18 @@ class TensorRTDynamicEngineTest : public ::testing::Test { std::map> optim_input_shape = { {"input", {16, 32, 1, 1}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT16, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.with_dynamic_shape = true; + params.precision = phi::DataType::FLOAT16; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + 
engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -230,8 +212,8 @@ class TensorRTDynamicEngineTest : public ::testing::Test { protected: phi::DenseTensor input_; phi::DenseTensor output_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicEngineTest, test_spmm) { @@ -336,7 +318,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -370,29 +352,18 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { {"mask", {4, 1, 4, 4}}, {"new_mask", {4, 1, 2, 2}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT32, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::FLOAT32; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector> inputs, @@ -419,13 +390,12 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: std::vector inputs_; std::vector outputs_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { #if IS_TRT_VERSION_GE(8000) - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); auto *attn = engine_->DeclareInput( "attn", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 4}); auto *x = engine_->DeclareInput( @@ -545,7 +515,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -579,29 +549,17 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { {"mask", {4, 1, 4, 4}}, {"new_mask", {4, 1, 2, 2}}}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::FLOAT16, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - std::map>(), - std::map>(), - std::map>(), - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::FLOAT16; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override 
{ - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector> inputs, @@ -628,13 +586,12 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { protected: std::vector inputs_; std::vector outputs_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { #if IS_TRT_VERSION_GE(8000) - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); auto *attn = engine_->DeclareInput( "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims2{-1, 4}); auto *x = engine_->DeclareInput( @@ -754,7 +711,7 @@ TEST_F(TensorRTDynamicTestFusedTokenPruneHalf, test_fused_token_prune) { class TensorRTDynamicShapeGNTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_ = std::make_unique(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -782,29 +739,18 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { std::map> max_input_value = {}; std::map> optim_input_value = {}; - engine_ = new TensorRTEngine(16, - 1 << 10, - phi::DataType::INT8, - nullptr, - 0, - true, - min_input_shape, - max_input_shape, - optim_input_shape, - min_input_value, - max_input_value, - optim_input_value, - false, - phi::DataType::FLOAT32, - NaiveLogger::Global()); - engine_->InitNetwork(); - } + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 16; + params.max_workspace_size = 1 << 10; + params.precision = phi::DataType::INT8; + params.with_dynamic_shape = true; + params.min_input_shape = min_input_shape; + params.max_input_shape = max_input_shape; + params.optim_input_shape = optim_input_shape; - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } + engine_ = std::make_unique(params, NaiveLogger::Global()); + + engine_->InitNetwork(); } void PrepareInputOutput(const std::vector &input, @@ -923,8 +869,8 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { protected: phi::DenseTensor x_; phi::DenseTensor y_; - TensorRTEngine *engine_; - phi::GPUContext *ctx_; + std::unique_ptr engine_; + std::unique_ptr ctx_; // case from SD int n_ = 2; int c_ = 320; @@ -942,8 +888,6 @@ class TensorRTDynamicShapeGNTest : public ::testing::Test { /* TEST_F(TensorRTDynamicShapeGNTest, test_trt_dynamic_shape_groupnorm) { - tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); - float *bias = new float[c_]; float *scale = new float[c_]; for (int i = 0; i < c_; i++) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 23a0df7316724a9ab6eece77a73a42e1199839d4..256af16fb155fc57313c7f5a84c9800bab09ec93 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -48,17 +49,13 @@ class TensorRTEngineTest : public ::testing::Test { .get()); ctx_->PartialInitWithAllocator(); - engine_ = new TensorRTEngine(10, 1 << 10); + TensorRTEngine::ConstructionParams params; + params.max_batch_size = 10; + params.max_workspace_size = 1 << 10; + engine_ = std::make_unique(params); engine_->InitNetwork(); } - void TearDown() override { - if (engine_) { - delete engine_; - engine_ = nullptr; - } - } - void PrepareInputOutput(const std::vector &input, std::vector output_shape) { paddle::framework::TensorFromVector(input, *ctx_, &input_); @@ -72,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { protected: phi::DenseTensor input_; phi::DenseTensor output_; - TensorRTEngine *engine_; + std::unique_ptr engine_; phi::GPUContext *ctx_; }; @@ -111,15 +108,6 @@ TEST_F(TensorRTEngineTest, add_layer) { buffers[0] = reinterpret_cast(x_v_gpu_data); buffers[1] = reinterpret_cast(y_gpu_data); - LOG(INFO) << "Set attr"; - engine_->Set("test_attr", new std::string("test_attr")); - if (engine_->Has("test_attr")) { - auto attr_val = engine_->Get("test_attr"); - engine_->Erase("test_attr"); - } - std::string *attr_key = new std::string("attr_key"); - engine_->SetNotOwned("attr1", attr_key); - LOG(INFO) << "to execute"; engine_->Execute(1, &buffers, ctx_->stream()); @@ -128,8 +116,6 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); - - delete attr_key; } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 0c80316f0003c7d5f39a68fad530587f5781c777..82bb7a64168b4baa4efd73c4e486b78a258f6c83 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -43,7 +43,7 @@ class TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { const platform::Place place); explicit TRTInt8Calibrator(const std::string& calibration_data); - ~TRTInt8Calibrator(); + ~TRTInt8Calibrator() override; int getBatchSize() const TRT_NOEXCEPT override; @@ -91,7 +91,7 @@ class TRTCalibratorEngine { */ class TRTCalibratorEngineManager { public: - bool Has() const { return res_.size() > 0; } + bool Has() const { return !res_.empty(); } bool Has(const std::string& name) const { if (res_.count(name) == 0) return false; return res_.at(name).get() != nullptr; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 395ec613db8f4af83665bd7c70510506e48f28d6..3ebf16410fe6cd9d929801d422814b5f5c0f50a8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -14,30 +14,34 @@ #pragma once -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/data_type_transform.h" #ifdef PADDLE_WITH_CUDA +#include #include #include #include #include #include #include -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include 
"paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/utils/string/string_helper.h" namespace paddle { @@ -171,10 +175,6 @@ class TensorRTEngineOp : public framework::OperatorBase { std::string model_opt_cache_dir_; bool use_static_engine_; phi::DataType precision_mode_; - std::map> min_input_shape_{}; - std::map> max_input_shape_{}; - std::map> opt_input_shape_{}; - phi::DataType model_precision_{phi::DataType::FLOAT32}; public: TensorRTEngineOp(const std::string &type, @@ -185,7 +185,7 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); - device_id_ = Attr("gpu_id"); + device_id_ = Attr("gpu_device_id"); enable_int8_ = Attr("enable_int8"); enable_fp16_ = Attr("enable_fp16"); use_calib_mode_ = Attr("use_calib_mode"); @@ -200,43 +200,6 @@ class TensorRTEngineOp : public framework::OperatorBase { if (use_static_engine_) { model_opt_cache_dir_ = Attr("model_opt_cache_dir"); } - model_precision_ = static_cast(Attr("model_precision")); - - if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") && - HasAttr("max_input_shape") && HasAttr("opt_input_shape")) { - std::vector dynamic_shape_names; - std::vector> min_input_shapes; - std::vector> max_input_shapes; - std::vector> opt_input_shapes; - std::vector dynamic_shape_lens; - dynamic_shape_names = - Attr>("dynamic_shape_names"); - std::vector min_shapes = Attr>("min_input_shape"); - std::vector max_shapes = Attr>("max_input_shape"); - std::vector opt_shapes = Attr>("opt_input_shape"); - dynamic_shape_lens = Attr>("dynamic_shape_lens"); - int idx = 0; - for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) { - std::vector tmp1, tmp2, tmp3; - for (int j = 0; j < dynamic_shape_lens[i]; ++j) { - tmp1.push_back(min_shapes[idx]); - tmp2.push_back(max_shapes[idx]); - tmp3.push_back(opt_shapes[idx++]); - } - min_input_shapes.emplace_back(tmp1); - max_input_shapes.emplace_back(tmp2); - opt_input_shapes.emplace_back(tmp3); - } - - for (size_t i = 0; i < dynamic_shape_names.size(); ++i) { - min_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], min_input_shapes[i])); - max_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], max_input_shapes[i])); - opt_input_shape_.insert( - std::make_pair(dynamic_shape_names[i], opt_input_shapes[i])); - } - } auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -249,11 +212,11 @@ class TensorRTEngineOp : public framework::OperatorBase { // calibration_mode is true represents we need to // generate the calibration table data. 
calibration_mode_ = - (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_); + (enable_int8_ && calibration_data_.empty() && use_calib_mode_); VLOG(4) << "calibration_mode: " << calibration_mode_; - if (enable_int8_ && calibration_data_.size()) { - calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + if (enable_int8_ && !calibration_data_.empty()) { + calibrator_ = std::make_unique(calibration_data_); } bool has_engine = inference::Singleton::Global() @@ -486,36 +449,29 @@ class TensorRTEngineOp : public framework::OperatorBase { auto t_shape = phi::vectorize(t.dims()); runtime_batch = t_shape[0]; } - calib_res->calib_.reset(new TRTInt8Calibrator( - calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); + calib_res->calib_ = std::make_unique( + calib_buffers, runtime_batch, calibration_engine_key_, dev_place); calib_res->thr_.reset(new std::thread([&]() { - std::map> min_input_shape; - std::map> max_input_shape; - std::map> opt_input_shape; - std::map> min_shape_tensor; - std::map> max_shape_tensor; - std::map> opt_shape_tensor; - if (shape_range_info_path_.size()) + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size_; + params.max_workspace_size = workspace_size_; + params.precision = precision_mode_; + params.calibrator = calib_res->calib_.get(); + params.device_id = dev_place.device; + params.with_dynamic_shape = with_dynamic_shape_; + if (!shape_range_info_path_.empty()) { inference::DeserializeShapeRangeInfo(shape_range_info_path_, - &min_input_shape, - &max_input_shape, - &opt_input_shape, - &min_shape_tensor, - &max_shape_tensor, - &opt_shape_tensor); - - calib_res->engine_.reset(new TensorRTEngine(max_batch_size_, - workspace_size_, - precision_mode_, - calib_res->calib_.get(), - dev_place.device, - with_dynamic_shape_, - min_input_shape, - max_input_shape, - opt_input_shape, - min_shape_tensor, - max_shape_tensor, - opt_shape_tensor)); + ¶ms.min_input_shape, + ¶ms.max_input_shape, + ¶ms.optim_input_shape, + ¶ms.min_shape_tensor, + ¶ms.max_shape_tensor, + ¶ms.optim_shape_tensor); + } + params.context_memory_sharing = Attr("context_memory_sharing"); + params.enable_low_precision_io = Attr("enable_low_precision_io"); + calib_res->engine_ = std::make_unique(params); + VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); @@ -597,7 +553,7 @@ class TensorRTEngineOp : public framework::OperatorBase { // This must be a zero dimension tensor. // At present, we convert it to a 1D tensor to feed them into Trt. 
- if (t_shape.size() == 0) { + if (t_shape.empty()) { PADDLE_ENFORCE_EQ( t.numel(), 1UL, @@ -669,8 +625,12 @@ class TensorRTEngineOp : public framework::OperatorBase { t.numel() * sizeof(int), nullptr); } else if (t.dtype() == phi::DataType::INT64) { - auto int32_tensor = scope.FindVar(x + "_cast_to_INT32") - ->GetMutable(); + std::string x_t = x + "_cast_to_INT32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto int32_tensor = + scope.FindVar(x_t)->GetMutable(); *int32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, @@ -703,16 +663,22 @@ class TensorRTEngineOp : public framework::OperatorBase { if (t.dtype() == phi::DataType::FLOAT32) { buffers[bind_index] = static_cast(t.data()); } else if (t.dtype() == phi::DataType::FLOAT64) { - auto fp32_tensor = - scope.FindVar(x + "_cast_to_FP32")->GetMutable(); + std::string x_t = x + "_cast_to_FP32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto fp32_tensor = scope.FindVar(x_t)->GetMutable(); *fp32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, phi::DataType::FLOAT32); buffers[bind_index] = static_cast(fp32_tensor->data()); } else if (t.dtype() == phi::DataType::INT64) { - auto int32_tensor = - scope.FindVar(x + "_cast_to_INT32")->GetMutable(); + std::string x_t = x + "_cast_to_INT32"; + if (scope.FindVar(x_t) == nullptr) { + const_cast(&scope)->Var(x_t); + } + auto int32_tensor = scope.FindVar(x_t)->GetMutable(); *int32_tensor = phi::Cast( reinterpret_cast(dev_ctx), t, @@ -827,8 +793,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto y = Outputs("Ys")[i]; auto *fluid_v = scope.FindVar(y); auto *fluid_t = fluid_v->GetMutable(); - auto int32_tensor = - scope.FindVar(y + "_cast_to_INT64")->GetMutable(); + std::string y_t = y + "_cast_to_INT64"; + if (scope.FindVar(y_t) == nullptr) { + const_cast(&scope)->Var(y_t); + } + auto int32_tensor = scope.FindVar(y_t)->GetMutable(); int32_tensor->Resize(fluid_t->dims()); dev_ctx.Alloc(int32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor); @@ -840,8 +809,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto y = Outputs("Ys")[i]; auto *fluid_v = scope.FindVar(y); auto *fluid_t = fluid_v->GetMutable(); - auto fp32_tensor = - scope.FindVar(y + "_cast_to_FP64")->GetMutable(); + std::string y_t = y + "_cast_to_FP64"; + if (scope.FindVar(y_t) == nullptr) { + const_cast(&scope)->Var(y_t); + } + auto fp32_tensor = scope.FindVar(y_t)->GetMutable(); fp32_tensor->Resize(fluid_t->dims()); dev_ctx.Alloc(fp32_tensor); framework::TensorCopy(*fluid_t, dev_place, dev_ctx, fp32_tensor); @@ -856,20 +828,92 @@ class TensorRTEngineOp : public framework::OperatorBase { TensorRTEngine *GetEngine(const framework::Scope &scope, const platform::Place &dev_place) const { if (!trt_engine_) { + TensorRTEngine::ConstructionParams params; + params.max_batch_size = max_batch_size_; + params.max_workspace_size = workspace_size_; + params.precision = precision_mode_; + params.calibrator = calibrator_.get(); + params.device_id = dev_place.device; + params.with_dynamic_shape = with_dynamic_shape_; + params.context_memory_sharing = Attr("context_memory_sharing"); + params.use_dla = Attr("use_dla"); + params.dla_core = Attr("dla_core"); + params.disable_trt_plugin_fp16 = Attr("disable_trt_plugin_fp16"); + params.enable_low_precision_io = Attr("enable_low_precision_io"); + params.use_inspector = Attr("use_inspector"); + + if (!shape_range_info_path_.empty()) { + 
inference::DeserializeShapeRangeInfo(shape_range_info_path_, + ¶ms.min_input_shape, + ¶ms.max_input_shape, + ¶ms.optim_input_shape, + ¶ms.min_shape_tensor, + ¶ms.max_shape_tensor, + ¶ms.optim_shape_tensor); + } else { + if (HasAttr("dynamic_shape_names") && + HasAttr("min_input_shape_vector") && + HasAttr("max_input_shape_vector") && + HasAttr("opt_input_shape_vector")) { + std::vector dynamic_shape_names; + std::vector> min_input_shapes; + std::vector> max_input_shapes; + std::vector> opt_input_shapes; + std::vector dynamic_shape_lens; + dynamic_shape_names = + Attr>("dynamic_shape_names"); + std::vector min_shapes = + Attr>("min_input_shape_vector"); + std::vector max_shapes = + Attr>("max_input_shape_vector"); + std::vector opt_shapes = + Attr>("opt_input_shape_vector"); + dynamic_shape_lens = Attr>("dynamic_shape_lens"); + int idx = 0; + for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) { + std::vector tmp1, tmp2, tmp3; + for (int j = 0; j < dynamic_shape_lens[i]; ++j) { + tmp1.push_back(min_shapes[idx]); + tmp2.push_back(max_shapes[idx]); + tmp3.push_back(opt_shapes[idx++]); + } + min_input_shapes.emplace_back(tmp1); + max_input_shapes.emplace_back(tmp2); + opt_input_shapes.emplace_back(tmp3); + } + + for (size_t i = 0; i < dynamic_shape_names.size(); ++i) { + params.min_input_shape.insert( + std::make_pair(dynamic_shape_names[i], min_input_shapes[i])); + params.max_input_shape.insert( + std::make_pair(dynamic_shape_names[i], max_input_shapes[i])); + params.optim_input_shape.insert( + std::make_pair(dynamic_shape_names[i], opt_input_shapes[i])); + } + } + } + trt_engine_ = inference::Singleton::Global() - .Create(engine_key_ + std::to_string(predictor_id_), - max_batch_size_, - workspace_size_, - precision_mode_, - calibrator_.get(), - device_id_, - with_dynamic_shape_, - min_input_shape_, - max_input_shape_, - opt_input_shape_); - PrepareTRTEngine(scope, trt_engine_); + .Create(engine_key_ + std::to_string(predictor_id_), params); + + if (use_static_engine_) { + LOG(INFO) << "Load TRT Optimized Info from " + << inference::analysis::GetTrtEngineSerializedPath( + model_opt_cache_dir_, engine_key_); + std::string trt_engine_serialized_data = + inference::analysis::GetTrtEngineSerializedData( + model_opt_cache_dir_, engine_key_); + trt_engine_->Deserialize(trt_engine_serialized_data); + } else { + // This brach mainly used to ut. 
+ PrepareTRTEngine(scope, trt_engine_); + } } + PADDLE_ENFORCE_NOT_NULL( + trt_engine_, + platform::errors::Fatal( + "The pointer to tensorrt engine should not be null.")); return trt_engine_; } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 3d96361d89f043da5a6acff7ad1ed96e20bac583..f6c20061de001076b82a240fe10e463871bf82a3 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -113,7 +113,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ASSERT_EQ(block_->ops_size(), 2); - LOG(INFO) << "create tensorrt desc"; + LOG(INFO) << "create tensorrt op desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); @@ -138,19 +138,18 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; - engine_op_desc.SetAttr("gpu_id", device_id); + engine_op_desc.SetAttr("gpu_device_id", device_id); engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime); - engine_op_desc.SetAttr("use_static_engine", true); - engine_op_desc.SetAttr("dynamic_shape_names", std::vector{"x"}); - engine_op_desc.SetAttr("dynamic_shape_lens", std::vector{4}); - engine_op_desc.SetAttr("with_dynamic_shape", true); - engine_op_desc.SetAttr("min_input_shape", std::vector{1, 1, 1, 1}); - engine_op_desc.SetAttr("max_input_shape", std::vector{16, 16, 16, 16}); - engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 4, 4}); - engine_op_desc.SetAttr("model_precision", - static_cast(phi::DataType::FLOAT32)); + engine_op_desc.SetAttr("use_static_engine", false); + engine_op_desc.SetAttr("with_dynamic_shape", false); + engine_op_desc.SetAttr("context_memory_sharing", true); + engine_op_desc.SetAttr("disable_trt_plugin_fp16", false); + engine_op_desc.SetAttr("enable_low_precision_io", false); + engine_op_desc.SetAttr("use_inspector", false); + engine_op_desc.SetAttr("use_dla", false); + engine_op_desc.SetAttr("dla_core", 0); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -263,7 +262,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { {output_dim, output_dim}, {batch_size, output_dim}); - LOG(INFO) << "create tensorrt desc"; + LOG(INFO) << "create tensorrt op desc"; framework::OpDesc engine_op_desc(nullptr); engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x0"})); @@ -288,11 +287,18 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; - engine_op_desc.SetAttr("gpu_id", device_id); + engine_op_desc.SetAttr("gpu_device_id", device_id); engine_op_desc.SetAttr("shape_range_info_path", std::string("")); engine_op_desc.SetAttr("model_opt_cache_dir", std::string("")); engine_op_desc.SetAttr("allow_build_at_runtime", false); engine_op_desc.SetAttr("use_static_engine", false); + engine_op_desc.SetAttr("with_dynamic_shape", false); + 
engine_op_desc.SetAttr("context_memory_sharing", true); + engine_op_desc.SetAttr("disable_trt_plugin_fp16", false); + engine_op_desc.SetAttr("enable_low_precision_io", false); + engine_op_desc.SetAttr("use_inspector", false); + engine_op_desc.SetAttr("use_dla", false); + engine_op_desc.SetAttr("dla_core", 0); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 1e7fcfa6a100508d64660bbb78184ea89f2ea13e..d010e9545758c10f4de6b1e026b53b9419bcb29b 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -174,6 +174,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_inspector PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_predictor PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_inference_fp16_io PROPERTIES TIMEOUT 300) + set_tests_properties(test_save_optimized_model_pass PROPERTIES TIMEOUT 300) if(WITH_NV_JETSON) set_tests_properties( diff --git a/test/ir/inference/test_save_optimized_model_pass.py b/test/ir/inference/test_save_optimized_model_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5a52285a5ee14e4acfb9793b87f039dc060778 --- /dev/null +++ b/test/ir/inference/test_save_optimized_model_pass.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle.inference import Config, PrecisionType, create_predictor +from paddle.jit import to_static +from paddle.static import InputSpec +from paddle.vision.models import alexnet + + +class TestSaveOptimizedModelPass: + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + net = alexnet(True) + model = to_static( + net, input_spec=[InputSpec(shape=[None, 3, 224, 224], name='x')] + ) + paddle.jit.save( + model, os.path.join(self.temp_dir.name, 'alexnet/inference') + ) + + def tearDown(self): + self.temp_dir.cleanup() + + def get_baseline(self): + predictor = self.init_predictor(save_optimized_model=True) + + inputs = [ + paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32)) + ] + + outputs = predictor.run(inputs) + + return outputs[0] + + def get_test_output(self): + predictor = self.init_predictor(save_optimized_model=False) + + inputs = [ + paddle.to_tensor(0.1 * np.ones([1, 3, 224, 224]).astype(np.float32)) + ] + + outputs = predictor.run(inputs) + + return outputs[0] + + def test_output(self): + if paddle.is_compiled_with_cuda(): + baseline = self.get_baseline() + test_output = self.get_test_output() + + np.testing.assert_allclose( + baseline.numpy().flatten(), + test_output.numpy().flatten(), + ) + + +class TestSaveOptimizedModelPassWithGPU( + TestSaveOptimizedModelPass, unittest.TestCase +): + def init_predictor(self, save_optimized_model: bool): + if save_optimized_model is True: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'), + os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'), + ) + config.enable_use_gpu(256, 0, PrecisionType.Half) + config.enable_memory_optim() + config.switch_ir_optim(True) + config.set_optim_cache_dir( + os.path.join(self.temp_dir.name, 'alexnet') + ) + config.enable_save_optim_model(True) + else: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'), + os.path.join( + self.temp_dir.name, 'alexnet/_optimized.pdiparams' + ), + ) + config.enable_use_gpu(256, 0, PrecisionType.Half) + config.enable_memory_optim() + config.switch_ir_optim(False) + + predictor = create_predictor(config) + return predictor + + +class TestSaveOptimizedModelPassWithTRT( + TestSaveOptimizedModelPass, unittest.TestCase +): + def init_predictor(self, save_optimized_model: bool): + if save_optimized_model is True: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/inference.pdmodel'), + os.path.join(self.temp_dir.name, 'alexnet/inference.pdiparams'), + ) + config.enable_use_gpu(256, 0) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + max_batch_size=1, + min_subgraph_size=3, + precision_mode=PrecisionType.Half, + use_static=True, + use_calib_mode=False, + ) + config.set_trt_dynamic_shape_info( + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + {"x": [1, 3, 224, 224], "flatten_1.tmp_0": [1, 9216]}, + ) + config.exp_disable_tensorrt_ops(["flatten_contiguous_range"]) + config.enable_memory_optim() + config.switch_ir_optim(True) + config.set_optim_cache_dir( + os.path.join(self.temp_dir.name, 'alexnet') + ) + config.enable_save_optim_model(True) + else: + config = Config( + os.path.join(self.temp_dir.name, 'alexnet/_optimized.pdmodel'), + os.path.join( + self.temp_dir.name, 'alexnet/_optimized.pdiparams' + ), + ) + config.enable_use_gpu(256, 0) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + 
max_batch_size=1, + min_subgraph_size=3, + precision_mode=PrecisionType.Half, + use_static=True, + use_calib_mode=False, + ) + config.enable_memory_optim() + config.switch_ir_optim(False) + + predictor = create_predictor(config) + return predictor + + +if __name__ == '__main__': + unittest.main()
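Appended for context: a minimal usage sketch (not part of the patch) of the construction path after this refactor. The long positional TensorRTEngine constructor and the engine attribute map (Set/Get/SetNotOwned/Erase) are removed above; callers now fill a TensorRTEngine::ConstructionParams struct and either build the engine directly, as the updated unit tests do, or go through TRTEngineManager::Create(name, params). The input name "x", the concrete shape values, and the engine key below are illustrative assumptions, not values taken from the patch.

#include <memory>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/phi/common/data_type.h"

namespace paddle {
namespace inference {
namespace tensorrt {

// Sketch only: field names follow this patch; shapes are illustrative.
std::unique_ptr<TensorRTEngine> BuildEngineSketch() {
  TensorRTEngine::ConstructionParams params;
  params.max_batch_size = 16;           // was the first positional ctor argument
  params.max_workspace_size = 1 << 30;  // was the second positional ctor argument
  params.precision = phi::DataType::FLOAT16;
  params.with_dynamic_shape = true;
  params.min_input_shape = {{"x", {1, 3, 224, 224}}};
  params.max_input_shape = {{"x", {8, 3, 224, 224}}};
  params.optim_input_shape = {{"x", {4, 3, 224, 224}}};

  // Direct construction, mirroring the updated unit tests.
  auto engine = std::make_unique<TensorRTEngine>(params, NaiveLogger::Global());
  engine->InitNetwork();
  return engine;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

// The engine op instead registers the engine with the global manager, which now
// accepts the same struct (key shown here is illustrative):
//   auto *trt_engine =
//       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
//           .Create("engine_key_0", params);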
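Similarly, a small sketch (also not part of the patch) of the FluidDataType2TRT and Vec2TRT_Dims helpers that this change adds to helper.h. The wrapper function name, the "x" input name, and the shape values are assumptions made for illustration.

#include <vector>

#include "paddle/fluid/inference/tensorrt/helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

void HelperSketch() {
  // FP16 fluid tensors map to TensorRT kHALF; INT64 is narrowed to kINT32.
  nvinfer1::DataType dt = FluidDataType2TRT(FluidDT::VarType_Type_FP16);

  // Static-shape mode drops the batch dimension: {N, C, H, W} becomes Dims3{C, H, W}.
  nvinfer1::Dims static_dims =
      Vec2TRT_Dims(std::vector<int64_t>{1, 3, 224, 224}, "x");

  // Dynamic-shape mode keeps every dimension, with -1 acting as a wildcard.
  nvinfer1::Dims dynamic_dims = Vec2TRT_Dims(
      std::vector<int64_t>{-1, 3, 224, 224}, "x", /*with_dynamic_shape=*/true);

  (void)dt;
  (void)static_dims;
  (void)dynamic_dims;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle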