From 173b39bb5703c297ae89c6ef442f634c56f2f2bf Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Thu, 22 Sep 2022 14:46:28 +0800
Subject: [PATCH] TensorRT engine context memory sharing (#45842)

---
 paddle/fluid/inference/analysis/argument.h        |   1 +
 .../inference/analysis/ir_pass_manager.cc         |   3 +-
 .../ir_passes/tensorrt_subgraph_pass.cc           |   3 +-
 paddle/fluid/inference/api/analysis_config.cc     |  31 +++-
 .../fluid/inference/api/analysis_predictor.cc     |   8 +
 .../inference/api/paddle_analysis_config.h        |  16 +-
 paddle/fluid/inference/tensorrt/engine.cc         |  99 ++++++++++
 paddle/fluid/inference/tensorrt/engine.h          | 171 ++++++++----------
 .../operators/tensorrt/tensorrt_engine_op.h       |   7 +-
 9 files changed, 227 insertions(+), 112 deletions(-)
 mode change 100755 => 100644 paddle/fluid/inference/api/analysis_predictor.cc

diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index b0ed905bfc6..871718eff14 100755
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -314,6 +314,7 @@ struct Argument {
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
 
   // Indicate which kind of sort algorithm is used for operators, the memory
   // optimization relays on the sort algorithm.
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index f86a22e3db9..e1fe856e3c0 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_calib_mode", new bool(use_calib_mode));
       pass->Set("precision_mode",
                 new AnalysisConfig::Precision(precision_mode));
-
+      pass->Set("context_memory_sharing",
+                new bool(argument->trt_engine_memory_sharing()));
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
       std::string optim_cache_dir = argument->optim_cache_dir();
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 60e0864a9be..05d66d7fe27 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
   // those parameter already exist in trt, and should not have another copy in
   // fluid.
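// A condensed sketch of how the new flag travels into this pass (the
// template argument below is restored by hand and the surrounding pass code
// is elided, so these are not verbatim patch lines): IRPassManager copies
// the field out of the analysis Argument into the pass attribute map,
//
//   pass->Set("context_memory_sharing",
//             new bool(argument->trt_engine_memory_sharing()));
//
// and CreateTensorRTOp further down in this file reads it back and hands it
// to the engine:
//
//   trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));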
std::vector repetitive_params; - for (auto *node : graph->Nodes()) { if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params); - std::unordered_set nodes2remove( framework::ir::Agent(node).subgraph()->begin(), framework::ir::Agent(node).subgraph()->end()); @@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetWithErnie( graph->Has(framework::ir::kEmbEltwiseLayernormPass) && graph->Has(framework::ir::kMultiheadMatmulPass)); + trt_engine->SetContextMemorySharing(Get("context_memory_sharing")); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f3fbf1c344d..97f6d81e592 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(collect_shape_range_info_); CP_MEMBER(shape_range_info_path_); CP_MEMBER(trt_use_inspector_); + CP_MEMBER(trt_engine_memory_sharing_); // Dlnne related CP_MEMBER(use_dlnne_); CP_MEMBER(dlnne_min_subgraph_size_); @@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine( } use_tensorrt_ = true; +#if PADDLE_WITH_TENSORRT + // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2 + // when trt version less than 7.2, + // createExecutionContextWithoutDeviceMemory() has bug. + // so, we cannot enable engine context memory sharing. +#if IS_TRT_VERSION_GE(7200) + trt_engine_memory_sharing_ = true; +#else + LOG(WARNING) + << "TensorRT engine context memory sharing needs version 7.2 and after."; + trt_engine_memory_sharing_ = false; +#endif +#endif tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; @@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; } // TODO(Superjomn) refactor this, buggy. void AnalysisConfig::Update() { - auto info = SerializeInfoCache(); + auto &&info = SerializeInfoCache(); if (info == serialized_info_cache_) return; // Transfer pass_builder and copy the existing compatible passes. @@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << trt_dla_core_; ss << enable_memory_optim_; + ss << trt_engine_memory_sharing_; ss << use_mkldnn_; ss << mkldnn_cache_capacity_; @@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const { return enable_memory_optim_; } +bool AnalysisConfig::trt_engine_memory_sharing() const { + return trt_engine_memory_sharing_; +} + void AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, @@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() { if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); } + os.InsertRow({"trt_engine_memory_sharing", + trt_engine_memory_sharing_ ? 
"true" : "false"}); #endif } } @@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo( shape_range_info_path_ = shape_range_info_path; } -const std::string &AnalysisConfig::shape_range_info_path() { +const std::string &AnalysisConfig::shape_range_info_path() const { return shape_range_info_path_; } -bool AnalysisConfig::shape_range_info_collected() { +bool AnalysisConfig::shape_range_info_collected() const { return collect_shape_range_info_; } @@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape( trt_tuned_dynamic_shape_ = true; } -bool AnalysisConfig::tuned_tensorrt_dynamic_shape() { +bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const { return trt_tuned_dynamic_shape_; } -bool AnalysisConfig::trt_allow_build_at_runtime() { +bool AnalysisConfig::trt_allow_build_at_runtime() const { return trt_allow_build_at_runtime_; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc old mode 100755 new mode 100644 index 33b3da0717b..ae34fd52341 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); + argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing()); } if (config_.dlnne_enabled()) { @@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() { memory::Release(place_); } device_contexts_.clear(); + +#ifdef PADDLE_WITH_TENSORRT + if (config_.trt_engine_memory_sharing()) { + inference::Singleton::Global() + .releaseContextMemory(predictor_id_); + } +#endif } std::unique_ptr AnalysisPredictor::Clone(void *stream) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7bf5dc2cfe3..5f75636d854 100755 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_engine_enabled() const { return use_tensorrt_; } /// + /// \brief A boolean state telling whether the tensorrt engine memory sharing + /// is activated. + /// + /// \return bool Whether the tensorrt engine memory sharing is activated. + /// + bool trt_engine_memory_sharing() const; + /// /// \brief Get the TensorRT engine precision. /// /// \return Precision Get the TensorRT engine precision. @@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig { /// \brief A boolean state telling whether to use tuned tensorrt dynamic /// shape. /// - bool tuned_tensorrt_dynamic_shape(); + bool tuned_tensorrt_dynamic_shape() const; /// /// \brief A boolean state telling whether to allow building trt engine at /// runtime. /// - bool trt_allow_build_at_runtime(); + bool trt_allow_build_at_runtime() const; /// /// \brief Set execution stream. If not set a stream will be created @@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \return the shape info path. /// - const std::string& shape_range_info_path(); + const std::string& shape_range_info_path() const; /// /// \brief A boolean state telling whether to collect shape info. /// /// \return bool Whether to collect shape info. 
/// - bool shape_range_info_collected(); + bool shape_range_info_collected() const; /// /// \brief Prevent ops running in Paddle-TRT @@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig { // memory reuse related. bool enable_memory_optim_{false}; + bool trt_engine_memory_sharing_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0cf1d6352c3..9a0bbd0ba0c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() { optim_profiles_[i] = infer_builder_->createOptimizationProfile(); } +nvinfer1::IExecutionContext *TensorRTEngine::context() { + std::unique_lock lock(mutex_); + if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) { + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::InvalidArgument( + "You should build engine first and then set the context.")); + // We may see trt warning: Profile 0 has been chosen by another + // IExecutionContext... + // It's ok. We will set it later. + nvinfer1::IExecutionContext *infer_context{nullptr}; + if (context_memory_sharing_) { + infer_context = + infer_engine_->createExecutionContextWithoutDeviceMemory(); + } else { + infer_context = infer_engine_->createExecutionContext(); + } + PADDLE_ENFORCE_NOT_NULL( + infer_context, + platform::errors::InvalidArgument( + "TensorRT engine can not build execution context.")); + if (with_dynamic_shape_) { + // need new profile if it's not the first + if (cur_profile_num_ > 0) { + infer_context->setOptimizationProfile(cur_profile_num_); + } + profile_index_[predictor_id_per_thread] = cur_profile_num_; + ++cur_profile_num_; + } + infer_context_[predictor_id_per_thread].reset(infer_context); + } + return infer_context_[predictor_id_per_thread].get(); +} + void TensorRTEngine::Execute(int batch_size, std::vector *buffers, cudaStream_t stream) { freshDeviceId(); auto infer_context = context(); + if (context_memory_sharing_) { + void *context_memory{nullptr}; + context_memory = + inference::Singleton::Global() + .getContextMemory( + predictor_id_per_thread, + phi::GPUPlace(device_id_), + phi::Stream(reinterpret_cast(stream))); + infer_context->setDeviceMemory(context_memory); + } if (!with_dynamic_shape()) { infer_context->enqueue(batch_size, buffers->data(), stream, nullptr); } else { @@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() { infer_context_.clear(); cur_profile_num_ = 0; } + // for engine context memory sharing + if (context_memory_sharing_) { + inference::Singleton::Global() + .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + predictor_id_per_thread); + } GetEngineInfo(); } @@ -417,6 +467,55 @@ std::unordered_map return &itensor_map_; } +void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + + if (use_dla_) { + if (precision_ != AnalysisConfig::Precision::kInt8 && + precision_ != AnalysisConfig::Precision::kHalf) { + LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " + "set float32, so DLA is not used."; + } else if (runtime->getNbDLACores() == 0) { + LOG(WARNING) + << "TensorRT DLA is set by config, but your device does not have " + "DLA, so DLA is not used."; + } else { + if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) { + dla_core_ = 0; + LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < 
" + << runtime->getNbDLACores() << ", but got " << dla_core_ + << ", so use use 0 as default."; + } + runtime->setDLACore(dla_core_); + LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " + << dla_core_; + } + } + + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size())); + + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::Fatal( + "Building TRT cuda engine failed when deserializing engine info. " + "Please check:\n1. Your TRT serialization is generated and loaded " + "on the same GPU architecture;\n2. The Paddle Inference version of " + "generating serialization file and doing inference are " + "consistent.")); + + binding_num_ = infer_engine_->getNbBindings(); + // for engine context memory sharing + if (context_memory_sharing_) { + inference::Singleton::Global() + .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + predictor_id_per_thread); + } + + GetEngineInfo(); +} + void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 209f297a066..034f417ff07 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include #include #include // NOLINT @@ -37,6 +38,8 @@ limitations under the License. */ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/stream.h" #include "paddle/utils/any.h" namespace paddle { @@ -171,7 +174,7 @@ class TRTInt8Calibrator; /* * TensorRT Engine. * - * There are two alternative ways to use it, one is to build from a paddle + * There are two alternative ways to use it, one is to build from a paddle * protobuf model, another way is to manually construct the network. */ class TensorRTEngine { @@ -287,51 +290,10 @@ class TensorRTEngine { std::unordered_map* GetITensorMap(); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } - nvinfer1::IExecutionContext* context() { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif - std::unique_lock lock(mutex_); - if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) { - PADDLE_ENFORCE_NOT_NULL( - infer_engine_, - platform::errors::InvalidArgument( - "You should build engine first and then set the context.")); - // We may see trt warning: Profile 0 has been chosen by another - // IExecutionContext... - // It's ok. We will set it later. 
- infer_context_[predictor_id_per_thread].reset( - infer_engine_->createExecutionContext()); - if (with_dynamic_shape_) { - // need new profile if it's not the first - if (cur_profile_num_ > 0) { - infer_context_[predictor_id_per_thread]->setOptimizationProfile( - cur_profile_num_); - } - profile_index_[predictor_id_per_thread] = cur_profile_num_; - ++cur_profile_num_; - } - } - return infer_context_[predictor_id_per_thread].get(); - } + nvinfer1::IExecutionContext* context(); int GetProfileIndex() { if (max_profile_num_ > 1) { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); return profile_index_[predictor_id_per_thread]; } else { @@ -350,15 +312,6 @@ class TensorRTEngine { infer_engine_, platform::errors::InvalidArgument( "You should build engine first and then set the context.")); -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); infer_context_[predictor_id_per_thread].reset(nullptr); infer_context_.erase(predictor_id_per_thread); @@ -380,47 +333,7 @@ class TensorRTEngine { return ihost_memory_.get(); } - void Deserialize(const std::string& engine_serialized_data) { - freshDeviceId(); - infer_ptr runtime(createInferRuntime(&logger_)); - - if (use_dla_) { - if (precision_ != AnalysisConfig::Precision::kInt8 && - precision_ != AnalysisConfig::Precision::kHalf) { - LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " - "set float32, so DLA is not used."; - } else if (runtime->getNbDLACores() == 0) { - LOG(WARNING) - << "TensorRT DLA is set by config, but your device does not have " - "DLA, so DLA is not used."; - } else { - if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) { - dla_core_ = 0; - LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " - << runtime->getNbDLACores() << ", but got " << dla_core_ - << ", so use use 0 as default."; - } - runtime->setDLACore(dla_core_); - LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " - << dla_core_; - } - } - - infer_engine_.reset(runtime->deserializeCudaEngine( - engine_serialized_data.c_str(), engine_serialized_data.size())); - - PADDLE_ENFORCE_NOT_NULL( - infer_engine_, - platform::errors::Fatal( - "Building TRT cuda engine failed when deserializing engine info. " - "Please check:\n1. Your TRT serialization is generated and loaded " - "on the same GPU architecture;\n2. 
The Paddle Inference version of " - "generating serialization file and doing inference are " - "consistent.")); - - binding_num_ = infer_engine_->getNbBindings(); - GetEngineInfo(); - } + void Deserialize(const std::string& engine_serialized_data); void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); @@ -694,6 +607,10 @@ class TensorRTEngine { void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; } void SetScope(const framework::Scope& scope) { scope_ = &scope; } + void SetContextMemorySharing(bool context_memory_sharing) { + context_memory_sharing_ = context_memory_sharing; + } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling @@ -714,6 +631,9 @@ class TensorRTEngine { // batch size of the current data, will be updated each Executation. int batch_size_{-1}; + // use for engine context memory sharing + bool context_memory_sharing_{false}; + int device_id_; int max_profile_num_{1}; int cur_profile_num_{0}; @@ -791,14 +711,23 @@ class TensorRTEngine { engine__->network()->add##layer__(__VA_ARGS__) class TRTEngineManager { + using PredictorID = int; + using AllocationPtr = phi::Allocator::AllocationPtr; + public: - bool Empty() const { return engines_.size() == 0; } + bool Empty() const { + std::lock_guard lock(mutex_); + return engines_.size() == 0; + } + bool Has(const std::string& name) const { + std::lock_guard lock(mutex_); if (engines_.count(name) == 0) return false; return engines_.at(name).get() != nullptr; } TensorRTEngine* Get(const std::string& name) const { + std::lock_guard lock(mutex_); return engines_.at(name).get(); } @@ -826,17 +755,21 @@ class TRTEngineManager { disable_trt_plugin_fp16, model_precision, logger); + std::lock_guard lock(mutex_); engines_[name].reset(p); return p; } void DeleteAll() { + std::lock_guard lock(mutex_); for (auto& item : engines_) { item.second.reset(nullptr); } + engines_.clear(); } void DeleteKey(const std::string& key) { + std::lock_guard lock(mutex_); auto iter = engines_.find(key); if (iter != engines_.end()) { iter->second.reset(nullptr); @@ -844,7 +777,57 @@ class TRTEngineManager { } } + void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) { + bool size_updated{false}; + + { + std::lock_guard lock(mutex_); + if (max_ctx_mem_size_ < mem_size) { + max_ctx_mem_size_ = mem_size; + size_updated = true; + } + } + + if (size_updated) { + releaseContextMemory(predictor_id); + } + } + + void* getContextMemory(PredictorID predictor_id, + const phi::GPUPlace& place, + const phi::Stream& stream) { + std::lock_guard lock(mutex_); + static auto alignment = getAlignmentSize(place); + if (context_memorys_.count(predictor_id) == 0) { + auto context_memory = + memory::Alloc(place, max_ctx_mem_size_ + alignment, stream); + // context_memory_[predictor_id].reset(context_memory.release()); + context_memorys_[predictor_id] = std::move(context_memory); + } + return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); + } + + void releaseContextMemory(PredictorID predictor_id) { + std::lock_guard lock(mutex_); + if (context_memorys_.count(predictor_id)) { + context_memorys_[predictor_id].reset(nullptr); + context_memorys_.erase(predictor_id); + } + } + private: + size_t getAlignmentSize(const phi::GPUPlace& place) { + const auto& prop = platform::GetDeviceProperties(place.GetDeviceId()); + return prop.textureAlignment; + } + + void* getAlignedMemory(void* addr, size_t alignment) { + 
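    // alignment is the device's textureAlignment, a power of two, so the
    // mask below simply rounds addr down to the previous multiple of
    // alignment, e.g. 0x1234 & ~(0x200 - 1) == 0x1200 for a 512-byte boundary.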
+    return reinterpret_cast<void *>(uintptr_t(addr) & (~(alignment - 1)));
+  }
+
+  mutable std::mutex mutex_;
+  size_t max_ctx_mem_size_{0};
+  std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
   std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };

diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 0f8a3d12062..a795a1aadb5 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");

-    int num_inputs = 0;
-
-    num_inputs += runtime_input_names_.size();
-    // const int num_bindings = num_inputs + Outputs("Ys").size();
-    // std::vector<void *> buffers(num_bindings);
-    // This method returns the total over all profiles.
+    // Get the total over all profiles
     const int num_bindings = engine->GetNbBindings();
     std::vector<void *> buffers(num_bindings, nullptr);

-- 
GitLab
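For context, the mechanism this patch builds on consists of three plain
TensorRT calls: ICudaEngine::getDeviceMemorySize() reports how much scratch
(activation) memory an execution context needs, createExecutionContextWithoutDeviceMemory()
creates a context that owns none of it, and IExecutionContext::setDeviceMemory()
attaches a caller-owned block before enqueue. Below is a minimal standalone
sketch of that idea, not Paddle code: the two already-built engines, the
binding arrays, the stream, and the helper name RunWithSharedScratch are all
assumptions, and error handling is omitted.

    #include <NvInfer.h>
    #include <cuda_runtime_api.h>
    #include <algorithm>

    // Run two engines back to back while they share one scratch buffer sized
    // for the more demanding of the two.
    void RunWithSharedScratch(nvinfer1::ICudaEngine* engine_a,
                              nvinfer1::ICudaEngine* engine_b,
                              void** bindings_a,
                              void** bindings_b,
                              cudaStream_t stream) {
      // One scratch block, sized for the larger requirement.
      size_t scratch_bytes = std::max(engine_a->getDeviceMemorySize(),
                                      engine_b->getDeviceMemorySize());
      void* scratch = nullptr;
      cudaMalloc(&scratch, scratch_bytes);

      // Contexts created this way own no device memory of their own ...
      nvinfer1::IExecutionContext* ctx_a =
          engine_a->createExecutionContextWithoutDeviceMemory();
      nvinfer1::IExecutionContext* ctx_b =
          engine_b->createExecutionContextWithoutDeviceMemory();

      // ... so the caller must attach memory before enqueueing, and the two
      // contexts must not run concurrently while they share the same block.
      ctx_a->setDeviceMemory(scratch);
      ctx_a->enqueueV2(bindings_a, stream, nullptr);
      cudaStreamSynchronize(stream);

      ctx_b->setDeviceMemory(scratch);
      ctx_b->enqueueV2(bindings_b, stream, nullptr);
      cudaStreamSynchronize(stream);

      ctx_a->destroy();
      ctx_b->destroy();
      cudaFree(scratch);
    }

In the patch, TRTEngineManager plays the role of the single cudaMalloc above:
updateContextMemorySize() keeps the largest getDeviceMemorySize() seen across
all engines, getContextMemory() hands each predictor one texture-aligned block
of that size, TensorRTEngine::Execute() attaches it with setDeviceMemory()
before every enqueue, and AnalysisPredictor's destructor returns the block via
releaseContextMemory().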