Unverified commit 173b39bb authored by Yuanle Liu, committed by GitHub

TensorRT engine context memory sharing (#45842)

Parent d772166c
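For orientation, a minimal usage sketch of the feature this commit adds (the model path and TensorRT parameters below are illustrative placeholders, not taken from this patch): once EnableTensorRtEngine() is called on a config built against TensorRT >= 7.2, trt_engine_memory_sharing_ is switched on, and every TensorRT subgraph engine owned by the resulting predictor draws its execution-context scratch memory from a single shared chunk.

#include "paddle_inference_api.h"  // public Paddle Inference header

int main() {
  paddle_infer::Config config("./model_dir");  // placeholder model directory
  config.EnableUseGpu(256 /* initial GPU pool, MB */, 0 /* device id */);
  // Enabling TensorRT implicitly enables engine context memory sharing
  // when Paddle is built with TensorRT >= 7.2 (see IS_TRT_VERSION_GE below).
  config.EnableTensorRtEngine(1 << 30,  // workspace size
                              1,        // max batch size
                              3,        // min subgraph size
                              paddle_infer::PrecisionType::kFloat32,
                              false,    // use_static
                              false);   // use_calib_mode
  auto predictor = paddle_infer::CreatePredictor(config);
  // config.trt_engine_memory_sharing() now reports true on TRT >= 7.2;
  // the shared chunk is released when the predictor is destroyed.
  return 0;
}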
......@@ -314,6 +314,7 @@ struct Argument {
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
// Indicate which kind of sort algorithm is used for operators; the memory
// optimization relies on the sort algorithm.
......
......@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_calib_mode", new bool(use_calib_mode));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("context_memory_sharing",
new bool(argument->trt_engine_memory_sharing()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
......
......@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
// those parameters already exist in trt and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
......@@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine->SetWithErnie(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
......
......@@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
CP_MEMBER(trt_use_inspector_);
CP_MEMBER(trt_engine_memory_sharing_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
......@@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine(
}
use_tensorrt_ = true;
#if PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// When the TensorRT version is less than 7.2,
// createExecutionContextWithoutDeviceMemory() has a bug,
// so engine context memory sharing cannot be enabled.
#if IS_TRT_VERSION_GE(7200)
trt_engine_memory_sharing_ = true;
#else
LOG(WARNING)
<< "TensorRT engine context memory sharing requires TensorRT 7.2 or later, so it is disabled.";
trt_engine_memory_sharing_ = false;
#endif
#endif
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
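The IS_TRT_VERSION_GE(7200) guard above compares a single integer derived from the TensorRT version macros against the requested minimum. A sketch of how such a guard is typically defined (Paddle's actual macro lives in its TensorRT helper headers; the reproduction below, including the TRT_VERSION_INT helper name, is for reference only):

#include <NvInfer.h>  // provides NV_TENSORRT_MAJOR / MINOR / PATCH / BUILD

// Example: TensorRT 7.2.1.6 -> 7 * 1000 + 2 * 100 + 1 * 10 + 6 = 7216 >= 7200.
#define TRT_VERSION_INT                                   \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 +  \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)
#define IS_TRT_VERSION_GE(version) ((TRT_VERSION_INT) >= (version))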
......@@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
// TODO(Superjomn) refactor this, buggy.
void AnalysisConfig::Update() {
auto info = SerializeInfoCache();
auto &&info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
// Transfer pass_builder and copy the existing compatible passes.
......@@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << trt_dla_core_;
ss << enable_memory_optim_;
ss << trt_engine_memory_sharing_;
ss << use_mkldnn_;
ss << mkldnn_cache_capacity_;
......@@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
bool AnalysisConfig::trt_engine_memory_sharing() const {
return trt_engine_memory_sharing_;
}
void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
......@@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() {
if (trt_use_dla_) {
os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
}
os.InsertRow({"trt_engine_memory_sharing",
trt_engine_memory_sharing_ ? "true" : "false"});
#endif
}
}
......@@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo(
shape_range_info_path_ = shape_range_info_path;
}
const std::string &AnalysisConfig::shape_range_info_path() {
const std::string &AnalysisConfig::shape_range_info_path() const {
return shape_range_info_path_;
}
bool AnalysisConfig::shape_range_info_collected() {
bool AnalysisConfig::shape_range_info_collected() const {
return collect_shape_range_info_;
}
......@@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape(
trt_tuned_dynamic_shape_ = true;
}
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const {
return trt_tuned_dynamic_shape_;
}
bool AnalysisConfig::trt_allow_build_at_runtime() {
bool AnalysisConfig::trt_allow_build_at_runtime() const {
return trt_allow_build_at_runtime_;
}
......
......@@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
}
if (config_.dlnne_enabled()) {
......@@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() {
memory::Release(place_);
}
device_contexts_.clear();
#ifdef PADDLE_WITH_TENSORRT
if (config_.trt_engine_memory_sharing()) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.releaseContextMemory(predictor_id_);
}
#endif
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
......
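The destructor change above ties the shared chunk's lifetime to the predictor: each predictor id owns one entry in TRTEngineManager, handed back via releaseContextMemory() when that predictor is destroyed. A hedged sketch of the resulting lifecycle from the caller's side (the Demo function and its argument are illustrative, assuming a config prepared as in the earlier sketch):

#include "paddle_inference_api.h"

void Demo(const paddle_infer::Config& config) {
  auto predictor = paddle_infer::CreatePredictor(config);
  auto worker = predictor->Clone();  // separate predictor id -> separate shared chunk
  // All TRT subgraph engines run by `predictor` share predictor's chunk;
  // all engines run by `worker` share worker's chunk.
}  // Leaving scope destroys both predictors; ~AnalysisPredictor() hands each
   // chunk back via TRTEngineManager::releaseContextMemory(predictor_id_).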
......@@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
///
/// \brief A boolean state telling whether the tensorrt engine memory sharing
/// is activated.
///
/// \return bool Whether the tensorrt engine memory sharing is activated.
///
bool trt_engine_memory_sharing() const;
///
/// \brief Get the TensorRT engine precision.
///
/// \return Precision Get the TensorRT engine precision.
......@@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig {
/// \brief A boolean state telling whether to use tuned tensorrt dynamic
/// shape.
///
bool tuned_tensorrt_dynamic_shape();
bool tuned_tensorrt_dynamic_shape() const;
///
/// \brief A boolean state telling whether to allow building trt engine at
/// runtime.
///
bool trt_allow_build_at_runtime();
bool trt_allow_build_at_runtime() const;
///
/// \brief Set execution stream. If not set a stream will be created
......@@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \return the shape info path.
///
const std::string& shape_range_info_path();
const std::string& shape_range_info_path() const;
///
/// \brief A boolean state telling whether to collect shape info.
///
/// \return bool Whether to collect shape info.
///
bool shape_range_info_collected();
bool shape_range_info_collected() const;
///
/// \brief Prevent ops running in Paddle-TRT
......@@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig {
// memory reuse related.
bool enable_memory_optim_{false};
bool trt_engine_memory_sharing_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
......@@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() {
optim_profiles_[i] = infer_builder_->createOptimizationProfile();
}
nvinfer1::IExecutionContext *TensorRTEngine::context() {
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
nvinfer1::IExecutionContext *infer_context{nullptr};
if (context_memory_sharing_) {
infer_context =
infer_engine_->createExecutionContextWithoutDeviceMemory();
} else {
infer_context = infer_engine_->createExecutionContext();
}
PADDLE_ENFORCE_NOT_NULL(
infer_context,
platform::errors::InvalidArgument(
"TensorRT engine can not build execution context."));
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context->setOptimizationProfile(cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
infer_context_[predictor_id_per_thread].reset(infer_context);
}
return infer_context_[predictor_id_per_thread].get();
}
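The branch above is the core of the feature: with sharing enabled the context is created without its own scratch memory, which is attached later in Execute(). Stripped of Paddle's bookkeeping, the underlying TensorRT pattern looks roughly like the sketch below (engine pointers, bindings, and the stream are assumed to exist; this is illustrative, not Paddle code):

#include <NvInfer.h>
#include <cuda_runtime.h>
#include <algorithm>

// Run two prebuilt engines back to back on one stream, letting both
// execution contexts borrow the same scratch allocation.
void RunWithSharedScratch(nvinfer1::ICudaEngine* eng_a, void** bindings_a,
                          nvinfer1::ICudaEngine* eng_b, void** bindings_b,
                          cudaStream_t stream) {
  size_t scratch_bytes = std::max(eng_a->getDeviceMemorySize(),
                                  eng_b->getDeviceMemorySize());
  void* scratch = nullptr;
  cudaMalloc(&scratch, scratch_bytes);

  nvinfer1::IExecutionContext* ctx_a =
      eng_a->createExecutionContextWithoutDeviceMemory();
  nvinfer1::IExecutionContext* ctx_b =
      eng_b->createExecutionContextWithoutDeviceMemory();

  // The two contexts never run concurrently here, so one buffer suffices;
  // attach it right before each enqueue, as Execute() does above.
  ctx_a->setDeviceMemory(scratch);
  ctx_a->enqueueV2(bindings_a, stream, nullptr);
  ctx_b->setDeviceMemory(scratch);
  ctx_b->enqueueV2(bindings_b, stream, nullptr);

  cudaStreamSynchronize(stream);
  ctx_a->destroy();
  ctx_b->destroy();
  cudaFree(scratch);
}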
void TensorRTEngine::Execute(int batch_size,
std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId();
auto infer_context = context();
if (context_memory_sharing_) {
void *context_memory{nullptr};
context_memory =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.getContextMemory(
predictor_id_per_thread,
phi::GPUPlace(device_id_),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}
if (!with_dynamic_shape()) {
infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
} else {
......@@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() {
infer_context_.clear();
cur_profile_num_ = 0;
}
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
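FreezeNetwork() reports each engine's context-memory requirement to the manager, which keeps only the running maximum: within one predictor the subgraph engines execute one after another, so the largest requirement bounds what the shared chunk must hold. A tiny worked example with made-up sizes:

#include <algorithm>
#include <cstddef>

int main() {
  // Illustrative per-engine getDeviceMemorySize() results: 48 MB, 96 MB, 64 MB.
  const size_t engine_ctx_bytes[] = {48u << 20, 96u << 20, 64u << 20};
  size_t shared_chunk = 0;  // the running max kept by TRTEngineManager
  for (size_t s : engine_ctx_bytes) shared_chunk = std::max(shared_chunk, s);
  // shared_chunk == 96 MB, instead of 48 + 96 + 64 = 208 MB without sharing.
  return 0;
}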
......@@ -417,6 +467,55 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
return &itensor_map_;
}
void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <NvInfer.h>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex> // NOLINT
......@@ -37,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/stream.h"
#include "paddle/utils/any.h"
namespace paddle {
......@@ -171,7 +174,7 @@ class TRTInt8Calibrator;
/*
* TensorRT Engine.
*
* There are two alternative ways to use it: one is to build it from a Paddle
* protobuf model; the other is to construct the network manually.
*/
class TensorRTEngine {
......@@ -287,51 +290,10 @@ class TensorRTEngine {
std::unordered_map<std::string, nvinfer1::ITensor*>* GetITensorMap();
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::IExecutionContext* context() {
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
infer_context_[predictor_id_per_thread].reset(
infer_engine_->createExecutionContext());
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context_[predictor_id_per_thread]->setOptimizationProfile(
cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
}
return infer_context_[predictor_id_per_thread].get();
}
nvinfer1::IExecutionContext* context();
int GetProfileIndex() {
if (max_profile_num_ > 1) {
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
return profile_index_[predictor_id_per_thread];
} else {
......@@ -350,15 +312,6 @@ class TensorRTEngine {
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
infer_context_[predictor_id_per_thread].reset(nullptr);
infer_context_.erase(predictor_id_per_thread);
......@@ -380,47 +333,7 @@ class TensorRTEngine {
return ihost_memory_.get();
}
void Deserialize(const std::string& engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
GetEngineInfo();
}
void Deserialize(const std::string& engine_serialized_data);
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
......@@ -694,6 +607,10 @@ class TensorRTEngine {
void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
void SetScope(const framework::Scope& scope) { scope_ = &scope; }
void SetContextMemorySharing(bool context_memory_sharing) {
context_memory_sharing_ = context_memory_sharing;
}
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
......@@ -714,6 +631,9 @@ class TensorRTEngine {
// batch size of the current data; it will be updated on each execution.
int batch_size_{-1};
// used for engine context memory sharing
bool context_memory_sharing_{false};
int device_id_;
int max_profile_num_{1};
int cur_profile_num_{0};
......@@ -791,14 +711,23 @@ class TensorRTEngine {
engine__->network()->add##layer__(__VA_ARGS__)
class TRTEngineManager {
using PredictorID = int;
using AllocationPtr = phi::Allocator::AllocationPtr;
public:
bool Empty() const { return engines_.size() == 0; }
bool Empty() const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.size() == 0;
}
bool Has(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
TensorRTEngine* Get(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.at(name).get();
}
......@@ -826,17 +755,21 @@ class TRTEngineManager {
disable_trt_plugin_fp16,
model_precision,
logger);
std::lock_guard<std::mutex> lock(mutex_);
engines_[name].reset(p);
return p;
}
void DeleteAll() {
std::lock_guard<std::mutex> lock(mutex_);
for (auto& item : engines_) {
item.second.reset(nullptr);
}
engines_.clear();
}
void DeleteKey(const std::string& key) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = engines_.find(key);
if (iter != engines_.end()) {
iter->second.reset(nullptr);
......@@ -844,7 +777,57 @@ class TRTEngineManager {
}
}
void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
bool size_updated{false};
{
std::lock_guard<std::mutex> lock(mutex_);
if (max_ctx_mem_size_ < mem_size) {
max_ctx_mem_size_ = mem_size;
size_updated = true;
}
}
if (size_updated) {
releaseContextMemory(predictor_id);
}
}
void* getContextMemory(PredictorID predictor_id,
const phi::GPUPlace& place,
const phi::Stream& stream) {
std::lock_guard<std::mutex> lock(mutex_);
static auto alignment = getAlignmentSize(place);
if (context_memorys_.count(predictor_id) == 0) {
auto context_memory =
memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
// context_memory_[predictor_id].reset(context_memory.release());
context_memorys_[predictor_id] = std::move(context_memory);
}
return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
}
void releaseContextMemory(PredictorID predictor_id) {
std::lock_guard<std::mutex> lock(mutex_);
if (context_memorys_.count(predictor_id)) {
context_memorys_[predictor_id].reset(nullptr);
context_memorys_.erase(predictor_id);
}
}
private:
size_t getAlignmentSize(const phi::GPUPlace& place) {
const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
return prop.textureAlignment;
}
void* getAlignedMemory(void* addr, size_t alignment) {
// Round up to the next multiple of `alignment`; getContextMemory() allocates
// `max_ctx_mem_size_ + alignment` bytes, so the rounded pointer still has
// max_ctx_mem_size_ usable bytes.
return reinterpret_cast<void*>(
(reinterpret_cast<uintptr_t>(addr) + alignment - 1) & ~(alignment - 1));
}
mutable std::mutex mutex_;
size_t max_ctx_mem_size_{0};
std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};
......
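getAlignmentSize() queries the device's textureAlignment, and getAlignedMemory() rounds the raw allocation up to that boundary; the extra `alignment` bytes requested in getContextMemory() guarantee the rounded pointer still covers max_ctx_mem_size_. A small numeric sketch of the round-up (the addresses and the 512-byte alignment are illustrative):

#include <cstdint>
#include <cstdio>

// Round `addr` up to the next multiple of `alignment` (a power of two).
static std::uintptr_t AlignUp(std::uintptr_t addr, std::uintptr_t alignment) {
  return (addr + alignment - 1) & ~(alignment - 1);
}

int main() {
  std::printf("%llu\n", static_cast<unsigned long long>(AlignUp(4353, 512)));  // 4608
  std::printf("%llu\n", static_cast<unsigned long long>(AlignUp(4608, 512)));  // 4608 (already aligned)
  return 0;
}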
......@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
int num_inputs = 0;
num_inputs += runtime_input_names_.size();
// const int num_bindings = num_inputs + Outputs("Ys").size();
// std::vector<void *> buffers(num_bindings);
// This method returns the total over all profiles.
// Get the total over all profiles
const int num_bindings = engine->GetNbBindings();
std::vector<void *> buffers(num_bindings, nullptr);
......
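The "total over all profiles" comment above matters because an engine built with several optimization profiles repeats its bindings once per profile, and getNbBindings() counts all of them; the buffers vector therefore has one slot per (profile, tensor) pair. A hedged sketch of the usual index arithmetic (the function name is illustrative):

#include <NvInfer.h>

// For an engine with P optimization profiles, bindings form P consecutive
// groups; profile p's copy of a tensor's binding lives at
//   p * (getNbBindings() / P) + getBindingIndex(name).
int BindingIndexForProfile(const nvinfer1::ICudaEngine& engine,
                           int profile_index, const char* tensor_name) {
  const int bindings_per_profile =
      engine.getNbBindings() / engine.getNbOptimizationProfiles();
  const int base = engine.getBindingIndex(tensor_name);  // index within profile 0
  return profile_index * bindings_per_profile + base;
}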