Unverified commit ea5ca555, authored by Yuanle Liu, committed by GitHub

[Paddle-TRT] Support engine memory sharing among multiple predictors (#47631)

Parent d969c309
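In brief, this commit adds AnalysisConfig::EnableTensorRTMemoryOptim, which lets several predictors share one TensorRT engine memory pool. A minimal usage sketch under the paddle_infer C++ API (the model paths, sizes, and main() scaffolding are illustrative, not part of this commit):

#include "paddle_inference_api.h"  // paddle_infer::Config / CreatePredictor

int main() {
  paddle_infer::Config config_a, config_b;
  config_a.SetModel("model_a.pdmodel", "model_a.pdiparams");  // paths illustrative
  config_b.SetModel("model_b.pdmodel", "model_b.pdiparams");
  for (auto* cfg : {&config_a, &config_b}) {
    cfg->EnableUseGpu(256 /*MB*/, 0 /*gpu_id*/);
    cfg->EnableTensorRtEngine();  // TRT must be on before memory optim (see below)
    // A shared positive identifier places both predictors in one sharing group.
    cfg->EnableTensorRTMemoryOptim(true, /*sharing_identifier=*/1);
  }
  auto pred_a = paddle_infer::CreatePredictor(config_a);
  auto pred_b = paddle_infer::CreatePredictor(config_b);
  // pred_a and pred_b now share TRT engine memory and must be run serially.
  return 0;
}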
@@ -679,24 +679,11 @@ void AnalysisConfig::EnableTensorRtEngine(
     bool use_static,
     bool use_calib_mode) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
     return;
   }
   use_tensorrt_ = true;
-#ifdef PADDLE_WITH_TENSORRT
-  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
-  // when trt version less than 7.2,
-  // createExecutionContextWithoutDeviceMemory() has bug.
-  // so, we cannot enable engine context memory sharing.
-#if IS_TRT_VERSION_GE(7200)
-  trt_engine_memory_sharing_ = true;
-#else
-  LOG(WARNING)
-      << "TensorRT engine context memory sharing needs version 7.2 and after.";
-  trt_engine_memory_sharing_ = false;
-#endif
-#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
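The removed block above alludes to the underlying TensorRT mechanism. For readers who don't know it, a minimal sketch of how two engines can share one scratch buffer (illustrative standalone code, not Paddle's implementation; ShareContextMemory is a made-up name and error handling is omitted):

#include <algorithm>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Give two engines' execution contexts one shared scratch buffer.
// Requires TRT >= 7.2, per the version guard this commit removes.
void ShareContextMemory(nvinfer1::ICudaEngine* a, nvinfer1::ICudaEngine* b) {
  // Contexts created this way own no scratch memory of their own ...
  nvinfer1::IExecutionContext* ctx_a = a->createExecutionContextWithoutDeviceMemory();
  nvinfer1::IExecutionContext* ctx_b = b->createExecutionContextWithoutDeviceMemory();
  // ... so both can be pointed at one buffer sized for the larger demand.
  size_t bytes = std::max(a->getDeviceMemorySize(), b->getDeviceMemorySize());
  void* shared = nullptr;
  cudaMalloc(&shared, bytes);
  ctx_a->setDeviceMemory(shared);
  ctx_b->setDeviceMemory(shared);
  // Because both contexts now scribble on the same buffer, they must
  // execute serially -- the origin of the NOTE in the header diff below.
}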
@@ -711,6 +698,30 @@ void AnalysisConfig::EnableTensorRtEngine(
 #endif
 }
+
+void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
+                                               int sharing_identifier) {
+  PADDLE_ENFORCE_EQ(
+      use_tensorrt_,
+      true,
+      platform::errors::InvalidArgument(
+          "To enable TensorRT memory optim, please call "
+          "EnableTensorRtEngine or enable_tensorrt_engine first."));
+  PADDLE_ENFORCE_GE(sharing_identifier,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The value of sharing_identifier must be greater "
+                        "than or equal to 0."));
+  if (!engine_memory_sharing) {
+    PADDLE_ENFORCE_EQ(sharing_identifier,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "The value of sharing_identifier must be equal to 0 "
+                          "when engine_memory_sharing is false."));
+  }
+  trt_engine_memory_sharing_ = engine_memory_sharing;
+  trt_engine_memory_sharing_identifier_ = sharing_identifier;
+}
 
 void AnalysisConfig::EnableDlnne(
     int min_subgraph_size,
     int max_batch_size,
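Taken together, the three PADDLE_ENFORCE checks define the calling contract, and the constructor change further down suggests that identifier 0 scopes sharing to the engines inside a single predictor. A hedged illustration (ConfigureSharing is a made-up wrapper; each comment states the outcome the checks above imply):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureSharing() {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(256, 0);
  // config.EnableTensorRTMemoryOptim();      // would throw: TRT not enabled yet
  config.EnableTensorRtEngine();              // must precede the call below
  config.EnableTensorRTMemoryOptim(true, 5);  // ok: join cross-predictor group 5
  // config.EnableTensorRTMemoryOptim(true, -1);  // would throw: identifier < 0
  // config.EnableTensorRTMemoryOptim(false, 5);  // would throw: identifier must
  //                                              // be 0 when sharing is off
}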
@@ -103,8 +103,13 @@ class AnalysisPredictor : public PaddlePredictor {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
     }
-    predictor_id_ = inference::GetUniqueId();
+    auto trt_identifier = config_.trt_engine_memory_sharing_identifier_;
+    if (trt_identifier > 0) {
+      predictor_id_ = -trt_identifier;
+    } else {
+      predictor_id_ = inference::GetUniqueId();
+    }
   }
   ///
   /// \brief Destroy the Analysis Predictor object
   ///
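The constructor change encodes the sharing group in the sign of predictor_id_: every predictor configured with the same positive identifier gets the same negative id, which downstream code can key a single shared memory pool on, while ordinary predictors keep unique positive ids. A standalone sketch of just that scheme (AssignPredictorId and next_id are hypothetical names, written only to isolate the logic):

#include <atomic>

static std::atomic<int> next_id{0};  // stand-in for inference::GetUniqueId()

int AssignPredictorId(int trt_identifier) {
  // Predictors in a sharing group collapse onto one negative id; all
  // others keep globally unique positive ids.
  return trt_identifier > 0 ? -trt_identifier : ++next_id;
}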
@@ -576,6 +576,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Turn on the TensorRT memory optimization.
+  ///
+  /// \param engine_memory_sharing Whether to enable TensorRT memory
+  /// optimization.
+  /// \param sharing_identifier This parameter can be set if TensorRT memory
+  /// optimization is enabled, and the value must be greater than 0. If you
+  /// have multiple predictors that want to share memory, you can specify the
+  /// same value for these predictors. NOTE: The predictors specified with the
+  /// same value must be guaranteed to be executed serially, otherwise
+  /// undefined behavior will occur.
+  ///
+  void EnableTensorRTMemoryOptim(bool engine_memory_sharing = true,
+                                 int sharing_identifier = 0);
+  ///
   /// \brief A boolean state telling whether the tensorrt engine memory sharing
   /// is activated.
   ///
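The NOTE above leaves serialization to the caller; wrapping Run() in a mutex is one way to satisfy it. A sketch, with RunShared and the mutex name purely illustrative:

#include <mutex>

#include "paddle_inference_api.h"

std::mutex trt_group_mutex;  // one mutex per sharing group

void RunShared(paddle_infer::Predictor* pred) {
  // Serialize Run() so two predictors never touch the shared engine
  // scratch memory at the same time.
  std::lock_guard<std::mutex> lock(trt_group_mutex);
  pred->Run();
}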
@@ -1093,6 +1107,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
   bool trt_engine_memory_sharing_{false};
+  int trt_engine_memory_sharing_identifier_{0};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
@@ -32,6 +32,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -732,6 +733,10 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = false,
           py::arg("use_calib_mode") = true)
+      .def("enable_tensorrt_memory_optim",
+           &AnalysisConfig::EnableTensorRTMemoryOptim,
+           py::arg("engine_memory_sharing") = true,
+           py::arg("sharing_identifier") = 0)
       .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
       .def("set_trt_dynamic_shape_info",
           &AnalysisConfig::SetTRTDynamicShapeInfo,