From ea5ca5559bdc6e2e428e3544e28b24c60986572a Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Fri, 2 Dec 2022 20:46:39 +0800
Subject: [PATCH] [Paddle-TRT] Support engine sharing memory of multiple
 predictors (#47631)

---
 paddle/fluid/inference/api/analysis_config.cc | 39 ++++++++++++-------
 .../fluid/inference/api/analysis_predictor.h  |  7 +++-
 .../inference/api/paddle_analysis_config.h    | 15 +++++++
 paddle/fluid/pybind/inference_api.cc          |  5 +++
 4 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 22b8e4487d1..7720fab31e2 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -679,24 +679,11 @@ void AnalysisConfig::EnableTensorRtEngine(
     bool use_calib_mode) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
     return;
   }
 
   use_tensorrt_ = true;
-#ifdef PADDLE_WITH_TENSORRT
-  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
-  // when trt version less than 7.2,
-  // createExecutionContextWithoutDeviceMemory() has bug.
-  // so, we cannot enable engine context memory sharing.
-#if IS_TRT_VERSION_GE(7200)
-  trt_engine_memory_sharing_ = true;
-#else
-  LOG(WARNING)
-      << "TensorRT engine context memory sharing needs version 7.2 and after.";
-  trt_engine_memory_sharing_ = false;
-#endif
-#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -711,6 +698,30 @@ void AnalysisConfig::EnableTensorRtEngine(
 #endif
 }
 
+void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
+                                               int sharing_identifier) {
+  PADDLE_ENFORCE_EQ(
+      use_tensorrt_,
+      true,
+      platform::errors::InvalidArgument(
+          "To enable TensorRT memory optim, please call "
+          "EnableTensorRtEngine or enable_tensorrt_engine first."));
+  PADDLE_ENFORCE_GE(sharing_identifier,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The value of sharing_identifier must be greater "
+                        "than or equal to 0."));
+  if (!engine_memory_sharing) {
+    PADDLE_ENFORCE_EQ(sharing_identifier,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "The value of sharing_identifier must be equal to 0 "
+                          "when engine_memory_sharing is false."));
+  }
+  trt_engine_memory_sharing_ = engine_memory_sharing;
+  trt_engine_memory_sharing_identifier_ = sharing_identifier;
+}
+
 void AnalysisConfig::EnableDlnne(
     int min_subgraph_size,
     int max_batch_size,
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 25595d12cb4..09e1b43377c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -103,7 +103,12 @@ class AnalysisPredictor : public PaddlePredictor {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
     }
-    predictor_id_ = inference::GetUniqueId();
+    auto trt_identifier = config_.trt_engine_memory_sharing_identifier_;
+    if (trt_identifier > 0) {
+      predictor_id_ = -trt_identifier;
+    } else {
+      predictor_id_ = inference::GetUniqueId();
+    }
   }
   ///
   /// \brief Destroy the Analysis Predictor object
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 0fef4f6ced5..a8f645680a9 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -576,6 +576,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Turn on the TensorRT memory optimization.
+  ///
+  /// \param engine_memory_sharing Whether to enable TensorRT memory
+  /// optimization.
+  /// \param sharing_identifier This parameter can be set if TensorRT memory
+  /// optimization is enabled, and the value must be greater than 0. If you have
+  /// multiple predictors that want to share memory, you can specify a
+  /// same value for these predictors. NOTE: The predictors specified with the
+  /// same value must be guaranteed to be executed serially, otherwise undefined
+  /// behavior will occur.
+  ///
+  void EnableTensorRTMemoryOptim(bool engine_memory_sharing = true,
+                                 int sharing_identifier = 0);
+  ///
   /// \brief A boolean state telling whether the tensorrt engine memory sharing
   /// is activated.
   ///
@@ -1093,6 +1107,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
   bool trt_engine_memory_sharing_{false};
+  int trt_engine_memory_sharing_identifier_{0};
 
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 83db629dc89..1524c1f29d6 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -32,6 +32,7 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -732,6 +733,10 @@ void BindAnalysisConfig(py::module *m) {
       py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
       py::arg("use_static") = false,
       py::arg("use_calib_mode") = true)
+      .def("enable_tensorrt_memory_optim",
+           &AnalysisConfig::EnableTensorRTMemoryOptim,
+           py::arg("engine_memory_sharing") = true,
+           py::arg("sharing_identifier") = 0)
      .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
      .def("set_trt_dynamic_shape_info",
           &AnalysisConfig::SetTRTDynamicShapeInfo,
-- 
GitLab
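
A minimal C++ usage sketch of the API introduced by this patch. The model paths, memory-pool sizes, and the MakePredictor helper are illustrative assumptions rather than part of the change; only SetModel, EnableUseGpu, EnableTensorRtEngine, EnableTensorRTMemoryOptim, and CreatePredictor come from the Paddle Inference API. Predictors configured with the same positive sharing_identifier share one TensorRT engine context memory pool and, as the header comment warns, must be executed serially.

// Sketch only: two predictors sharing TensorRT engine context memory.
#include <memory>

#include "paddle_inference_api.h"  // shipped with the Paddle Inference library

std::shared_ptr<paddle_infer::Predictor> MakePredictor(int sharing_id) {
  paddle_infer::Config config;
  // Hypothetical model files; replace with a real exported model.
  config.SetModel("model/inference.pdmodel", "model/inference.pdiparams");
  config.EnableUseGpu(256 /*memory pool MB*/, 0 /*GPU id*/);
  config.EnableTensorRtEngine(1 << 28 /*workspace_size*/,
                              1 /*max_batch_size*/,
                              3 /*min_subgraph_size*/,
                              paddle_infer::Config::Precision::kFloat32,
                              false /*use_static*/,
                              false /*use_calib_mode*/);
  // New in this patch: must be called after EnableTensorRtEngine. Predictors
  // created with the same positive identifier share engine context memory.
  config.EnableTensorRTMemoryOptim(true, sharing_id);
  return paddle_infer::CreatePredictor(config);
}

int main() {
  auto pred_a = MakePredictor(1);
  auto pred_b = MakePredictor(1);  // same identifier -> shared engine memory
  // The two predictors must be run serially, never concurrently.
  return 0;
}

The analysis_predictor.h hunk shows why this works: a positive sharing_identifier is negated and stored as predictor_id_, so every predictor created with the same identifier resolves to the same engine-memory owner instead of receiving a unique id.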