Unverified commit ea5ca555, authored by Yuanle Liu and committed by GitHub

[Paddle-TRT] Support engine sharing memory of multiple predictors (#47631)

Parent commit: d969c309
@@ -679,24 +679,11 @@ void AnalysisConfig::EnableTensorRtEngine(
     bool use_calib_mode) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
     return;
   }
   use_tensorrt_ = true;
-#ifdef PADDLE_WITH_TENSORRT
-  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
-  // when trt version less than 7.2,
-  // createExecutionContextWithoutDeviceMemory() has bug.
-  // so, we cannot enable engine context memory sharing.
-#if IS_TRT_VERSION_GE(7200)
-  trt_engine_memory_sharing_ = true;
-#else
-  LOG(WARNING)
-      << "TensorRT engine context memory sharing needs version 7.2 and after.";
-  trt_engine_memory_sharing_ = false;
-#endif
-#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -711,6 +698,30 @@ void AnalysisConfig::EnableTensorRtEngine(
 #endif
 }
+void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
+                                               int sharing_identifier) {
+  PADDLE_ENFORCE_EQ(
+      use_tensorrt_,
+      true,
+      platform::errors::InvalidArgument(
+          "To enable TensorRT memory optim, please call "
+          "EnableTensorRtEngine or enable_tensorrt_engine first."));
+  PADDLE_ENFORCE_GE(sharing_identifier,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The value of sharing_identifier must be greater "
+                        "than or equal to 0."));
+  if (!engine_memory_sharing) {
+    PADDLE_ENFORCE_EQ(sharing_identifier,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "The value of sharing_identifier must be equal to 0 "
+                          "when engine_memory_sharing is false."));
+  }
+  trt_engine_memory_sharing_ = engine_memory_sharing;
+  trt_engine_memory_sharing_identifier_ = sharing_identifier;
+}
 void AnalysisConfig::EnableDlnne(
     int min_subgraph_size,
     int max_batch_size,
......
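The version-gated flag removed above is replaced by an explicit opt-in API. Below is a minimal usage sketch (not part of this commit; the model directory is a placeholder) of the calling order that the new checks enforce: the GPU and the TensorRT engine must be enabled before requesting memory optimization, and sharing_identifier must be >= 0 (and exactly 0 whenever engine_memory_sharing is false).

// Sketch only: illustrates the ordering and argument checks added above.
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("./model_dir");  // hypothetical model directory
  config.EnableUseGpu(100, 0);                 // required before EnableTensorRtEngine
  config.EnableTensorRtEngine(1 << 30, 1, 3);  // sets use_tensorrt_, checked by the first enforce
  config.EnableTensorRTMemoryOptim(true, /*sharing_identifier=*/1);  // identifier must be >= 0
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}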
@@ -103,8 +103,13 @@ class AnalysisPredictor : public PaddlePredictor {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
     }
+    auto trt_identifier = config_.trt_engine_memory_sharing_identifier_;
+    if (trt_identifier > 0) {
+      predictor_id_ = -trt_identifier;
+    } else {
       predictor_id_ = inference::GetUniqueId();
     }
+  }
   ///
   /// \brief Destroy the Analysis Predictor object
   ///
......
@@ -576,6 +576,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Turn on the TensorRT memory optimization.
+  ///
+  /// \param engine_memory_sharing Whether to enable TensorRT memory
+  /// optimization.
+  /// \param sharing_identifier This parameter can be set when TensorRT memory
+  /// optimization is enabled, and its value must be greater than 0. If you
+  /// have multiple predictors that want to share memory, specify the same
+  /// value for these predictors. NOTE: The predictors specified with the same
+  /// value must be guaranteed to be executed serially, otherwise undefined
+  /// behavior will occur.
+  ///
+  void EnableTensorRTMemoryOptim(bool engine_memory_sharing = true,
+                                 int sharing_identifier = 0);
+  ///
   /// \brief A boolean state telling whether the tensorrt engine memory sharing
   /// is activated.
   ///
@@ -1093,6 +1107,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
   bool trt_engine_memory_sharing_{false};
+  int trt_engine_memory_sharing_identifier_{0};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
@@ -32,6 +32,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -732,6 +733,10 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = false,
           py::arg("use_calib_mode") = true)
+      .def("enable_tensorrt_memory_optim",
+           &AnalysisConfig::EnableTensorRTMemoryOptim,
+           py::arg("engine_memory_sharing") = true,
+           py::arg("sharing_identifier") = 0)
      .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
      .def("set_trt_dynamic_shape_info",
           &AnalysisConfig::SetTRTDynamicShapeInfo,
......
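Tying the pieces together, here is a hedged end-to-end sketch (not from the commit; the model paths and the helper name are hypothetical) of two predictors sharing one TensorRT engine memory block by passing the same sharing_identifier, and being run strictly one after another as the header comment requires.

#include <memory>
#include <string>

#include "paddle_inference_api.h"

// Hypothetical helper: builds a GPU + TensorRT predictor that opts into
// engine memory sharing under the given identifier.
std::shared_ptr<paddle_infer::Predictor> MakeSharingPredictor(
    const std::string &model_dir, int identifier) {
  paddle_infer::Config config(model_dir);
  config.EnableUseGpu(100, 0);
  config.EnableTensorRtEngine();
  config.EnableTensorRTMemoryOptim(true, identifier);
  return paddle_infer::CreatePredictor(config);
}

int main() {
  // Same identifier => both predictors get predictor_id_ = -1 and share engine memory.
  auto p1 = MakeSharingPredictor("./model_a", /*identifier=*/1);
  auto p2 = MakeSharingPredictor("./model_b", /*identifier=*/1);
  // Feed inputs via GetInputHandle() before calling Run(); the predictors must
  // execute serially, never concurrently, because they share engine memory.
  p1->Run();
  p2->Run();
  return 0;
}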