Unverified commit ea5ca555, authored by Yuanle Liu, committed by GitHub

[Paddle-TRT] Support engine memory sharing among multiple predictors (#47631)

Parent d969c309
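In brief, this commit adds AnalysisConfig::EnableTensorRTMemoryOptim, which lets several predictors share one TensorRT engine memory pool. A minimal usage sketch under the paddle_infer C++ API (the model paths, sizes, and main() scaffolding are illustrative, not part of this commit):

#include "paddle_inference_api.h"  // paddle_infer::Config / CreatePredictor

int main() {
  paddle_infer::Config config_a, config_b;
  config_a.SetModel("model_a.pdmodel", "model_a.pdiparams");  // paths illustrative
  config_b.SetModel("model_b.pdmodel", "model_b.pdiparams");
  for (auto* cfg : {&config_a, &config_b}) {
    cfg->EnableUseGpu(256 /*MB*/, 0 /*gpu_id*/);
    cfg->EnableTensorRtEngine();  // TRT must be on before memory optim (see below)
    // A shared positive identifier places both predictors in one sharing group.
    cfg->EnableTensorRTMemoryOptim(true, /*sharing_identifier=*/1);
  }
  auto pred_a = paddle_infer::CreatePredictor(config_a);
  auto pred_b = paddle_infer::CreatePredictor(config_b);
  // pred_a and pred_b now share TRT engine memory and must be run serially.
  return 0;
}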
@@ -679,24 +679,11 @@ void AnalysisConfig::EnableTensorRtEngine(
     bool use_static,
     bool use_calib_mode) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
     return;
   }
   use_tensorrt_ = true;
-#ifdef PADDLE_WITH_TENSORRT
-  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
-  // when trt version less than 7.2,
-  // createExecutionContextWithoutDeviceMemory() has bug.
-  // so, we cannot enable engine context memory sharing.
-#if IS_TRT_VERSION_GE(7200)
-  trt_engine_memory_sharing_ = true;
-#else
-  LOG(WARNING)
-      << "TensorRT engine context memory sharing needs version 7.2 and after.";
-  trt_engine_memory_sharing_ = false;
-#endif
-#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
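The removed block above alludes to the underlying TensorRT mechanism. For readers who don't know it, a minimal sketch of how two engines can share one scratch buffer (illustrative standalone code, not Paddle's implementation; ShareContextMemory is a made-up name and error handling is omitted):

#include <algorithm>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Give two engines' execution contexts one shared scratch buffer.
// Requires TRT >= 7.2, per the version guard this commit removes.
void ShareContextMemory(nvinfer1::ICudaEngine* a, nvinfer1::ICudaEngine* b) {
  // Contexts created this way own no scratch memory of their own ...
  nvinfer1::IExecutionContext* ctx_a = a->createExecutionContextWithoutDeviceMemory();
  nvinfer1::IExecutionContext* ctx_b = b->createExecutionContextWithoutDeviceMemory();
  // ... so both can be pointed at one buffer sized for the larger demand.
  size_t bytes = std::max(a->getDeviceMemorySize(), b->getDeviceMemorySize());
  void* shared = nullptr;
  cudaMalloc(&shared, bytes);
  ctx_a->setDeviceMemory(shared);
  ctx_b->setDeviceMemory(shared);
  // Because both contexts now scribble on the same buffer, they must
  // execute serially -- the origin of the NOTE in the header diff below.
}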
@@ -711,6 +698,30 @@ void AnalysisConfig::EnableTensorRtEngine(
 #endif
 }
+
+void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
+                                               int sharing_identifier) {
+  PADDLE_ENFORCE_EQ(
+      use_tensorrt_,
+      true,
+      platform::errors::InvalidArgument(
+          "To enable TensorRT memory optim, please call "
+          "EnableTensorRtEngine or enable_tensorrt_engine first."));
+  PADDLE_ENFORCE_GE(sharing_identifier,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The value of sharing_identifier must be greater "
+                        "than or equal to 0."));
+  if (!engine_memory_sharing) {
+    PADDLE_ENFORCE_EQ(sharing_identifier,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "The value of sharing_identifier must be equal to 0 "
+                          "when engine_memory_sharing is false."));
+  }
+  trt_engine_memory_sharing_ = engine_memory_sharing;
+  trt_engine_memory_sharing_identifier_ = sharing_identifier;
+}
 
 void AnalysisConfig::EnableDlnne(
     int min_subgraph_size,
     int max_batch_size,
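Taken together, the three PADDLE_ENFORCE checks define the calling contract, and the constructor change further down suggests that identifier 0 scopes sharing to the engines inside a single predictor. A hedged illustration (ConfigureSharing is a made-up wrapper; each comment states the outcome the checks above imply):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureSharing() {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(256, 0);
  // config.EnableTensorRTMemoryOptim();      // would throw: TRT not enabled yet
  config.EnableTensorRtEngine();              // must precede the call below
  config.EnableTensorRTMemoryOptim(true, 5);  // ok: join cross-predictor group 5
  // config.EnableTensorRTMemoryOptim(true, -1);  // would throw: identifier < 0
  // config.EnableTensorRTMemoryOptim(false, 5);  // would throw: identifier must
  //                                              // be 0 when sharing is off
}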
@@ -103,8 +103,13 @@ class AnalysisPredictor : public PaddlePredictor {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
     }
-    predictor_id_ = inference::GetUniqueId();
+    auto trt_identifier = config_.trt_engine_memory_sharing_identifier_;
+    if (trt_identifier > 0) {
+      predictor_id_ = -trt_identifier;
+    } else {
+      predictor_id_ = inference::GetUniqueId();
+    }
   }
   ///
   /// \brief Destroy the Analysis Predictor object
   ///
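The constructor change encodes the sharing group in the sign of predictor_id_: every predictor configured with the same positive identifier gets the same negative id, which downstream code can key a single shared memory pool on, while ordinary predictors keep unique positive ids. A standalone sketch of just that scheme (AssignPredictorId and next_id are hypothetical names, written only to isolate the logic):

#include <atomic>

static std::atomic<int> next_id{0};  // stand-in for inference::GetUniqueId()

int AssignPredictorId(int trt_identifier) {
  // Predictors in a sharing group collapse onto one negative id; all
  // others keep globally unique positive ids.
  return trt_identifier > 0 ? -trt_identifier : ++next_id;
}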
@@ -576,6 +576,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Turn on the TensorRT memory optimization.
+  ///
+  /// \param engine_memory_sharing Whether to enable TensorRT memory
+  /// optimization.
+  /// \param sharing_identifier This parameter can be set if TensorRT memory
+  /// optimization is enabled, and the value must be greater than 0. If you
+  /// have multiple predictors that want to share memory, you can specify the
+  /// same value for these predictors. NOTE: The predictors specified with the
+  /// same value must be guaranteed to be executed serially, otherwise
+  /// undefined behavior will occur.
+  ///
+  void EnableTensorRTMemoryOptim(bool engine_memory_sharing = true,
+                                 int sharing_identifier = 0);
+  ///
   /// \brief A boolean state telling whether the tensorrt engine memory sharing
   /// is activated.
   ///
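The NOTE above leaves serialization to the caller; wrapping Run() in a mutex is one way to satisfy it. A sketch, with RunShared and the mutex name purely illustrative:

#include <mutex>

#include "paddle_inference_api.h"

std::mutex trt_group_mutex;  // one mutex per sharing group

void RunShared(paddle_infer::Predictor* pred) {
  // Serialize Run() so two predictors never touch the shared engine
  // scratch memory at the same time.
  std::lock_guard<std::mutex> lock(trt_group_mutex);
  pred->Run();
}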
@@ -1093,6 +1107,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
   bool trt_engine_memory_sharing_{false};
+  int trt_engine_memory_sharing_identifier_{0};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
@@ -32,6 +32,7 @@
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
+#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -732,6 +733,10 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = false,
           py::arg("use_calib_mode") = true)
+      .def("enable_tensorrt_memory_optim",
+           &AnalysisConfig::EnableTensorRTMemoryOptim,
+           py::arg("engine_memory_sharing") = true,
+           py::arg("sharing_identifier") = 0)
       .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
       .def("set_trt_dynamic_shape_info",
           &AnalysisConfig::SetTRTDynamicShapeInfo,