Unverified commit 02621079, authored by Yuanle Liu, committed via GitHub

TensorRT Engine context memory bind with predictor id (#45468)

Parent e10e26e7
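This commit keys TensorRT execution contexts by a per-predictor id instead of by std::thread::id: TensorRTEngine gains a thread_local static predictor_id_per_thread, AnalysisPredictor publishes its predictor_id_ into it before every run, and the engine looks contexts up in a map keyed by that id. Below is a minimal standalone sketch of that pattern (hypothetical Engine/Predictor/ExecutionContext names, not the actual Paddle or TensorRT classes):

// Minimal sketch of the pattern introduced by this commit (hypothetical
// Engine/Predictor/ExecutionContext names, not the real Paddle/TensorRT
// classes): the engine keeps one execution context per predictor id, and each
// predictor publishes its id through a thread_local before it runs, so the
// engine can pick the right context from whatever thread the call lands on.
#include <iostream>
#include <memory>
#include <mutex>
#include <unordered_map>

struct ExecutionContext {  // stand-in for nvinfer1::IExecutionContext
  explicit ExecutionContext(int owner) : owner_id(owner) {}
  int owner_id;
};

class Engine {
 public:
  // Each thread sees the id of the predictor currently running on it.
  thread_local static int predictor_id_per_thread;

  ExecutionContext* context() {
    std::unique_lock<std::mutex> lock(mutex_);
    auto it = contexts_.find(predictor_id_per_thread);
    if (it == contexts_.end()) {  // lazily create one context per predictor
      it = contexts_
               .emplace(predictor_id_per_thread,
                        std::make_unique<ExecutionContext>(
                            predictor_id_per_thread))
               .first;
    }
    return it->second.get();
  }

 private:
  std::mutex mutex_;
  std::unordered_map<int, std::unique_ptr<ExecutionContext>> contexts_;
};

thread_local int Engine::predictor_id_per_thread = -1;

class Predictor {
 public:
  Predictor(int id, Engine* engine) : id_(id), engine_(engine) {}
  void Run() {
    // What AnalysisPredictor::Run()/ZeroCopyRun() now do before executing.
    Engine::predictor_id_per_thread = id_;
    std::cout << "predictor " << id_ << " uses the context owned by predictor "
              << engine_->context()->owner_id << "\n";
  }

 private:
  int id_;
  Engine* engine_;
};

int main() {
  Engine engine;  // one engine shared by two predictors
  Predictor p1(1, &engine);
  Predictor p2(2, &engine);
  p1.Run();  // each predictor gets its own context,
  p2.Run();  // even though both calls happen on the same thread
  return 0;
}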
paddle/fluid/inference/api/analysis_predictor.cc

@@ -907,6 +907,15 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
   // Run the inference program
   // if share variables, we need not create variables
   executor_->Run();
@@ -1630,6 +1639,16 @@ bool AnalysisPredictor::ZeroCopyRun() {
     MkldnnPreSet(shape_vector);
   }
 #endif
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
   executor_->Run();
   if (config_.shape_range_info_collected()) {
...
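Both AnalysisPredictor::Run and AnalysisPredictor::ZeroCopyRun now write the predictor id into the thread_local before calling executor_->Run(). Because thread_local storage is per thread, the id has to be republished on whichever thread drives the call; the short standalone sketch below (hypothetical function names, not Paddle code) shows that behaviour.

// Sketch (simplified, hypothetical function name) of why the id is written on
// every Run(): thread_local storage is per thread, so a predictor driven from
// a thread pool must republish its id on whichever thread executes the call.
#include <iostream>
#include <thread>

thread_local int predictor_id_per_thread = -1;  // as in TensorRTEngine

void RunOnThisThread(int predictor_id) {
  predictor_id_per_thread = predictor_id;  // what Run()/ZeroCopyRun() do
  std::cout << "thread " << std::this_thread::get_id() << " runs predictor "
            << predictor_id_per_thread << "\n";
}

int main() {
  std::thread t1(RunOnThisThread, 7);  // the same predictor id ...
  std::thread t2(RunOnThisThread, 7);  // ... driven from two different threads
  t1.join();
  t2.join();
  // The main thread still sees the default value: it was only ever set on t1
  // and t2, which is why each Run() call has to set it again.
  std::cout << "main thread sees " << predictor_id_per_thread << "\n";
  return 0;
}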
paddle/fluid/inference/tensorrt/engine.cc

@@ -30,6 +30,9 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
+int TensorRTEngine::runtime_batch_ = 1;
+thread_local int TensorRTEngine::predictor_id_per_thread = -1;
 void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   nvinfer1::DataType nv_type = nvinfer1::DataType::kFLOAT;
   switch (type) {
@@ -59,8 +62,6 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   w_.type = nv_type;
 }
-int TensorRTEngine::runtime_batch_ = 1;
 void TensorRTEngine::InitNetwork() {
   freshDeviceId();
   infer_builder_.reset(createInferBuilder(&logger_));
@@ -680,8 +681,9 @@ void TensorRTEngine::GetEngineInfo() {
   LOG(INFO) << "====== engine info ======";
   std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
       infer_engine_->createEngineInspector());
-  auto infer_context = context();
-  infer_inspector->setExecutionContext(infer_context);
+  auto infer_context = infer_ptr<nvinfer1::IExecutionContext>(
+      infer_engine_->createExecutionContextWithoutDeviceMemory());
+  infer_inspector->setExecutionContext(infer_context.get());
   LOG(INFO) << infer_inspector->getEngineInformation(
       nvinfer1::LayerInformationFormat::kONELINE);
   LOG(INFO) << "====== engine info end ======";
...
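GetEngineInfo() no longer calls context(), which would now require predictor_id_per_thread to be set and would register a full per-predictor execution context; instead the engine inspector is handed a temporary context created without device memory. A minimal sketch of that inspector pattern, assuming an already built nvinfer1::ICudaEngine (TensorRT 8.2+); the helper name and the use of std::unique_ptr instead of Paddle's infer_ptr are assumptions for illustration:

// Sketch of the engine-inspector pattern used in GetEngineInfo() above.
// The temporary context is created without device memory, so inspecting the
// engine neither touches the per-predictor context map nor allocates GPU
// workspace.
#include <NvInfer.h>

#include <iostream>
#include <memory>

void PrintEngineInfo(nvinfer1::ICudaEngine* engine) {
  std::unique_ptr<nvinfer1::IEngineInspector> inspector(
      engine->createEngineInspector());
  std::unique_ptr<nvinfer1::IExecutionContext> context(
      engine->createExecutionContextWithoutDeviceMemory());
  inspector->setExecutionContext(context.get());
  // One line of information per layer of the optimized engine.
  std::cout << inspector->getEngineInformation(
                   nvinfer1::LayerInformationFormat::kONELINE)
            << std::endl;
}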
paddle/fluid/inference/tensorrt/engine.h

@@ -177,6 +177,7 @@ class TRTInt8Calibrator;
 class TensorRTEngine {
   using DescType = ::paddle::framework::proto::BlockDesc;
   using ShapeMapType = std::map<std::string, std::vector<int>>;
+  using PredictorID = int;

  public:
   // Weight is model parameter.
@@ -286,9 +287,17 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }

   nvinfer1::IExecutionContext* context() {
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
     std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
-    if (infer_context_.find(tid) == infer_context_.end()) {
+    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
       PADDLE_ENFORCE_NOT_NULL(
           infer_engine_,
           platform::errors::InvalidArgument(
@@ -296,24 +305,34 @@ class TensorRTEngine {
       // We may see trt warning: Profile 0 has been chosen by another
       // IExecutionContext...
       // It's ok. We will set it later.
-      infer_context_[tid].reset(infer_engine_->createExecutionContext());
+      infer_context_[predictor_id_per_thread].reset(
+          infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
         if (cur_profile_num_ > 0) {
-          infer_context_[tid]->setOptimizationProfile(cur_profile_num_);
+          infer_context_[predictor_id_per_thread]->setOptimizationProfile(
+              cur_profile_num_);
         }
-        profile_index_[tid] = cur_profile_num_;
+        profile_index_[predictor_id_per_thread] = cur_profile_num_;
         ++cur_profile_num_;
       }
     }
-    return infer_context_[tid].get();
+    return infer_context_[predictor_id_per_thread].get();
   }
   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
+#ifndef PADDLE_WITH_TESTING
+      PADDLE_ENFORCE_GT(
+          predictor_id_per_thread,
+          -1,
+          platform::errors::InvalidArgument(
+              "thread local var predictor_id_per_thread must be "
+              "initialized to >= 0, but now predictor_id_per_thread = %d",
+              predictor_id_per_thread));
+#endif
       std::unique_lock<std::mutex> lock(mutex_);
-      const std::thread::id tid = std::this_thread::get_id();
-      return profile_index_[tid];
+      return profile_index_[predictor_id_per_thread];
     } else {
       return 0;
     }
@@ -326,14 +345,22 @@ class TensorRTEngine {
   int GetNbBindings() { return binding_num_; }

   void ResetContext() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
     PADDLE_ENFORCE_NOT_NULL(
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
-    infer_context_[tid].reset(nullptr);
-    infer_context_.erase(tid);
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
+    std::unique_lock<std::mutex> lock(mutex_);
+    infer_context_[predictor_id_per_thread].reset(nullptr);
+    infer_context_.erase(predictor_id_per_thread);
   }

   nvinfer1::IHostMemory* Serialize() {
@@ -686,7 +713,7 @@ class TensorRTEngine {
   int device_id_;
   int max_profile_num_{1};
   int cur_profile_num_{0};
-  std::unordered_map<std::thread::id, int> profile_index_;
+  std::unordered_map<PredictorID, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
@@ -723,7 +750,7 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::IBuilder> infer_builder_;
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
-  std::unordered_map<std::thread::id, infer_ptr<nvinfer1::IExecutionContext>>
+  std::unordered_map<PredictorID, infer_ptr<nvinfer1::IExecutionContext>>
       infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
   std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
@@ -741,6 +768,9 @@ class TensorRTEngine {
 #endif
   std::mutex mutex_;
   bool use_inspector_;
+
+ public:
+  thread_local static int predictor_id_per_thread;
 };  // class TensorRTEngine

 // Add a layer__ into engine__ with args ARGS.
...
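When dynamic shape is enabled, context() above pairs every newly created per-predictor execution context with its own optimization profile index (profile_index_ and cur_profile_num_), which GetProfileIndex() then returns for the calling predictor. Below is a standalone sketch of just that bookkeeping (hypothetical ProfileRegistry name; the real logic lives inside TensorRTEngine):

// Standalone sketch of the profile bookkeeping in context()/GetProfileIndex()
// above: the first predictor to create a context keeps profile 0, and each
// later predictor is assigned the next profile index, which is returned on
// every subsequent lookup.
#include <iostream>
#include <mutex>
#include <unordered_map>

class ProfileRegistry {
 public:
  // Returns the optimization profile index for this predictor, assigning a
  // fresh one the first time the predictor shows up.
  int ProfileFor(int predictor_id) {
    std::unique_lock<std::mutex> lock(mutex_);
    auto it = profile_index_.find(predictor_id);
    if (it == profile_index_.end()) {
      it = profile_index_.emplace(predictor_id, cur_profile_num_++).first;
    }
    return it->second;
  }

 private:
  std::mutex mutex_;
  int cur_profile_num_{0};
  std::unordered_map<int, int> profile_index_;
};

int main() {
  ProfileRegistry registry;
  std::cout << registry.ProfileFor(1) << "\n";  // 0: first predictor
  std::cout << registry.ProfileFor(2) << "\n";  // 1: second predictor
  std::cout << registry.ProfileFor(1) << "\n";  // 0 again: already assigned
  return 0;
}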