diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index fa927a7da225f7527297e71b0f4913fb19196fe1..f5a51b7c3bc4e77864451362fe4b3dbf6643da29 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -907,6 +907,15 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
+
   // Run the inference program
   // if share variables, we need not create variables
   executor_->Run();
@@ -1630,6 +1639,16 @@ bool AnalysisPredictor::ZeroCopyRun() {
     MkldnnPreSet(shape_vector);
   }
 #endif
+
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
+
   executor_->Run();
 
   if (config_.shape_range_info_collected()) {
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 745f4163cfd5d4ec79e453e8d42f422ad65f8979..ebce2508c04325a0eb0c0ff6f1072b7c58c0a62d 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -30,6 +30,9 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+int TensorRTEngine::runtime_batch_ = 1;
+thread_local int TensorRTEngine::predictor_id_per_thread = -1;
+
 void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   nvinfer1::DataType nv_type = nvinfer1::DataType::kFLOAT;
   switch (type) {
@@ -59,8 +62,6 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   w_.type = nv_type;
 }
 
-int TensorRTEngine::runtime_batch_ = 1;
-
 void TensorRTEngine::InitNetwork() {
   freshDeviceId();
   infer_builder_.reset(createInferBuilder(&logger_));
@@ -680,8 +681,9 @@ void TensorRTEngine::GetEngineInfo() {
   LOG(INFO) << "====== engine info ======";
   std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
       infer_engine_->createEngineInspector());
-  auto infer_context = context();
-  infer_inspector->setExecutionContext(infer_context);
+  auto infer_context = infer_ptr<nvinfer1::IExecutionContext>(
+      infer_engine_->createExecutionContextWithoutDeviceMemory());
+  infer_inspector->setExecutionContext(infer_context.get());
   LOG(INFO) << infer_inspector->getEngineInformation(
       nvinfer1::LayerInformationFormat::kONELINE);
   LOG(INFO) << "====== engine info end ======";
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 8aa8c98cdc5e155c4cc498ac3d5f29abccccf6fa..861a2aa8dfbd7d20536eb2ef031316d032ae1437 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -177,6 +177,7 @@ class TRTInt8Calibrator;
 class TensorRTEngine {
   using DescType = ::paddle::framework::proto::BlockDesc;
   using ShapeMapType = std::map<std::string, std::vector<int>>;
+  using PredictorID = int;
 
  public:
   // Weight is model parameter.
@@ -286,9 +287,17 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
 
   nvinfer1::IExecutionContext* context() {
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
     std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
-    if (infer_context_.find(tid) == infer_context_.end()) {
+    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
       PADDLE_ENFORCE_NOT_NULL(
           infer_engine_,
           platform::errors::InvalidArgument(
@@ -296,24 +305,34 @@ class TensorRTEngine {
       // We may see trt warning: Profile 0 has been chosen by another
       // IExecutionContext...
       // It's ok. We will set it later.
-      infer_context_[tid].reset(infer_engine_->createExecutionContext());
+      infer_context_[predictor_id_per_thread].reset(
+          infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
         if (cur_profile_num_ > 0) {
-          infer_context_[tid]->setOptimizationProfile(cur_profile_num_);
+          infer_context_[predictor_id_per_thread]->setOptimizationProfile(
+              cur_profile_num_);
         }
-        profile_index_[tid] = cur_profile_num_;
+        profile_index_[predictor_id_per_thread] = cur_profile_num_;
         ++cur_profile_num_;
       }
     }
-    return infer_context_[tid].get();
+    return infer_context_[predictor_id_per_thread].get();
   }
 
   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
+#ifndef PADDLE_WITH_TESTING
+      PADDLE_ENFORCE_GT(
+          predictor_id_per_thread,
+          -1,
+          platform::errors::InvalidArgument(
+              "thread local var predictor_id_per_thread must be "
+              "initialized to >= 0, but now predictor_id_per_thread = %d",
+              predictor_id_per_thread));
+#endif
       std::unique_lock<std::mutex> lock(mutex_);
-      const std::thread::id tid = std::this_thread::get_id();
-      return profile_index_[tid];
+      return profile_index_[predictor_id_per_thread];
     } else {
       return 0;
     }
@@ -326,14 +345,22 @@ class TensorRTEngine {
   int GetNbBindings() { return binding_num_; }
 
   void ResetContext() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
     PADDLE_ENFORCE_NOT_NULL(
         infer_engine_,
         platform::errors::InvalidArgument(
            "You should build engine first and then set the context."));
-    infer_context_[tid].reset(nullptr);
-    infer_context_.erase(tid);
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
+    std::unique_lock<std::mutex> lock(mutex_);
+    infer_context_[predictor_id_per_thread].reset(nullptr);
+    infer_context_.erase(predictor_id_per_thread);
   }
 
   nvinfer1::IHostMemory* Serialize() {
@@ -686,7 +713,7 @@ class TensorRTEngine {
   int device_id_;
   int max_profile_num_{1};
   int cur_profile_num_{0};
-  std::unordered_map<std::thread::id, int> profile_index_;
+  std::unordered_map<PredictorID, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
@@ -723,7 +750,7 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::IBuilder> infer_builder_;
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
-  std::unordered_map<std::thread::id, infer_ptr<nvinfer1::IExecutionContext>>
+  std::unordered_map<PredictorID, infer_ptr<nvinfer1::IExecutionContext>>
       infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
   std::unordered_map<std::string, float> quant_dynamic_range_;
@@ -741,6 +768,9 @@ class TensorRTEngine {
 #endif
   std::mutex mutex_;
   bool use_inspector_;
+
+ public:
+  thread_local static int predictor_id_per_thread;
 };  // class TensorRTEngine
 
 // Add a layer__ into engine__ with args ARGS.
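
Note: the following is a minimal standalone sketch of the pattern this patch adopts, not Paddle's actual code; the names Engine, FakeContext, and run are hypothetical stand-ins. It illustrates how each predictor publishes its id into a thread_local variable before executing, and how the engine keys its execution-context map on that id instead of std::this_thread::get_id(), so contexts belong to predictors rather than to whichever thread happens to call in.

// Standalone illustration (C++14); illustrative names only.
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

struct FakeContext {  // stand-in for nvinfer1::IExecutionContext
  explicit FakeContext(int id) : owner(id) {}
  int owner;
};

class Engine {
 public:
  // Set by the caller (the "predictor") before running, analogous to
  // TensorRTEngine::predictor_id_per_thread in the patch.
  thread_local static int predictor_id_per_thread;

  FakeContext* context() {
    std::unique_lock<std::mutex> lock(mutex_);
    auto it = contexts_.find(predictor_id_per_thread);
    if (it == contexts_.end()) {
      // Lazily create one context per predictor id.
      it = contexts_
               .emplace(predictor_id_per_thread,
                        std::make_unique<FakeContext>(predictor_id_per_thread))
               .first;
    }
    return it->second.get();
  }

 private:
  std::mutex mutex_;
  std::unordered_map<int, std::unique_ptr<FakeContext>> contexts_;
};

thread_local int Engine::predictor_id_per_thread = -1;  // -1 means "not set"

int main() {
  Engine engine;
  auto run = [&engine](int predictor_id) {
    // Mirrors what AnalysisPredictor::Run()/ZeroCopyRun() now do before
    // executing: publish the predictor id for this thread.
    Engine::predictor_id_per_thread = predictor_id;
    std::cout << "predictor " << predictor_id << " uses context owned by "
              << engine.context()->owner << "\n";
  };
  std::thread t1(run, 0), t2(run, 1);
  t1.join();
  t2.join();
  return 0;
}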