Unverified commit 173b39bb, authored by Yuanle Liu, committed by GitHub

TensorRT engine context memory sharing (#45842)

Parent commit: d772166c
@@ -314,6 +314,7 @@ struct Argument {
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
// Indicate which kind of sort algorithm is used for operators, the memory
// optimization relays on the sort algorithm.
......
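The new DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool) line is what makes the accessors used later in this diff available: argument->trt_engine_memory_sharing() in the pass manager and argument_.SetTrtEngineMemorySharing(...) in the predictor. As a rough, non-authoritative sketch, the macro can be thought of as generating a getter/setter pair along these lines; the real macro in argument.h also tracks field validity and differs in detail:

```cpp
// Illustrative approximation only, inferred from the call sites in this diff.
// It is NOT the actual DECL_ARGUMENT_FIELD expansion.
struct ArgumentSketch {
  // getter used as argument->trt_engine_memory_sharing()
  bool& trt_engine_memory_sharing() { return trt_engine_memory_sharing_; }
  // setter used as argument_.SetTrtEngineMemorySharing(value)
  void SetTrtEngineMemorySharing(const bool& v) {
    trt_engine_memory_sharing_ = v;
  }

 private:
  bool trt_engine_memory_sharing_{false};
};
```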
@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_calib_mode", new bool(use_calib_mode));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("context_memory_sharing",
new bool(argument->trt_engine_memory_sharing()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
......
@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
// those parameter already exist in trt, and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
@@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine->SetWithErnie(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
......
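Taken together, the two hunks above wire the flag through the IR pass pipeline: IRPassManager stores it on the pass as a "context_memory_sharing" attribute, and TensorRtSubgraphPass reads it back with Get<bool>(...) and forwards it to the engine through SetContextMemorySharing. The sketch below imitates that hand-off with a simplified attribute store built on std::any; the real framework::ir::Pass keeps heap-allocated attribute values behind its own Set/Get, so the class here is purely illustrative:

```cpp
#include <any>
#include <cassert>
#include <map>
#include <string>

// Simplified stand-in for the pass attribute hand-off shown above.
class PassAttrSketch {
 public:
  template <typename T>
  void Set(const std::string& key, T value) {
    attrs_[key] = value;
  }
  template <typename T>
  T Get(const std::string& key) const {
    return std::any_cast<T>(attrs_.at(key));
  }

 private:
  std::map<std::string, std::any> attrs_;
};

int main() {
  PassAttrSketch pass;
  pass.Set("context_memory_sharing", true);                  // ir_pass_manager side
  bool sharing = pass.Get<bool>("context_memory_sharing");   // subgraph pass side
  assert(sharing);
  return 0;
}
```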
@@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
CP_MEMBER(trt_use_inspector_);
CP_MEMBER(trt_engine_memory_sharing_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
@@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine(
}
use_tensorrt_ = true;
#if PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// when trt version less than 7.2,
// createExecutionContextWithoutDeviceMemory() has bug.
// so, we cannot enable engine context memory sharing.
#if IS_TRT_VERSION_GE(7200)
trt_engine_memory_sharing_ = true;
#else
LOG(WARNING)
<< "TensorRT engine context memory sharing needs version 7.2 and after.";
trt_engine_memory_sharing_ = false;
#endif
#endif
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
// TODO(Superjomn) refactor this, buggy.
void AnalysisConfig::Update() {
- auto info = SerializeInfoCache();
+ auto &&info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
// Transfer pass_builder and copy the existing compatible passes.
@@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << trt_dla_core_;
ss << enable_memory_optim_;
ss << trt_engine_memory_sharing_;
ss << use_mkldnn_;
ss << mkldnn_cache_capacity_;
@@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
bool AnalysisConfig::trt_engine_memory_sharing() const {
return trt_engine_memory_sharing_;
}
void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
@@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() {
if (trt_use_dla_) {
os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
}
os.InsertRow({"trt_engine_memory_sharing",
trt_engine_memory_sharing_ ? "true" : "false"});
#endif
}
}
@@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo(
shape_range_info_path_ = shape_range_info_path;
}
- const std::string &AnalysisConfig::shape_range_info_path() {
+ const std::string &AnalysisConfig::shape_range_info_path() const {
return shape_range_info_path_;
}
- bool AnalysisConfig::shape_range_info_collected() {
+ bool AnalysisConfig::shape_range_info_collected() const {
return collect_shape_range_info_;
}
@@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape(
trt_tuned_dynamic_shape_ = true;
}
- bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
+ bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const {
return trt_tuned_dynamic_shape_;
}
- bool AnalysisConfig::trt_allow_build_at_runtime() {
+ bool AnalysisConfig::trt_allow_build_at_runtime() const {
return trt_allow_build_at_runtime_;
}
......
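From an application's point of view, the visible effect of the config changes above is that EnableTensorRtEngine() now turns trt_engine_memory_sharing_ on automatically when the library is built against TensorRT 7.2 or newer, and the new trt_engine_memory_sharing() getter reports it. A rough usage sketch, assuming a GPU build with TensorRT; the model directory, the example argument values, and the include path are placeholders that depend on how the inference library is deployed:

```cpp
#include <iostream>

#include "paddle_inference_api.h"  // placeholder include path for the deployed library

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);
  // workspace size, max batch size and min subgraph size are example values
  config.EnableTensorRtEngine(1 << 30, 1, 3);
  // true on TensorRT >= 7.2 builds; older builds log a warning and keep it off
  std::cout << "trt_engine_memory_sharing: " << std::boolalpha
            << config.trt_engine_memory_sharing() << std::endl;
  return 0;
}
```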
@@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
}
if (config_.dlnne_enabled()) {
@@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() {
memory::Release(place_);
}
device_contexts_.clear();
#ifdef PADDLE_WITH_TENSORRT
if (config_.trt_engine_memory_sharing()) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.releaseContextMemory(predictor_id_);
}
#endif
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
......
@@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
///
/// \brief A boolean state telling whether the tensorrt engine memory sharing
/// is activated.
///
/// \return bool Whether the tensorrt engine memory sharing is activated.
///
bool trt_engine_memory_sharing() const;
///
/// \brief Get the TensorRT engine precision.
///
/// \return Precision Get the TensorRT engine precision.
@@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig {
/// \brief A boolean state telling whether to use tuned tensorrt dynamic
/// shape.
///
- bool tuned_tensorrt_dynamic_shape();
+ bool tuned_tensorrt_dynamic_shape() const;
///
/// \brief A boolean state telling whether to allow building trt engine at
/// runtime.
///
- bool trt_allow_build_at_runtime();
+ bool trt_allow_build_at_runtime() const;
///
/// \brief Set execution stream. If not set a stream will be created
@@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \return the shape info path.
///
- const std::string& shape_range_info_path();
+ const std::string& shape_range_info_path() const;
///
/// \brief A boolean state telling whether to collect shape info.
///
/// \return bool Whether to collect shape info.
///
- bool shape_range_info_collected();
+ bool shape_range_info_collected() const;
///
/// \brief Prevent ops running in Paddle-TRT
@@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig {
// memory reuse related.
bool enable_memory_optim_{false};
bool trt_engine_memory_sharing_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
@@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() {
optim_profiles_[i] = infer_builder_->createOptimizationProfile();
}
nvinfer1::IExecutionContext *TensorRTEngine::context() {
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
nvinfer1::IExecutionContext *infer_context{nullptr};
if (context_memory_sharing_) {
infer_context =
infer_engine_->createExecutionContextWithoutDeviceMemory();
} else {
infer_context = infer_engine_->createExecutionContext();
}
PADDLE_ENFORCE_NOT_NULL(
infer_context,
platform::errors::InvalidArgument(
"TensorRT engine can not build execution context."));
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context->setOptimizationProfile(cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
infer_context_[predictor_id_per_thread].reset(infer_context);
}
return infer_context_[predictor_id_per_thread].get();
}
void TensorRTEngine::Execute(int batch_size,
std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId();
auto infer_context = context();
if (context_memory_sharing_) {
void *context_memory{nullptr};
context_memory =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.getContextMemory(
predictor_id_per_thread,
phi::GPUPlace(device_id_),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}
if (!with_dynamic_shape()) {
infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
} else {
@@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() {
infer_context_.clear();
cur_profile_num_ = 0;
}
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
@@ -417,6 +467,55 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
return &itensor_map_;
}
void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
......
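The engine-side changes above implement the sharing with TensorRT's externally managed context memory: a context created by createExecutionContextWithoutDeviceMemory() owns no scratch space, so before every enqueue Execute() binds a buffer obtained from TRTEngineManager, which keeps one allocation per predictor sized to the largest getDeviceMemorySize() reported by that predictor's engines (engines inside one predictor run sequentially, so a single buffer is enough). Rebinding on every Execute() matters because the buffer may be reallocated when a larger engine is registered later. Below is a stripped-down sketch of the same pattern written directly against the TensorRT API, with Paddle's allocator, per-predictor bookkeeping, and error handling left out:

```cpp
// Standalone sketch of the sharing pattern, not the Paddle implementation.
// Assumes already-built nvinfer1::ICudaEngine pointers; CUDA error checking omitted.
#include <NvInfer.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <vector>

void* shared_scratch = nullptr;
size_t shared_scratch_size = 0;

// Grow the shared scratch buffer so it can serve the largest engine.
void UpdateSharedScratch(const std::vector<nvinfer1::ICudaEngine*>& engines) {
  size_t required = 0;
  for (auto* engine : engines) {
    required = std::max(required, engine->getDeviceMemorySize());
  }
  if (required > shared_scratch_size) {
    cudaFree(shared_scratch);
    cudaMalloc(&shared_scratch, required);
    shared_scratch_size = required;
  }
}

// Create a context that borrows the shared buffer instead of owning its own.
nvinfer1::IExecutionContext* MakeSharingContext(nvinfer1::ICudaEngine* engine) {
  nvinfer1::IExecutionContext* ctx =
      engine->createExecutionContextWithoutDeviceMemory();
  ctx->setDeviceMemory(shared_scratch);  // buffer must stay valid for every enqueue
  return ctx;
}
```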
@@ -16,6 +16,7 @@ limitations under the License. */
#include <NvInfer.h>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>  // NOLINT
@@ -37,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/stream.h"
#include "paddle/utils/any.h" #include "paddle/utils/any.h"
namespace paddle { namespace paddle {
@@ -171,7 +174,7 @@ class TRTInt8Calibrator;
/*
* TensorRT Engine.
*
* There are two alternative ways to use it, one is to build from a paddle
* protobuf model, another way is to manually construct the network.
*/
class TensorRTEngine {
@@ -287,51 +290,10 @@ class TensorRTEngine {
std::unordered_map<std::string, nvinfer1::ITensor*>* GetITensorMap();
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
- nvinfer1::IExecutionContext* context() {
+ nvinfer1::IExecutionContext* context();
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
infer_context_[predictor_id_per_thread].reset(
infer_engine_->createExecutionContext());
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context_[predictor_id_per_thread]->setOptimizationProfile(
cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
}
return infer_context_[predictor_id_per_thread].get();
}
int GetProfileIndex() {
if (max_profile_num_ > 1) {
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
return profile_index_[predictor_id_per_thread];
} else {
@@ -350,15 +312,6 @@ class TensorRTEngine {
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
infer_context_[predictor_id_per_thread].reset(nullptr);
infer_context_.erase(predictor_id_per_thread);
@@ -380,47 +333,7 @@ class TensorRTEngine {
return ihost_memory_.get();
}
- void Deserialize(const std::string& engine_serialized_data) {
+ void Deserialize(const std::string& engine_serialized_data);
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
GetEngineInfo();
}
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
@@ -694,6 +607,10 @@ class TensorRTEngine {
void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
void SetScope(const framework::Scope& scope) { scope_ = &scope; }
void SetContextMemorySharing(bool context_memory_sharing) {
context_memory_sharing_ = context_memory_sharing;
}
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
@@ -714,6 +631,9 @@ class TensorRTEngine {
// batch size of the current data, will be updated each Executation.
int batch_size_{-1};
// use for engine context memory sharing
bool context_memory_sharing_{false};
int device_id_;
int max_profile_num_{1};
int cur_profile_num_{0};
@@ -791,14 +711,23 @@
engine__->network()->add##layer__(__VA_ARGS__)
class TRTEngineManager {
using PredictorID = int;
using AllocationPtr = phi::Allocator::AllocationPtr;
public:
- bool Empty() const { return engines_.size() == 0; }
+ bool Empty() const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.size() == 0;
}
bool Has(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
TensorRTEngine* Get(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.at(name).get();
}
@@ -826,17 +755,21 @@ class TRTEngineManager {
disable_trt_plugin_fp16,
model_precision,
logger);
std::lock_guard<std::mutex> lock(mutex_);
engines_[name].reset(p);
return p;
}
void DeleteAll() {
std::lock_guard<std::mutex> lock(mutex_);
for (auto& item : engines_) {
item.second.reset(nullptr);
}
engines_.clear();
}
void DeleteKey(const std::string& key) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = engines_.find(key);
if (iter != engines_.end()) {
iter->second.reset(nullptr);
@@ -844,7 +777,57 @@
}
}
void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
bool size_updated{false};
{
std::lock_guard<std::mutex> lock(mutex_);
if (max_ctx_mem_size_ < mem_size) {
max_ctx_mem_size_ = mem_size;
size_updated = true;
}
}
if (size_updated) {
releaseContextMemory(predictor_id);
}
}
void* getContextMemory(PredictorID predictor_id,
const phi::GPUPlace& place,
const phi::Stream& stream) {
std::lock_guard<std::mutex> lock(mutex_);
static auto alignment = getAlignmentSize(place);
if (context_memorys_.count(predictor_id) == 0) {
auto context_memory =
memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
// context_memory_[predictor_id].reset(context_memory.release());
context_memorys_[predictor_id] = std::move(context_memory);
}
return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
}
void releaseContextMemory(PredictorID predictor_id) {
std::lock_guard<std::mutex> lock(mutex_);
if (context_memorys_.count(predictor_id)) {
context_memorys_[predictor_id].reset(nullptr);
context_memorys_.erase(predictor_id);
}
}
private:
size_t getAlignmentSize(const phi::GPUPlace& place) {
const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
return prop.textureAlignment;
}
void* getAlignedMemory(void* addr, size_t alignment) {
return reinterpret_cast<void*>(uintptr_t(addr) & (~(alignment - 1)));
}
mutable std::mutex mutex_;
size_t max_ctx_mem_size_{0};
std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};
......
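One smaller detail in the TRTEngineManager hunk: Empty(), Has() and Get() are const member functions, yet they now serialize access with std::lock_guard, which is why the mutex is declared `mutable std::mutex mutex_;`. A minimal, illustrative snippet of that pattern (the class and fields here are made up, not the real manager):

```cpp
#include <mutex>
#include <string>
#include <unordered_map>

// Minimal illustration of the locking pattern: the mutex is mutable so that
// logically-const queries can still lock it.
class RegistrySketch {
 public:
  bool Has(const std::string& name) const {
    std::lock_guard<std::mutex> lock(mutex_);  // OK because mutex_ is mutable
    return items_.count(name) != 0;
  }
  void Add(const std::string& name, int value) {
    std::lock_guard<std::mutex> lock(mutex_);
    items_[name] = value;
  }

 private:
  mutable std::mutex mutex_;
  std::unordered_map<std::string, int> items_;
};
```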
@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
- int num_inputs = 0;
+ // Get the total over all profiles
num_inputs += runtime_input_names_.size();
// const int num_bindings = num_inputs + Outputs("Ys").size();
// std::vector<void *> buffers(num_bindings);
// This method returns the total over all profiles.
const int num_bindings = engine->GetNbBindings();
std::vector<void *> buffers(num_bindings, nullptr);
......
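The last hunk replaces the hand-computed input count with engine->GetNbBindings(), which, when several optimization profiles exist, returns the bindings of all profiles combined, so sizing the buffers vector by it is always sufficient. For the pre-8.5 binding-based TensorRT API, profile k addresses its copy of a tensor at an offset of k times the per-profile binding count; a hedged helper sketch (function name is illustrative):

```cpp
#include <NvInfer.h>

// Sketch of the binding indexing that makes sizing by getNbBindings() safe
// when several optimization profiles exist (binding-based API, TensorRT < 8.5).
// Assumes the tensor name exists in the engine.
int BindingIndexForProfile(const nvinfer1::ICudaEngine& engine,
                           const char* tensor_name,
                           int profile_index) {
  const int bindings_per_profile =
      engine.getNbBindings() / engine.getNbOptimizationProfiles();
  const int base_index = engine.getBindingIndex(tensor_name);  // profile 0 slot
  return profile_index * bindings_per_profile + base_index;
}
```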