Unverified commit 173b39bb authored by Yuanle Liu, committed by GitHub

TensorRT engine context memory sharing (#45842)

Parent d772166c
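For orientation, a minimal usage sketch of the feature this commit adds (the model path and TensorRT parameters below are illustrative placeholders, not taken from this patch): once EnableTensorRtEngine() is called on a config built against TensorRT >= 7.2, trt_engine_memory_sharing_ is switched on, and every TensorRT subgraph engine owned by the resulting predictor draws its execution-context scratch memory from a single shared chunk.

#include "paddle_inference_api.h"  // public Paddle Inference header

int main() {
  paddle_infer::Config config("./model_dir");  // placeholder model directory
  config.EnableUseGpu(256 /* initial GPU pool, MB */, 0 /* device id */);
  // Enabling TensorRT implicitly enables engine context memory sharing
  // when Paddle is built with TensorRT >= 7.2 (see IS_TRT_VERSION_GE below).
  config.EnableTensorRtEngine(1 << 30,  // workspace size
                              1,        // max batch size
                              3,        // min subgraph size
                              paddle_infer::PrecisionType::kFloat32,
                              false,    // use_static
                              false);   // use_calib_mode
  auto predictor = paddle_infer::CreatePredictor(config);
  // config.trt_engine_memory_sharing() now reports true on TRT >= 7.2;
  // the shared chunk is released when the predictor is destroyed.
  return 0;
}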
......@@ -314,6 +314,7 @@ struct Argument {
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
// Indicate which kind of sort algorithm is used for operators; the memory
// optimization relies on the sort algorithm.
......
......@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("use_calib_mode", new bool(use_calib_mode));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("context_memory_sharing",
new bool(argument->trt_engine_memory_sharing()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
......
......@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
// those parameters already exist in trt and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
......@@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine->SetWithErnie(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
......
......@@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
CP_MEMBER(trt_use_inspector_);
CP_MEMBER(trt_engine_memory_sharing_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
......@@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine(
}
use_tensorrt_ = true;
#if PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// When the TensorRT version is less than 7.2,
// createExecutionContextWithoutDeviceMemory() has a bug,
// so engine context memory sharing cannot be enabled.
#if IS_TRT_VERSION_GE(7200)
trt_engine_memory_sharing_ = true;
#else
LOG(WARNING)
<< "TensorRT engine context memory sharing requires TensorRT 7.2 or later, so it is disabled.";
trt_engine_memory_sharing_ = false;
#endif
#endif
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
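The IS_TRT_VERSION_GE(7200) guard above compares a single integer derived from the TensorRT version macros against the requested minimum. A sketch of how such a guard is typically defined (Paddle's actual macro lives in its TensorRT helper headers; the reproduction below, including the TRT_VERSION_INT helper name, is for reference only):

#include <NvInfer.h>  // provides NV_TENSORRT_MAJOR / MINOR / PATCH / BUILD

// Example: TensorRT 7.2.1.6 -> 7 * 1000 + 2 * 100 + 1 * 10 + 6 = 7216 >= 7200.
#define TRT_VERSION_INT                                   \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 +  \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)
#define IS_TRT_VERSION_GE(version) ((TRT_VERSION_INT) >= (version))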
......@@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
// TODO(Superjomn) refactor this, buggy.
void AnalysisConfig::Update() {
auto info = SerializeInfoCache();
auto &&info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
// Transfer pass_builder and copy the existing compatible passes.
......@@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << trt_dla_core_;
ss << enable_memory_optim_;
ss << trt_engine_memory_sharing_;
ss << use_mkldnn_;
ss << mkldnn_cache_capacity_;
......@@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
bool AnalysisConfig::trt_engine_memory_sharing() const {
return trt_engine_memory_sharing_;
}
void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
......@@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() {
if (trt_use_dla_) {
os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
}
os.InsertRow({"trt_engine_memory_sharing",
trt_engine_memory_sharing_ ? "true" : "false"});
#endif
}
}
......@@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo(
shape_range_info_path_ = shape_range_info_path;
}
const std::string &AnalysisConfig::shape_range_info_path() {
const std::string &AnalysisConfig::shape_range_info_path() const {
return shape_range_info_path_;
}
bool AnalysisConfig::shape_range_info_collected() {
bool AnalysisConfig::shape_range_info_collected() const {
return collect_shape_range_info_;
}
......@@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape(
trt_tuned_dynamic_shape_ = true;
}
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const {
return trt_tuned_dynamic_shape_;
}
bool AnalysisConfig::trt_allow_build_at_runtime() {
bool AnalysisConfig::trt_allow_build_at_runtime() const {
return trt_allow_build_at_runtime_;
}
......
......@@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
}
if (config_.dlnne_enabled()) {
......@@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() {
memory::Release(place_);
}
device_contexts_.clear();
#ifdef PADDLE_WITH_TENSORRT
if (config_.trt_engine_memory_sharing()) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.releaseContextMemory(predictor_id_);
}
#endif
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
......
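The destructor change above ties the shared chunk's lifetime to the predictor: each predictor id owns one entry in TRTEngineManager, handed back via releaseContextMemory() when that predictor is destroyed. A hedged sketch of the resulting lifecycle from the caller's side (the Demo function and its argument are illustrative, assuming a config prepared as in the earlier sketch):

#include "paddle_inference_api.h"

void Demo(const paddle_infer::Config& config) {
  auto predictor = paddle_infer::CreatePredictor(config);
  auto worker = predictor->Clone();  // separate predictor id -> separate shared chunk
  // All TRT subgraph engines run by `predictor` share predictor's chunk;
  // all engines run by `worker` share worker's chunk.
}  // Leaving scope destroys both predictors; ~AnalysisPredictor() hands each
   // chunk back via TRTEngineManager::releaseContextMemory(predictor_id_).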
......@@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
///
/// \brief A boolean state telling whether the tensorrt engine memory sharing
/// is activated.
///
/// \return bool Whether the tensorrt engine memory sharing is activated.
///
bool trt_engine_memory_sharing() const;
///
/// \brief Get the TensorRT engine precision.
///
/// \return Precision Get the TensorRT engine precision.
......@@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig {
/// \brief A boolean state telling whether to use tuned tensorrt dynamic
/// shape.
///
bool tuned_tensorrt_dynamic_shape();
bool tuned_tensorrt_dynamic_shape() const;
///
/// \brief A boolean state telling whether to allow building trt engine at
/// runtime.
///
bool trt_allow_build_at_runtime();
bool trt_allow_build_at_runtime() const;
///
/// \brief Set execution stream. If not set a stream will be created
......@@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \return the shape info path.
///
const std::string& shape_range_info_path();
const std::string& shape_range_info_path() const;
///
/// \brief A boolean state telling whether to collect shape info.
///
/// \return bool Whether to collect shape info.
///
bool shape_range_info_collected();
bool shape_range_info_collected() const;
///
/// \brief Prevent ops running in Paddle-TRT
......@@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig {
// memory reuse related.
bool enable_memory_optim_{false};
bool trt_engine_memory_sharing_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
......@@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() {
optim_profiles_[i] = infer_builder_->createOptimizationProfile();
}
nvinfer1::IExecutionContext *TensorRTEngine::context() {
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
nvinfer1::IExecutionContext *infer_context{nullptr};
if (context_memory_sharing_) {
infer_context =
infer_engine_->createExecutionContextWithoutDeviceMemory();
} else {
infer_context = infer_engine_->createExecutionContext();
}
PADDLE_ENFORCE_NOT_NULL(
infer_context,
platform::errors::InvalidArgument(
"TensorRT engine can not build execution context."));
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context->setOptimizationProfile(cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
infer_context_[predictor_id_per_thread].reset(infer_context);
}
return infer_context_[predictor_id_per_thread].get();
}
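The branch above is the core of the feature: with sharing enabled the context is created without its own scratch memory, which is attached later in Execute(). Stripped of Paddle's bookkeeping, the underlying TensorRT pattern looks roughly like the sketch below (engine pointers, bindings, and the stream are assumed to exist; this is illustrative, not Paddle code):

#include <NvInfer.h>
#include <cuda_runtime.h>
#include <algorithm>

// Run two prebuilt engines back to back on one stream, letting both
// execution contexts borrow the same scratch allocation.
void RunWithSharedScratch(nvinfer1::ICudaEngine* eng_a, void** bindings_a,
                          nvinfer1::ICudaEngine* eng_b, void** bindings_b,
                          cudaStream_t stream) {
  size_t scratch_bytes = std::max(eng_a->getDeviceMemorySize(),
                                  eng_b->getDeviceMemorySize());
  void* scratch = nullptr;
  cudaMalloc(&scratch, scratch_bytes);

  nvinfer1::IExecutionContext* ctx_a =
      eng_a->createExecutionContextWithoutDeviceMemory();
  nvinfer1::IExecutionContext* ctx_b =
      eng_b->createExecutionContextWithoutDeviceMemory();

  // The two contexts never run concurrently here, so one buffer suffices;
  // attach it right before each enqueue, as Execute() does above.
  ctx_a->setDeviceMemory(scratch);
  ctx_a->enqueueV2(bindings_a, stream, nullptr);
  ctx_b->setDeviceMemory(scratch);
  ctx_b->enqueueV2(bindings_b, stream, nullptr);

  cudaStreamSynchronize(stream);
  ctx_a->destroy();
  ctx_b->destroy();
  cudaFree(scratch);
}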
void TensorRTEngine::Execute(int batch_size,
std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId();
auto infer_context = context();
if (context_memory_sharing_) {
void *context_memory{nullptr};
context_memory =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.getContextMemory(
predictor_id_per_thread,
phi::GPUPlace(device_id_),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}
if (!with_dynamic_shape()) {
infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
} else {
......@@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() {
infer_context_.clear();
cur_profile_num_ = 0;
}
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
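FreezeNetwork() reports each engine's context-memory requirement to the manager, which keeps only the running maximum: within one predictor the subgraph engines execute one after another, so the largest requirement bounds what the shared chunk must hold. A tiny worked example with made-up sizes:

#include <algorithm>
#include <cstddef>

int main() {
  // Illustrative per-engine getDeviceMemorySize() results: 48 MB, 96 MB, 64 MB.
  const size_t engine_ctx_bytes[] = {48u << 20, 96u << 20, 64u << 20};
  size_t shared_chunk = 0;  // the running max kept by TRTEngineManager
  for (size_t s : engine_ctx_bytes) shared_chunk = std::max(shared_chunk, s);
  // shared_chunk == 96 MB, instead of 48 + 96 + 64 = 208 MB without sharing.
  return 0;
}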
......@@ -417,6 +467,55 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
return &itensor_map_;
}
void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
// for engine context memory sharing
if (context_memory_sharing_) {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
predictor_id_per_thread);
}
GetEngineInfo();
}
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
runtime_batch_ = batch_size;
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <NvInfer.h>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex> // NOLINT
......@@ -37,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/stream.h"
#include "paddle/utils/any.h"
namespace paddle {
......@@ -171,7 +174,7 @@ class TRTInt8Calibrator;
/*
* TensorRT Engine.
*
* There are two alternative ways to use it: one is to build it from a Paddle
* protobuf model; the other is to construct the network manually.
*/
class TensorRTEngine {
......@@ -287,51 +290,10 @@ class TensorRTEngine {
std::unordered_map<std::string, nvinfer1::ITensor*>* GetITensorMap();
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::IExecutionContext* context() {
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
infer_context_[predictor_id_per_thread].reset(
infer_engine_->createExecutionContext());
if (with_dynamic_shape_) {
// need new profile if it's not the first
if (cur_profile_num_ > 0) {
infer_context_[predictor_id_per_thread]->setOptimizationProfile(
cur_profile_num_);
}
profile_index_[predictor_id_per_thread] = cur_profile_num_;
++cur_profile_num_;
}
}
return infer_context_[predictor_id_per_thread].get();
}
nvinfer1::IExecutionContext* context();
int GetProfileIndex() {
if (max_profile_num_ > 1) {
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
return profile_index_[predictor_id_per_thread];
} else {
......@@ -350,15 +312,6 @@ class TensorRTEngine {
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
#ifndef PADDLE_WITH_TESTING
PADDLE_ENFORCE_GT(
predictor_id_per_thread,
-1,
platform::errors::InvalidArgument(
"thread local var predictor_id_per_thread must be "
"initialized to >= 0, but now predictor_id_per_thread = %d",
predictor_id_per_thread));
#endif
std::unique_lock<std::mutex> lock(mutex_);
infer_context_[predictor_id_per_thread].reset(nullptr);
infer_context_.erase(predictor_id_per_thread);
......@@ -380,47 +333,7 @@ class TensorRTEngine {
return ihost_memory_.get();
}
void Deserialize(const std::string& engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
if (use_dla_) {
if (precision_ != AnalysisConfig::Precision::kInt8 &&
precision_ != AnalysisConfig::Precision::kHalf) {
LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
"set float32, so DLA is not used.";
} else if (runtime->getNbDLACores() == 0) {
LOG(WARNING)
<< "TensorRT DLA is set by config, but your device does not have "
"DLA, so DLA is not used.";
} else {
if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
dla_core_ = 0;
LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
<< runtime->getNbDLACores() << ", but got " << dla_core_
<< ", so use use 0 as default.";
}
runtime->setDLACore(dla_core_);
LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
<< dla_core_;
}
}
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size()));
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
binding_num_ = infer_engine_->getNbBindings();
GetEngineInfo();
}
void Deserialize(const std::string& engine_serialized_data);
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
......@@ -694,6 +607,10 @@ class TensorRTEngine {
void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
void SetScope(const framework::Scope& scope) { scope_ = &scope; }
void SetContextMemorySharing(bool context_memory_sharing) {
context_memory_sharing_ = context_memory_sharing;
}
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
......@@ -714,6 +631,9 @@ class TensorRTEngine {
// batch size of the current data; it will be updated on each execution.
int batch_size_{-1};
// used for engine context memory sharing
bool context_memory_sharing_{false};
int device_id_;
int max_profile_num_{1};
int cur_profile_num_{0};
......@@ -791,14 +711,23 @@ class TensorRTEngine {
engine__->network()->add##layer__(__VA_ARGS__)
class TRTEngineManager {
using PredictorID = int;
using AllocationPtr = phi::Allocator::AllocationPtr;
public:
bool Empty() const { return engines_.size() == 0; }
bool Empty() const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.size() == 0;
}
bool Has(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
TensorRTEngine* Get(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_);
return engines_.at(name).get();
}
......@@ -826,17 +755,21 @@ class TRTEngineManager {
disable_trt_plugin_fp16,
model_precision,
logger);
std::lock_guard<std::mutex> lock(mutex_);
engines_[name].reset(p);
return p;
}
void DeleteAll() {
std::lock_guard<std::mutex> lock(mutex_);
for (auto& item : engines_) {
item.second.reset(nullptr);
}
engines_.clear();
}
void DeleteKey(const std::string& key) {
std::lock_guard<std::mutex> lock(mutex_);
auto iter = engines_.find(key);
if (iter != engines_.end()) {
iter->second.reset(nullptr);
......@@ -844,7 +777,57 @@ class TRTEngineManager {
}
}
void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
bool size_updated{false};
{
std::lock_guard<std::mutex> lock(mutex_);
if (max_ctx_mem_size_ < mem_size) {
max_ctx_mem_size_ = mem_size;
size_updated = true;
}
}
if (size_updated) {
releaseContextMemory(predictor_id);
}
}
void* getContextMemory(PredictorID predictor_id,
const phi::GPUPlace& place,
const phi::Stream& stream) {
std::lock_guard<std::mutex> lock(mutex_);
static auto alignment = getAlignmentSize(place);
if (context_memorys_.count(predictor_id) == 0) {
auto context_memory =
memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
// context_memory_[predictor_id].reset(context_memory.release());
context_memorys_[predictor_id] = std::move(context_memory);
}
return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
}
void releaseContextMemory(PredictorID predictor_id) {
std::lock_guard<std::mutex> lock(mutex_);
if (context_memorys_.count(predictor_id)) {
context_memorys_[predictor_id].reset(nullptr);
context_memorys_.erase(predictor_id);
}
}
private:
size_t getAlignmentSize(const phi::GPUPlace& place) {
const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
return prop.textureAlignment;
}
void* getAlignedMemory(void* addr, size_t alignment) {
// Round up to the next multiple of `alignment`; getContextMemory() allocates
// `max_ctx_mem_size_ + alignment` bytes, so the rounded pointer still has
// max_ctx_mem_size_ usable bytes.
return reinterpret_cast<void*>(
(reinterpret_cast<uintptr_t>(addr) + alignment - 1) & ~(alignment - 1));
}
mutable std::mutex mutex_;
size_t max_ctx_mem_size_{0};
std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};
......
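getAlignmentSize() queries the device's textureAlignment, and getAlignedMemory() rounds the raw allocation up to that boundary; the extra `alignment` bytes requested in getContextMemory() guarantee the rounded pointer still covers max_ctx_mem_size_. A small numeric sketch of the round-up (the addresses and the 512-byte alignment are illustrative):

#include <cstdint>
#include <cstdio>

// Round `addr` up to the next multiple of `alignment` (a power of two).
static std::uintptr_t AlignUp(std::uintptr_t addr, std::uintptr_t alignment) {
  return (addr + alignment - 1) & ~(alignment - 1);
}

int main() {
  std::printf("%llu\n", static_cast<unsigned long long>(AlignUp(4353, 512)));  // 4608
  std::printf("%llu\n", static_cast<unsigned long long>(AlignUp(4608, 512)));  // 4608 (already aligned)
  return 0;
}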
......@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
int num_inputs = 0;
num_inputs += runtime_input_names_.size();
// const int num_bindings = num_inputs + Outputs("Ys").size();
// std::vector<void *> buffers(num_bindings);
// This method returns the total over all profiles.
// Get the total over all profiles
const int num_bindings = engine->GetNbBindings();
std::vector<void *> buffers(num_bindings, nullptr);
......
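The "total over all profiles" comment above matters because an engine built with several optimization profiles repeats its bindings once per profile, and getNbBindings() counts all of them; the buffers vector therefore has one slot per (profile, tensor) pair. A hedged sketch of the usual index arithmetic (the function name is illustrative):

#include <NvInfer.h>

// For an engine with P optimization profiles, bindings form P consecutive
// groups; profile p's copy of a tensor's binding lives at
//   p * (getNbBindings() / P) + getBindingIndex(name).
int BindingIndexForProfile(const nvinfer1::ICudaEngine& engine,
                           int profile_index, const char* tensor_name) {
  const int bindings_per_profile =
      engine.getNbBindings() / engine.getNbOptimizationProfiles();
  const int base = engine.getBindingIndex(tensor_name);  // index within profile 0
  return profile_index * bindings_per_profile + base;
}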