Unverified commit 173b39bb in BaiXuePrincess/Paddle (forked from PaddlePaddle/Paddle)
Authored on September 22, 2022 by Yuanle Liu; committed via GitHub on September 22, 2022
TensorRT engine context memory sharing (#45842)
Parent: d772166c
Showing 9 changed files with 227 additions and 112 deletions (+227 -112)
paddle/fluid/inference/analysis/argument.h                           +1   -0
paddle/fluid/inference/analysis/ir_pass_manager.cc                   +2   -1
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +1   -2
paddle/fluid/inference/api/analysis_config.cc                        +26  -5
paddle/fluid/inference/api/analysis_predictor.cc                     +8   -0
paddle/fluid/inference/api/paddle_analysis_config.h                  +12  -4
paddle/fluid/inference/tensorrt/engine.cc                            +99  -0
paddle/fluid/inference/tensorrt/engine.h                             +77  -94
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h                 +1   -6
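From the user side the whole feature is driven by AnalysisConfig: EnableTensorRtEngine() now turns trt_engine_memory_sharing_ on automatically when the linked TensorRT is 7.2 or newer, and the new trt_engine_memory_sharing() getter (plus a row in Summary()) exposes the result. A minimal sketch of checking it, assuming a GPU build of Paddle Inference with TensorRT; the model paths and memory-pool size below are placeholders, and EnableTensorRtEngine() is called with its default arguments:

// check_memory_sharing.cc -- illustrative sketch, not part of the commit.
#include <iostream>

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  // Placeholder model files.
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/256, /*device_id=*/0);
  config.EnableTensorRtEngine();  // default workspace / batch / subgraph settings

  // Added by this commit: reports true only when TensorRT >= 7.2 is linked,
  // because older TRT has a broken createExecutionContextWithoutDeviceMemory().
  std::cout << "trt_engine_memory_sharing: " << std::boolalpha
            << config.trt_engine_memory_sharing() << "\n";

  // Summary() now also prints a "trt_engine_memory_sharing" row.
  std::cout << config.Summary() << std::endl;
  return 0;
}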
paddle/fluid/inference/analysis/argument.h
@@ -314,6 +314,7 @@ struct Argument {
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
   // Indicate which kind of sort algorithm is used for operators, the memory
   // optimization relays on the sort algorithm.
paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_calib_mode", new bool(use_calib_mode));
       pass->Set("precision_mode",
                 new AnalysisConfig::Precision(precision_mode));
+      pass->Set("context_memory_sharing",
+                new bool(argument->trt_engine_memory_sharing()));
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
       std::string optim_cache_dir = argument->optim_cache_dir();
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
   // those parameter already exist in trt, and should not have another copy in
   // fluid.
   std::vector<std::string> repetitive_params;
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
       CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
       std::unordered_set<const Node *> nodes2remove(
           framework::ir::Agent(node).subgraph()->begin(),
           framework::ir::Agent(node).subgraph()->end());
@@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   trt_engine->SetWithErnie(
       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));
+  trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));
   if (use_static_engine) {
     trt_engine_serialized_data = GetTrtEngineSerializedData(
paddle/fluid/inference/api/analysis_config.cc
@@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(collect_shape_range_info_);
   CP_MEMBER(shape_range_info_path_);
   CP_MEMBER(trt_use_inspector_);
+  CP_MEMBER(trt_engine_memory_sharing_);
   // Dlnne related
   CP_MEMBER(use_dlnne_);
   CP_MEMBER(dlnne_min_subgraph_size_);
@@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine(
   }

   use_tensorrt_ = true;
+#if PADDLE_WITH_TENSORRT
+  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
+  // when trt version less than 7.2,
+  // createExecutionContextWithoutDeviceMemory() has bug.
+  // so, we cannot enable engine context memory sharing.
+#if IS_TRT_VERSION_GE(7200)
+  trt_engine_memory_sharing_ = true;
+#else
+  LOG(WARNING)
+      << "TensorRT engine context memory sharing needs version 7.2 and after.";
+  trt_engine_memory_sharing_ = false;
+#endif
+#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
 // TODO(Superjomn) refactor this, buggy.
 void AnalysisConfig::Update() {
-  auto info = SerializeInfoCache();
+  auto &&info = SerializeInfoCache();
   if (info == serialized_info_cache_) return;
   // Transfer pass_builder and copy the existing compatible passes.
@@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << trt_dla_core_;
   ss << enable_memory_optim_;
+  ss << trt_engine_memory_sharing_;
   ss << use_mkldnn_;
   ss << mkldnn_cache_capacity_;
@@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const {
   return enable_memory_optim_;
 }

+bool AnalysisConfig::trt_engine_memory_sharing() const {
+  return trt_engine_memory_sharing_;
+}
+
 void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                     size_t prog_buffer_size,
                                     const char *param_buffer,
@@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() {
     if (trt_use_dla_) {
       os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
     }
+    os.InsertRow({"trt_engine_memory_sharing",
+                  trt_engine_memory_sharing_ ? "true" : "false"});
 #endif
   }
 }
@@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo(
   shape_range_info_path_ = shape_range_info_path;
 }

-const std::string &AnalysisConfig::shape_range_info_path() {
+const std::string &AnalysisConfig::shape_range_info_path() const {
   return shape_range_info_path_;
 }

-bool AnalysisConfig::shape_range_info_collected() {
+bool AnalysisConfig::shape_range_info_collected() const {
   return collect_shape_range_info_;
 }
@@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape(
   trt_tuned_dynamic_shape_ = true;
 }

-bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
+bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const {
   return trt_tuned_dynamic_shape_;
 }

-bool AnalysisConfig::trt_allow_build_at_runtime() {
+bool AnalysisConfig::trt_allow_build_at_runtime() const {
   return trt_allow_build_at_runtime_;
 }
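The reason the new flag is streamed into SerializeInfoCache() is that Update() only rebuilds the pass pipeline when that cached fingerprint changes, so every option that influences pass selection has to be part of the string. A toy version of that fingerprint-guarded update, purely illustrative and not Paddle's code:

// fingerprint_update.cc -- illustrative sketch of the Update()/SerializeInfoCache() pattern.
#include <sstream>
#include <string>

class Config {
 public:
  void SetMemorySharing(bool on) { memory_sharing_ = on; Update(); }
  void SetWorkspaceSize(int mb) { workspace_mb_ = mb; Update(); }
  int rebuild_count() const { return rebuild_count_; }

 private:
  std::string Fingerprint() const {
    std::stringstream ss;
    ss << memory_sharing_;  // every option that affects the pipeline goes here
    ss << workspace_mb_;
    return ss.str();
  }
  void Update() {
    auto info = Fingerprint();
    if (info == cached_) return;  // nothing relevant changed, keep current passes
    cached_ = info;
    ++rebuild_count_;             // stands in for re-creating the pass builder
  }

  bool memory_sharing_{false};
  int workspace_mb_{1024};
  std::string cached_;
  int rebuild_count_{0};
};

int main() {
  Config c;
  c.SetMemorySharing(true);  // fingerprint changes -> rebuild
  c.SetMemorySharing(true);  // unchanged -> no rebuild
  return c.rebuild_count() == 1 ? 0 : 1;
}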
paddle/fluid/inference/api/analysis_predictor.cc
100755 → 100644
@@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetTensorRtAllowBuildAtRuntime(
         config_.trt_allow_build_at_runtime());
     argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
+    argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
   }

   if (config_.dlnne_enabled()) {
@@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() {
     memory::Release(place_);
   }
   device_contexts_.clear();
+
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.trt_engine_memory_sharing()) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .releaseContextMemory(predictor_id_);
+  }
+#endif
 }

 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
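On the predictor side the flag simply flows from AnalysisConfig into the Argument and then into the subgraph pass, and the destructor change makes each predictor drop its own entry in the manager's per-predictor buffer map. A toy, standalone sketch of that release-on-destruction pattern; the names here are illustrative, not Paddle's:

// release_on_destruction.cc -- illustrative sketch, not Paddle code.
#include <cstddef>
#include <mutex>
#include <unordered_map>
#include <vector>

// Toy stand-in for the per-predictor context-memory registry: one buffer per
// owner id, grown lazily, dropped explicitly when the owner goes away.
class BufferRegistry {
 public:
  void *Get(int owner_id, size_t bytes) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto &buf = buffers_[owner_id];
    if (buf.size() < bytes) buf.resize(bytes);  // host memory here; device memory in Paddle
    return buf.data();
  }
  void Release(int owner_id) {
    std::lock_guard<std::mutex> lock(mutex_);
    buffers_.erase(owner_id);
  }

 private:
  std::mutex mutex_;
  std::unordered_map<int, std::vector<unsigned char>> buffers_;
};

// The owner (the predictor, in Paddle's case) releases its entry on
// destruction, mirroring what ~AnalysisPredictor() now does via
// releaseContextMemory(predictor_id_).
class Owner {
 public:
  Owner(int id, BufferRegistry *reg) : id_(id), reg_(reg) {}
  ~Owner() { reg_->Release(id_); }
  void *Workspace(size_t bytes) { return reg_->Get(id_, bytes); }

 private:
  int id_;
  BufferRegistry *reg_;
};

int main() {
  BufferRegistry registry;
  {
    Owner predictor(/*id=*/0, &registry);
    predictor.Workspace(1 << 20);  // buffer lives while the owner lives
  }  // entry for id 0 is erased here
  return 0;
}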
paddle/fluid/inference/api/paddle_analysis_config.h
@@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief A boolean state telling whether the tensorrt engine memory sharing
+  /// is activated.
+  ///
+  /// \return bool Whether the tensorrt engine memory sharing is activated.
+  ///
+  bool trt_engine_memory_sharing() const;
+  ///
   /// \brief Get the TensorRT engine precision.
   ///
   /// \return Precision Get the TensorRT engine precision.
@@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \brief A boolean state telling whether to use tuned tensorrt dynamic
   /// shape.
   ///
-  bool tuned_tensorrt_dynamic_shape();
+  bool tuned_tensorrt_dynamic_shape() const;
   ///
   /// \brief A boolean state telling whether to allow building trt engine at
   /// runtime.
   ///
-  bool trt_allow_build_at_runtime();
+  bool trt_allow_build_at_runtime() const;
   ///
   /// \brief Set execution stream. If not set a stream will be created
@@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return the shape info path.
   ///
-  const std::string &shape_range_info_path();
+  const std::string &shape_range_info_path() const;
   ///
   /// \brief A boolean state telling whether to collect shape info.
   ///
   /// \return bool Whether to collect shape info.
   ///
-  bool shape_range_info_collected();
+  bool shape_range_info_collected() const;
   ///
   /// \brief Prevent ops running in Paddle-TRT
@@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
+  bool trt_engine_memory_sharing_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
paddle/fluid/inference/tensorrt/engine.cc
@@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() {
     optim_profiles_[i] = infer_builder_->createOptimizationProfile();
 }

+nvinfer1::IExecutionContext *TensorRTEngine::context() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "You should build engine first and then set the context."));
+    // We may see trt warning: Profile 0 has been chosen by another
+    // IExecutionContext...
+    // It's ok. We will set it later.
+    nvinfer1::IExecutionContext *infer_context{nullptr};
+    if (context_memory_sharing_) {
+      infer_context =
+          infer_engine_->createExecutionContextWithoutDeviceMemory();
+    } else {
+      infer_context = infer_engine_->createExecutionContext();
+    }
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_context,
+        platform::errors::InvalidArgument(
+            "TensorRT engine can not build execution context."));
+    if (with_dynamic_shape_) {
+      // need new profile if it's not the first
+      if (cur_profile_num_ > 0) {
+        infer_context->setOptimizationProfile(cur_profile_num_);
+      }
+      profile_index_[predictor_id_per_thread] = cur_profile_num_;
+      ++cur_profile_num_;
+    }
+    infer_context_[predictor_id_per_thread].reset(infer_context);
+  }
+  return infer_context_[predictor_id_per_thread].get();
+}
+
 void TensorRTEngine::Execute(int batch_size,
                              std::vector<void *> *buffers,
                              cudaStream_t stream) {
   freshDeviceId();
   auto infer_context = context();
+  if (context_memory_sharing_) {
+    void *context_memory{nullptr};
+    context_memory =
+        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+            .getContextMemory(
+                predictor_id_per_thread,
+                phi::GPUPlace(device_id_),
+                phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+    infer_context->setDeviceMemory(context_memory);
+  }
   if (!with_dynamic_shape()) {
     infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
   } else {
@@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() {
     infer_context_.clear();
     cur_profile_num_ = 0;
   }
+  // for engine context memory sharing
+  if (context_memory_sharing_) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
+                                 predictor_id_per_thread);
+  }

   GetEngineInfo();
 }
@@ -417,6 +467,55 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
   return &itensor_map_;
 }

+void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
+  freshDeviceId();
+  infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+
+  if (use_dla_) {
+    if (precision_ != AnalysisConfig::Precision::kInt8 &&
+        precision_ != AnalysisConfig::Precision::kHalf) {
+      LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
+                      "set float32, so DLA is not used.";
+    } else if (runtime->getNbDLACores() == 0) {
+      LOG(WARNING)
+          << "TensorRT DLA is set by config, but your device does not have "
+             "DLA, so DLA is not used.";
+    } else {
+      if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
+        dla_core_ = 0;
+        LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
+                     << runtime->getNbDLACores() << ", but got " << dla_core_
+                     << ", so use use 0 as default.";
+      }
+      runtime->setDLACore(dla_core_);
+      LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
+                << dla_core_;
+    }
+  }
+
+  infer_engine_.reset(runtime->deserializeCudaEngine(
+      engine_serialized_data.c_str(), engine_serialized_data.size()));
+
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_engine_,
+      platform::errors::Fatal(
+          "Building TRT cuda engine failed when deserializing engine info. "
+          "Please check:\n1. Your TRT serialization is generated and loaded "
+          "on the same GPU architecture;\n2. The Paddle Inference version of "
+          "generating serialization file and doing inference are "
+          "consistent."));
+
+  binding_num_ = infer_engine_->getNbBindings();
+  // for engine context memory sharing
+  if (context_memory_sharing_) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
+                                 predictor_id_per_thread);
+  }
+
+  GetEngineInfo();
+}
+
 void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
   runtime_batch_ = batch_size;
 }
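What engine.cc implements is the standard TensorRT recipe for sharing scratch memory between execution contexts that never run at the same time: create each context with createExecutionContextWithoutDeviceMemory(), keep one buffer sized to the largest getDeviceMemorySize() seen so far, and call setDeviceMemory() on whichever context is about to enqueue. A stripped-down sketch of that recipe outside Paddle, assuming engine_a and engine_b are already-built ICudaEngines, the binding arrays are prepared, and the two never execute concurrently:

// shared_context_memory.cc -- illustrative sketch, with Paddle's allocator and
// per-predictor bookkeeping stripped away.
#include <algorithm>

#include <cuda_runtime_api.h>
#include <NvInfer.h>

void RunShared(nvinfer1::ICudaEngine *engine_a,
               nvinfer1::ICudaEngine *engine_b,
               cudaStream_t stream,
               void **bindings_a,
               void **bindings_b) {
  // Contexts are created without their own device memory (needs TRT >= 7.2).
  nvinfer1::IExecutionContext *ctx_a =
      engine_a->createExecutionContextWithoutDeviceMemory();
  nvinfer1::IExecutionContext *ctx_b =
      engine_b->createExecutionContextWithoutDeviceMemory();

  // One scratch buffer, sized to the larger of the two requirements.
  size_t scratch_bytes = std::max(engine_a->getDeviceMemorySize(),
                                  engine_b->getDeviceMemorySize());
  void *scratch = nullptr;
  cudaMalloc(&scratch, scratch_bytes);

  // Attach the shared buffer to whichever context is about to run.
  ctx_a->setDeviceMemory(scratch);
  ctx_a->enqueueV2(bindings_a, stream, nullptr);
  cudaStreamSynchronize(stream);

  ctx_b->setDeviceMemory(scratch);
  ctx_b->enqueueV2(bindings_b, stream, nullptr);
  cudaStreamSynchronize(stream);

  cudaFree(scratch);
  // Context cleanup is omitted; use the destruction API of your TRT version.
}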
paddle/fluid/inference/tensorrt/engine.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <NvInfer.h>
+#include <cstdint>
 #include <map>
 #include <memory>
 #include <mutex>  // NOLINT
@@ -37,6 +38,8 @@ limitations under the License. */
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/stream.h"
 #include "paddle/utils/any.h"

 namespace paddle {
@@ -287,51 +290,10 @@ class TensorRTEngine {
   std::unordered_map<std::string, nvinfer1::ITensor *> *GetITensorMap();

   nvinfer1::ICudaEngine *engine() { return infer_engine_.get(); }
-  nvinfer1::IExecutionContext *context() {
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
-    std::unique_lock<std::mutex> lock(mutex_);
-    if (infer_context_.find(predictor_id_per_thread) ==
-        infer_context_.end()) {
-      PADDLE_ENFORCE_NOT_NULL(
-          infer_engine_,
-          platform::errors::InvalidArgument(
-              "You should build engine first and then set the context."));
-      // We may see trt warning: Profile 0 has been chosen by another
-      // IExecutionContext...
-      // It's ok. We will set it later.
-      infer_context_[predictor_id_per_thread].reset(
-          infer_engine_->createExecutionContext());
-      if (with_dynamic_shape_) {
-        // need new profile if it's not the first
-        if (cur_profile_num_ > 0) {
-          infer_context_[predictor_id_per_thread]->setOptimizationProfile(
-              cur_profile_num_);
-        }
-        profile_index_[predictor_id_per_thread] = cur_profile_num_;
-        ++cur_profile_num_;
-      }
-    }
-    return infer_context_[predictor_id_per_thread].get();
-  }
+  nvinfer1::IExecutionContext *context();

   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
-#ifndef PADDLE_WITH_TESTING
-      PADDLE_ENFORCE_GT(
-          predictor_id_per_thread,
-          -1,
-          platform::errors::InvalidArgument(
-              "thread local var predictor_id_per_thread must be "
-              "initialized to >= 0, but now predictor_id_per_thread = %d",
-              predictor_id_per_thread));
-#endif
       std::unique_lock<std::mutex> lock(mutex_);
       return profile_index_[predictor_id_per_thread];
     } else {
@@ -350,15 +312,6 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
     std::unique_lock<std::mutex> lock(mutex_);
     infer_context_[predictor_id_per_thread].reset(nullptr);
     infer_context_.erase(predictor_id_per_thread);
@@ -380,47 +333,7 @@ class TensorRTEngine {
     return ihost_memory_.get();
   }

-  void Deserialize(const std::string &engine_serialized_data) {
-    freshDeviceId();
-    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-    if (use_dla_) {
-      if (precision_ != AnalysisConfig::Precision::kInt8 &&
-          precision_ != AnalysisConfig::Precision::kHalf) {
-        LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
-                        "set float32, so DLA is not used.";
-      } else if (runtime->getNbDLACores() == 0) {
-        LOG(WARNING)
-            << "TensorRT DLA is set by config, but your device does not have "
-               "DLA, so DLA is not used.";
-      } else {
-        if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
-          dla_core_ = 0;
-          LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
-                       << runtime->getNbDLACores() << ", but got " << dla_core_
-                       << ", so use use 0 as default.";
-        }
-        runtime->setDLACore(dla_core_);
-        LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
-                  << dla_core_;
-      }
-    }
-    infer_engine_.reset(runtime->deserializeCudaEngine(
-        engine_serialized_data.c_str(), engine_serialized_data.size()));
-    PADDLE_ENFORCE_NOT_NULL(
-        infer_engine_,
-        platform::errors::Fatal(
-            "Building TRT cuda engine failed when deserializing engine info. "
-            "Please check:\n1. Your TRT serialization is generated and loaded "
-            "on the same GPU architecture;\n2. The Paddle Inference version of "
-            "generating serialization file and doing inference are "
-            "consistent."));
-    binding_num_ = infer_engine_->getNbBindings();
-    GetEngineInfo();
-  }
+  void Deserialize(const std::string &engine_serialized_data);

   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
@@ -694,6 +607,10 @@ class TensorRTEngine {
   void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
   void SetScope(const framework::Scope &scope) { scope_ = &scope; }

+  void SetContextMemorySharing(bool context_memory_sharing) {
+    context_memory_sharing_ = context_memory_sharing;
+  }
+
  private:
   // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
   // ensure that the thread is associated with the correct device by calling
@@ -714,6 +631,9 @@ class TensorRTEngine {
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};

+  // use for engine context memory sharing
+  bool context_memory_sharing_{false};
+
   int device_id_;
   int max_profile_num_{1};
   int cur_profile_num_{0};
@@ -791,14 +711,23 @@
   engine__->network()->add##layer__(__VA_ARGS__)

 class TRTEngineManager {
+  using PredictorID = int;
+  using AllocationPtr = phi::Allocator::AllocationPtr;
+
  public:
-  bool Empty() const { return engines_.size() == 0; }
+  bool Empty() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return engines_.size() == 0;
+  }
+
   bool Has(const std::string &name) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     if (engines_.count(name) == 0) return false;
     return engines_.at(name).get() != nullptr;
   }

   TensorRTEngine *Get(const std::string &name) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     return engines_.at(name).get();
   }
@@ -826,17 +755,21 @@ class TRTEngineManager {
         disable_trt_plugin_fp16,
         model_precision,
         logger);
+    std::lock_guard<std::mutex> lock(mutex_);
     engines_[name].reset(p);
     return p;
   }

   void DeleteAll() {
+    std::lock_guard<std::mutex> lock(mutex_);
     for (auto &item : engines_) {
       item.second.reset(nullptr);
     }
+    engines_.clear();
   }

   void DeleteKey(const std::string &key) {
+    std::lock_guard<std::mutex> lock(mutex_);
     auto iter = engines_.find(key);
     if (iter != engines_.end()) {
       iter->second.reset(nullptr);
@@ -844,7 +777,57 @@ class TRTEngineManager {
     }
   }

+  void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
+    bool size_updated{false};
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (max_ctx_mem_size_ < mem_size) {
+        max_ctx_mem_size_ = mem_size;
+        size_updated = true;
+      }
+    }
+    if (size_updated) {
+      releaseContextMemory(predictor_id);
+    }
+  }
+
+  void *getContextMemory(PredictorID predictor_id,
+                         const phi::GPUPlace &place,
+                         const phi::Stream &stream) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    static auto alignment = getAlignmentSize(place);
+    if (context_memorys_.count(predictor_id) == 0) {
+      auto context_memory =
+          memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
+      // context_memory_[predictor_id].reset(context_memory.release());
+      context_memorys_[predictor_id] = std::move(context_memory);
+    }
+    return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
+  }
+
+  void releaseContextMemory(PredictorID predictor_id) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (context_memorys_.count(predictor_id)) {
+      context_memorys_[predictor_id].reset(nullptr);
+      context_memorys_.erase(predictor_id);
+    }
+  }
+
  private:
+  size_t getAlignmentSize(const phi::GPUPlace &place) {
+    const auto &prop = platform::GetDeviceProperties(place.GetDeviceId());
+    return prop.textureAlignment;
+  }
+
+  void *getAlignedMemory(void *addr, size_t alignment) {
+    return reinterpret_cast<void *>(uintptr_t(addr) & (~(alignment - 1)));
+  }
+
+  mutable std::mutex mutex_;
+  size_t max_ctx_mem_size_{0};
+  std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
   std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
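Two small C++ idioms carry the new TRTEngineManager code: a mutable std::mutex so that const accessors such as Empty(), Has() and Get() can still lock, and over-allocating the shared buffer by one alignment's worth of bytes (the device's textureAlignment) before handing out an alignment-masked pointer. A compact, self-contained sketch of both idioms; the alignment value and the round-up helper below are illustrative, not Paddle's exact getAlignedMemory():

// manager_idioms.cc -- illustrative sketch of the mutable-mutex and pointer
// alignment patterns used by TRTEngineManager.
#include <cstdint>
#include <cstdlib>
#include <map>
#include <mutex>
#include <string>

// Thread-safe, const-friendly registry: the mutex is declared mutable so
// const member functions may still lock it.
class Registry {
 public:
  bool Has(const std::string &name) const {
    std::lock_guard<std::mutex> lock(mutex_);
    return items_.count(name) != 0;
  }
  void Put(const std::string &name, int value) {
    std::lock_guard<std::mutex> lock(mutex_);
    items_[name] = value;
  }

 private:
  mutable std::mutex mutex_;  // mutable: lockable from const methods
  std::map<std::string, int> items_;
};

// Round a raw pointer up to `alignment` (a power of two). Over-allocating by
// `alignment` extra bytes, as the manager does for its shared buffer, leaves
// room for the adjustment.
inline void *AlignUp(void *addr, std::uintptr_t alignment) {
  auto p = reinterpret_cast<std::uintptr_t>(addr);
  return reinterpret_cast<void *>((p + alignment - 1) & ~(alignment - 1));
}

int main() {
  Registry reg;
  reg.Put("engine_0", 42);
  bool found = reg.Has("engine_0");  // safe to call from any thread

  void *raw = std::malloc(1024 + 256);
  void *aligned = AlignUp(raw, 256);  // 256 stands in for textureAlignment
  std::free(raw);
  return (found && aligned != nullptr) ? 0 : 1;
}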
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");

-    int num_inputs = 0;
-    // Get the total over all profiles
-    num_inputs += runtime_input_names_.size();
-    //  const int num_bindings = num_inputs + Outputs("Ys").size();
-    //  std::vector<void *> buffers(num_bindings);
-
+    // This method returns the total over all profiles.
     const int num_bindings = engine->GetNbBindings();
     std::vector<void *> buffers(num_bindings, nullptr);