PaddlePaddle / Paddle
Commit 173b39bb (unverified)
TensorRT engine context memory sharing (#45842)
Authored Sep 22, 2022 by Yuanle Liu; committed via GitHub on Sep 22, 2022
Parent: d772166c

Showing 9 changed files with 227 additions and 112 deletions (+227 -112)
paddle/fluid/inference/analysis/argument.h                           +1   -0
paddle/fluid/inference/analysis/ir_pass_manager.cc                   +2   -1
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +1   -2
paddle/fluid/inference/api/analysis_config.cc                        +26  -5
paddle/fluid/inference/api/analysis_predictor.cc                     +8   -0
paddle/fluid/inference/api/paddle_analysis_config.h                  +12  -4
paddle/fluid/inference/tensorrt/engine.cc                            +99  -0
paddle/fluid/inference/tensorrt/engine.h                             +77  -94
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h                 +1   -6
paddle/fluid/inference/analysis/argument.h
@@ -314,6 +314,7 @@ struct Argument {
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);

   // Indicate which kind of sort algorithm is used for operators, the memory
   // optimization relays on the sort algorithm.
paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_calib_mode", new bool(use_calib_mode));
       pass->Set("precision_mode",
                 new AnalysisConfig::Precision(precision_mode));
+      pass->Set("context_memory_sharing",
+                new bool(argument->trt_engine_memory_sharing()));

       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
       std::string optim_cache_dir = argument->optim_cache_dir();
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
   // those parameter already exist in trt, and should not have another copy in
   // fluid.
   std::vector<std::string> repetitive_params;
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
       CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);

       std::unordered_set<const Node *> nodes2remove(
           framework::ir::Agent(node).subgraph()->begin(),
           framework::ir::Agent(node).subgraph()->end());
@@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   trt_engine->SetWithErnie(
       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));
+  trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));

   if (use_static_engine) {
     trt_engine_serialized_data = GetTrtEngineSerializedData(
paddle/fluid/inference/api/analysis_config.cc
@@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(collect_shape_range_info_);
   CP_MEMBER(shape_range_info_path_);
   CP_MEMBER(trt_use_inspector_);
+  CP_MEMBER(trt_engine_memory_sharing_);
   // Dlnne related
   CP_MEMBER(use_dlnne_);
   CP_MEMBER(dlnne_min_subgraph_size_);
@@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine(
   }

   use_tensorrt_ = true;
+#if PADDLE_WITH_TENSORRT
+  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
+  // when trt version less than 7.2,
+  // createExecutionContextWithoutDeviceMemory() has bug.
+  // so, we cannot enable engine context memory sharing.
+#if IS_TRT_VERSION_GE(7200)
+  trt_engine_memory_sharing_ = true;
+#else
+  LOG(WARNING)
+      << "TensorRT engine context memory sharing needs version 7.2 and after.";
+  trt_engine_memory_sharing_ = false;
+#endif
+#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
 // TODO(Superjomn) refactor this, buggy.
 void AnalysisConfig::Update() {
-  auto info = SerializeInfoCache();
+  auto &&info = SerializeInfoCache();
   if (info == serialized_info_cache_) return;
   // Transfer pass_builder and copy the existing compatible passes.
@@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << trt_dla_core_;

   ss << enable_memory_optim_;
+  ss << trt_engine_memory_sharing_;

   ss << use_mkldnn_;
   ss << mkldnn_cache_capacity_;
@@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const {
   return enable_memory_optim_;
 }

+bool AnalysisConfig::trt_engine_memory_sharing() const {
+  return trt_engine_memory_sharing_;
+}
+
 void AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                     size_t prog_buffer_size,
                                     const char *param_buffer,
@@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() {
     if (trt_use_dla_) {
       os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
     }
+    os.InsertRow({"trt_engine_memory_sharing",
+                  trt_engine_memory_sharing_ ? "true" : "false"});
 #endif
   }
 }
@@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo(
   shape_range_info_path_ = shape_range_info_path;
 }

-const std::string &AnalysisConfig::shape_range_info_path() {
+const std::string &AnalysisConfig::shape_range_info_path() const {
   return shape_range_info_path_;
 }

-bool AnalysisConfig::shape_range_info_collected() {
+bool AnalysisConfig::shape_range_info_collected() const {
   return collect_shape_range_info_;
 }
@@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape(
   trt_tuned_dynamic_shape_ = true;
 }

-bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
+bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const {
   return trt_tuned_dynamic_shape_;
 }

-bool AnalysisConfig::trt_allow_build_at_runtime() {
+bool AnalysisConfig::trt_allow_build_at_runtime() const {
   return trt_allow_build_at_runtime_;
 }
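For orientation, here is a minimal usage sketch (not part of this commit) showing how the new switch surfaces to a user of the C++ inference API. It assumes the stock AnalysisConfig header shown in this diff, a placeholder model directory, and the default arguments of EnableTensorRtEngine(); names such as "./model_dir" are hypothetical.

// Hedged sketch: enabling TensorRT implicitly turns on engine context memory
// sharing when the linked TensorRT is >= 7.2 (see IS_TRT_VERSION_GE(7200)
// above); the new accessor lets callers check the effective setting.
#include <iostream>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");      // hypothetical model path
  config.EnableUseGpu(100 /*MB*/, 0);  // initial GPU memory pool, device id 0
  config.EnableTensorRtEngine();       // defaults; may set trt_engine_memory_sharing_
  std::cout << "trt_engine_memory_sharing: "
            << (config.trt_engine_memory_sharing() ? "true" : "false")
            << std::endl;
  return 0;
}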
paddle/fluid/inference/api/analysis_predictor.cc
(file mode changed 100755 → 100644)
@@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetTensorRtAllowBuildAtRuntime(
         config_.trt_allow_build_at_runtime());
     argument_.SetTensorRtUseInspector(config_.trt_use_inspector_);
+    argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
   }

   if (config_.dlnne_enabled()) {
@@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() {
     memory::Release(place_);
   }
   device_contexts_.clear();
+
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.trt_engine_memory_sharing()) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .releaseContextMemory(predictor_id_);
+  }
+#endif
 }

 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
paddle/fluid/inference/api/paddle_analysis_config.h
@@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief A boolean state telling whether the tensorrt engine memory sharing
+  /// is activated.
+  ///
+  /// \return bool Whether the tensorrt engine memory sharing is activated.
+  ///
+  bool trt_engine_memory_sharing() const;
+  ///
   /// \brief Get the TensorRT engine precision.
   ///
   /// \return Precision Get the TensorRT engine precision.
@@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \brief A boolean state telling whether to use tuned tensorrt dynamic
   /// shape.
   ///
-  bool tuned_tensorrt_dynamic_shape();
+  bool tuned_tensorrt_dynamic_shape() const;
   ///
   /// \brief A boolean state telling whether to allow building trt engine at
   /// runtime.
   ///
-  bool trt_allow_build_at_runtime();
+  bool trt_allow_build_at_runtime() const;
   ///
   /// \brief Set execution stream. If not set a stream will be created
@@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return the shape info path.
   ///
-  const std::string &shape_range_info_path();
+  const std::string &shape_range_info_path() const;
   ///
   /// \brief A boolean state telling whether to collect shape info.
   ///
   /// \return bool Whether to collect shape info.
   ///
-  bool shape_range_info_collected();
+  bool shape_range_info_collected() const;
   ///
   /// \brief Prevent ops running in Paddle-TRT
@@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
+  bool trt_engine_memory_sharing_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
paddle/fluid/inference/tensorrt/engine.cc
@@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() {
     optim_profiles_[i] = infer_builder_->createOptimizationProfile();
   }
 }

+nvinfer1::IExecutionContext *TensorRTEngine::context() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "You should build engine first and then set the context."));
+    // We may see trt warning: Profile 0 has been chosen by another
+    // IExecutionContext...
+    // It's ok. We will set it later.
+    nvinfer1::IExecutionContext *infer_context{nullptr};
+    if (context_memory_sharing_) {
+      infer_context =
+          infer_engine_->createExecutionContextWithoutDeviceMemory();
+    } else {
+      infer_context = infer_engine_->createExecutionContext();
+    }
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_context,
+        platform::errors::InvalidArgument(
+            "TensorRT engine can not build execution context."));
+    if (with_dynamic_shape_) {
+      // need new profile if it's not the first
+      if (cur_profile_num_ > 0) {
+        infer_context->setOptimizationProfile(cur_profile_num_);
+      }
+      profile_index_[predictor_id_per_thread] = cur_profile_num_;
+      ++cur_profile_num_;
+    }
+    infer_context_[predictor_id_per_thread].reset(infer_context);
+  }
+  return infer_context_[predictor_id_per_thread].get();
+}
+
 void TensorRTEngine::Execute(int batch_size,
                              std::vector<void *> *buffers,
                              cudaStream_t stream) {
   freshDeviceId();
   auto infer_context = context();
+  if (context_memory_sharing_) {
+    void *context_memory{nullptr};
+    context_memory =
+        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+            .getContextMemory(
+                predictor_id_per_thread,
+                phi::GPUPlace(device_id_),
+                phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+    infer_context->setDeviceMemory(context_memory);
+  }
   if (!with_dynamic_shape()) {
     infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
   } else {
@@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() {
     infer_context_.clear();
     cur_profile_num_ = 0;
   }
+  // for engine context memory sharing
+  if (context_memory_sharing_) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
+                                 predictor_id_per_thread);
+  }
   GetEngineInfo();
 }
@@ -417,6 +467,55 @@ std::unordered_map<std::string, nvinfer1::ITensor *>
   return &itensor_map_;
 }

+void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) {
+  freshDeviceId();
+  infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+
+  if (use_dla_) {
+    if (precision_ != AnalysisConfig::Precision::kInt8 &&
+        precision_ != AnalysisConfig::Precision::kHalf) {
+      LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
+                      "set float32, so DLA is not used.";
+    } else if (runtime->getNbDLACores() == 0) {
+      LOG(WARNING)
+          << "TensorRT DLA is set by config, but your device does not have "
+             "DLA, so DLA is not used.";
+    } else {
+      if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
+        dla_core_ = 0;
+        LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
+                     << runtime->getNbDLACores() << ", but got " << dla_core_
+                     << ", so use use 0 as default.";
+      }
+      runtime->setDLACore(dla_core_);
+      LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
+                << dla_core_;
+    }
+  }
+
+  infer_engine_.reset(runtime->deserializeCudaEngine(
+      engine_serialized_data.c_str(), engine_serialized_data.size()));
+
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_engine_,
+      platform::errors::Fatal(
+          "Building TRT cuda engine failed when deserializing engine info. "
+          "Please check:\n1. Your TRT serialization is generated and loaded "
+          "on the same GPU architecture;\n2. The Paddle Inference version of "
+          "generating serialization file and doing inference are "
+          "consistent."));
+
+  binding_num_ = infer_engine_->getNbBindings();
+  // for engine context memory sharing
+  if (context_memory_sharing_) {
+    inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+        .updateContextMemorySize(infer_engine_->getDeviceMemorySize(),
+                                 predictor_id_per_thread);
+  }
+  GetEngineInfo();
+}
+
 void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
   runtime_batch_ = batch_size;
 }
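The change above builds on a plain TensorRT pattern: contexts created with createExecutionContextWithoutDeviceMemory() own no scratch memory, so several of them can borrow one buffer sized to the largest getDeviceMemorySize() among the engines that share it. The following sketch (not Paddle code, and not part of this commit) illustrates that raw pattern with cudaMalloc standing in for Paddle's allocator; the helper names are hypothetical.

// Hedged sketch of the underlying TensorRT API usage, assuming two engines
// that never run concurrently on the same shared buffer.
#include <algorithm>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

void *createSharedContextMemory(nvinfer1::ICudaEngine *a,
                                nvinfer1::ICudaEngine *b) {
  // One allocation covering the worst case across all engines that share it.
  size_t bytes = std::max(a->getDeviceMemorySize(), b->getDeviceMemorySize());
  void *shared = nullptr;
  cudaMalloc(&shared, bytes);
  return shared;
}

nvinfer1::IExecutionContext *makeSharingContext(nvinfer1::ICudaEngine *engine,
                                                void *shared_memory) {
  // The context is created without its own device scratch memory...
  nvinfer1::IExecutionContext *ctx =
      engine->createExecutionContextWithoutDeviceMemory();
  // ...so it must be handed the shared buffer before enqueue()/enqueueV2().
  // Contexts sharing the buffer must not execute at the same time.
  ctx->setDeviceMemory(shared_memory);
  return ctx;
}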
paddle/fluid/inference/tensorrt/engine.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <NvInfer.h>
+#include <cstdint>
 #include <map>
 #include <memory>
 #include <mutex>  // NOLINT
@@ -37,6 +38,8 @@ limitations under the License. */
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/stream.h"
 #include "paddle/utils/any.h"

 namespace paddle {
@@ -171,7 +174,7 @@ class TRTInt8Calibrator;
 /*
  * TensorRT Engine.
  *
- * There are two alternative ways to use it, one is  to build from a paddle
+ * There are two alternative ways to use it, one is to build from a paddle
  * protobuf model, another way is to manually construct the network.
  */
 class TensorRTEngine {
@@ -287,51 +290,10 @@ class TensorRTEngine {
   std::unordered_map<std::string, nvinfer1::ITensor *> *GetITensorMap();

   nvinfer1::ICudaEngine *engine() { return infer_engine_.get(); }
-  nvinfer1::IExecutionContext *context() {
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
-    std::unique_lock<std::mutex> lock(mutex_);
-    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
-      PADDLE_ENFORCE_NOT_NULL(
-          infer_engine_,
-          platform::errors::InvalidArgument(
-              "You should build engine first and then set the context."));
-      // We may see trt warning: Profile 0 has been chosen by another
-      // IExecutionContext...
-      // It's ok. We will set it later.
-      infer_context_[predictor_id_per_thread].reset(
-          infer_engine_->createExecutionContext());
-      if (with_dynamic_shape_) {
-        // need new profile if it's not the first
-        if (cur_profile_num_ > 0) {
-          infer_context_[predictor_id_per_thread]->setOptimizationProfile(
-              cur_profile_num_);
-        }
-        profile_index_[predictor_id_per_thread] = cur_profile_num_;
-        ++cur_profile_num_;
-      }
-    }
-    return infer_context_[predictor_id_per_thread].get();
-  }
+  nvinfer1::IExecutionContext *context();

   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
-#ifndef PADDLE_WITH_TESTING
-      PADDLE_ENFORCE_GT(
-          predictor_id_per_thread,
-          -1,
-          platform::errors::InvalidArgument(
-              "thread local var predictor_id_per_thread must be "
-              "initialized to >= 0, but now predictor_id_per_thread = %d",
-              predictor_id_per_thread));
-#endif
       std::unique_lock<std::mutex> lock(mutex_);
       return profile_index_[predictor_id_per_thread];
     } else {
@@ -350,15 +312,6 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
-#ifndef PADDLE_WITH_TESTING
-    PADDLE_ENFORCE_GT(
-        predictor_id_per_thread,
-        -1,
-        platform::errors::InvalidArgument(
-            "thread local var predictor_id_per_thread must be "
-            "initialized to >= 0, but now predictor_id_per_thread = %d",
-            predictor_id_per_thread));
-#endif
     std::unique_lock<std::mutex> lock(mutex_);
     infer_context_[predictor_id_per_thread].reset(nullptr);
     infer_context_.erase(predictor_id_per_thread);
@@ -380,47 +333,7 @@ class TensorRTEngine {
     return ihost_memory_.get();
   }

-  void Deserialize(const std::string &engine_serialized_data) {
-    freshDeviceId();
-    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-
-    if (use_dla_) {
-      if (precision_ != AnalysisConfig::Precision::kInt8 &&
-          precision_ != AnalysisConfig::Precision::kHalf) {
-        LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
-                        "set float32, so DLA is not used.";
-      } else if (runtime->getNbDLACores() == 0) {
-        LOG(WARNING)
-            << "TensorRT DLA is set by config, but your device does not have "
-               "DLA, so DLA is not used.";
-      } else {
-        if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
-          dla_core_ = 0;
-          LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < "
-                       << runtime->getNbDLACores() << ", but got " << dla_core_
-                       << ", so use use 0 as default.";
-        }
-        runtime->setDLACore(dla_core_);
-        LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
-                  << dla_core_;
-      }
-    }
-
-    infer_engine_.reset(runtime->deserializeCudaEngine(
-        engine_serialized_data.c_str(), engine_serialized_data.size()));
-
-    PADDLE_ENFORCE_NOT_NULL(
-        infer_engine_,
-        platform::errors::Fatal(
-            "Building TRT cuda engine failed when deserializing engine info. "
-            "Please check:\n1. Your TRT serialization is generated and loaded "
-            "on the same GPU architecture;\n2. The Paddle Inference version of "
-            "generating serialization file and doing inference are "
-            "consistent."));
-
-    binding_num_ = infer_engine_->getNbBindings();
-    GetEngineInfo();
-  }
+  void Deserialize(const std::string &engine_serialized_data);

   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
@@ -694,6 +607,10 @@ class TensorRTEngine {
   void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
   void SetScope(const framework::Scope &scope) { scope_ = &scope; }

+  void SetContextMemorySharing(bool context_memory_sharing) {
+    context_memory_sharing_ = context_memory_sharing;
+  }
+
 private:
   // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
   // ensure that the thread is associated with the correct device by calling
@@ -714,6 +631,9 @@ class TensorRTEngine {
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};

+  // use for engine context memory sharing
+  bool context_memory_sharing_{false};
+
   int device_id_;
   int max_profile_num_{1};
   int cur_profile_num_{0};
@@ -791,14 +711,23 @@ class TensorRTEngine {
   engine__->network()->add##layer__(__VA_ARGS__)

 class TRTEngineManager {
+  using PredictorID = int;
+  using AllocationPtr = phi::Allocator::AllocationPtr;
+
  public:
-  bool Empty() const { return engines_.size() == 0; }
+  bool Empty() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return engines_.size() == 0;
+  }
+
   bool Has(const std::string &name) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     if (engines_.count(name) == 0) return false;
     return engines_.at(name).get() != nullptr;
   }

   TensorRTEngine *Get(const std::string &name) const {
+    std::lock_guard<std::mutex> lock(mutex_);
     return engines_.at(name).get();
   }
@@ -826,17 +755,21 @@ class TRTEngineManager {
         disable_trt_plugin_fp16,
         model_precision,
         logger);
+    std::lock_guard<std::mutex> lock(mutex_);
     engines_[name].reset(p);
     return p;
   }

   void DeleteAll() {
+    std::lock_guard<std::mutex> lock(mutex_);
     for (auto &item : engines_) {
       item.second.reset(nullptr);
     }
+    engines_.clear();
   }

   void DeleteKey(const std::string &key) {
+    std::lock_guard<std::mutex> lock(mutex_);
     auto iter = engines_.find(key);
     if (iter != engines_.end()) {
       iter->second.reset(nullptr);
@@ -844,7 +777,57 @@ class TRTEngineManager {
     }
   }

+  void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) {
+    bool size_updated{false};
+
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (max_ctx_mem_size_ < mem_size) {
+        max_ctx_mem_size_ = mem_size;
+        size_updated = true;
+      }
+    }
+
+    if (size_updated) {
+      releaseContextMemory(predictor_id);
+    }
+  }
+
+  void *getContextMemory(PredictorID predictor_id,
+                         const phi::GPUPlace &place,
+                         const phi::Stream &stream) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    static auto alignment = getAlignmentSize(place);
+    if (context_memorys_.count(predictor_id) == 0) {
+      auto context_memory =
+          memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
+      // context_memory_[predictor_id].reset(context_memory.release());
+      context_memorys_[predictor_id] = std::move(context_memory);
+    }
+    return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
+  }
+
+  void releaseContextMemory(PredictorID predictor_id) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (context_memorys_.count(predictor_id)) {
+      context_memorys_[predictor_id].reset(nullptr);
+      context_memorys_.erase(predictor_id);
+    }
+  }
+
  private:
+  size_t getAlignmentSize(const phi::GPUPlace &place) {
+    const auto &prop = platform::GetDeviceProperties(place.GetDeviceId());
+    return prop.textureAlignment;
+  }
+
+  void *getAlignedMemory(void *addr, size_t alignment) {
+    return reinterpret_cast<void *>(uintptr_t(addr) & (~(alignment - 1)));
+  }
+
+  mutable std::mutex mutex_;
+  size_t max_ctx_mem_size_{0};
+  std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
   std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
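The bookkeeping idea in the TRTEngineManager additions is a single high-water mark: updateContextMemorySize() only grows max_ctx_mem_size_ and, when it does, drops the caller's stale buffer so the next getContextMemory() re-allocates at the new size; each predictor then lazily gets one buffer of that size. The following standalone sketch (not part of the commit) models just that logic, with std::malloc standing in for memory::Alloc, no alignment handling, and hypothetical names (ContextMemoryPool, UpdateSize, Get, Release).

// Hedged sketch of the high-water-mark + lazy-allocation pattern, assuming
// callers invoke UpdateSize() for every engine before Get() is used.
#include <cstdlib>
#include <mutex>
#include <unordered_map>

class ContextMemoryPool {
 public:
  void UpdateSize(size_t mem_size, int predictor_id) {
    bool grown = false;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (max_size_ < mem_size) {
        max_size_ = mem_size;
        grown = true;
      }
    }
    // Release outside the lock (the real code does the same, which also
    // avoids re-locking a non-recursive mutex).
    if (grown) Release(predictor_id);  // stale buffer is too small now
  }

  void *Get(int predictor_id) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto &buf = buffers_[predictor_id];
    if (buf == nullptr) buf = std::malloc(max_size_);  // lazy, one per predictor
    return buf;
  }

  void Release(int predictor_id) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = buffers_.find(predictor_id);
    if (it != buffers_.end()) {
      std::free(it->second);
      buffers_.erase(it);
    }
  }

 private:
  std::mutex mutex_;
  size_t max_size_{0};
  std::unordered_map<int, void *> buffers_;
};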
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");

-    int num_inputs = 0;
-
-    // Get the total over all profiles
-    num_inputs += runtime_input_names_.size();
-    // const int num_bindings = num_inputs + Outputs("Ys").size();
-    // std::vector<void *> buffers(num_bindings);
+    // This method returns the total over all profiles.
     const int num_bindings = engine->GetNbBindings();
     std::vector<void *> buffers(num_bindings, nullptr);