Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
339c34e6
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
339c34e6
编写于
12月 30, 2021
作者:
W
wenbin
提交者:
GitHub
12月 30, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
dynamic shape clone (#38520)
* dynamic shape clone supported
上级
ebc72ac2
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
248 addition
and
54 deletion
+248
-54
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+4
-0
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+36
-0
paddle/fluid/framework/naive_executor.h
paddle/fluid/framework/naive_executor.h
+2
-0
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...ence/analysis/passes/ir_params_sync_among_devices_pass.cc
+11
-2
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+1
-0
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+1
-0
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+40
-28
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+35
-1
paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+82
-0
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+36
-23
未找到文件。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
339c34e6
...
...
@@ -275,7 +275,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
variable_helper SRCS variable_helper.cc DEPS lod_tensor
)
# naive_executor is declared twice with the same sources and dependency list;
# the TENSORRT_FOUND branch additionally depends on tensorrt_engine_op.
if
(
TENSORRT_FOUND
)
cc_library
(
naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op
)
else
()
cc_library
(
naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper
)
endif
(
TENSORRT_FOUND
)
cc_library
(
executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper
)
if
(
WITH_DISTRIBUTE
)
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
339c34e6
...
...
@@ -20,6 +20,9 @@
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -132,5 +135,38 @@ NaiveExecutor::~NaiveExecutor() {
#endif
}
// Re-prepares every "tensorrt_engine" op held by this executor whose engine
// runs with dynamic shape.
//
// For each such op the engine registered under engine_key + predictor_id is
// fetched from the TRTEngineManager singleton, its context and tensor map are
// cleared, its profile count is set to `num`, and the engine is prepared again
// against the top-most ancestor of scope_ (or scope_ itself when it has no
// parent).
//
// Compiles to a no-op unless PADDLE_WITH_TENSORRT evaluates to non-zero.
void NaiveExecutor::ResetTrtOps(int num) {
#if PADDLE_WITH_TENSORRT
  for (auto &candidate : ops_) {
    if (candidate->Type() != "tensorrt_engine") {
      continue;
    }

    auto *engine_op =
        dynamic_cast<operators::TensorRTEngineOp *>(candidate.get());
    // NOTE(review): a failed cast stops the whole scan instead of skipping
    // this op, so later TensorRT ops are left untouched -- confirm intended.
    if (engine_op == nullptr) {
      return;
    }

    // Engines are registered under engine_key followed by the predictor id.
    std::string key_prefix = engine_op->Attr<std::string>("engine_key");
    int predictor_id = engine_op->Attr<int>("predictor_id");
    std::string registry_name = key_prefix + std::to_string(predictor_id);

    operators::TensorRTEngine *engine =
        paddle::inference::Singleton<
            inference::tensorrt::TRTEngineManager>::Global()
            .Get(registry_name);
    if (!engine->with_dynamic_shape()) {
      continue;
    }

    LOG(INFO) << "rebuild trt engine, this may cost a lot of time!";
    engine->ResetContext();
    engine->ClearTensorMap();
    engine->SetProfileNum(num);

    // Climb to the root of the scope hierarchy; a parentless scope_ is its
    // own root.
    const Scope *root_scope = scope_;
    while (root_scope->parent() != nullptr) {
      root_scope = root_scope->parent();
    }
    engine_op->PrepareTRTEngine(*root_scope, engine);
  }
#endif
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/naive_executor.h
浏览文件 @
339c34e6
...
...
@@ -63,6 +63,8 @@ class NaiveExecutor {
void
CleanFeedFetchOps
();
// Re-prepares the TensorRT engine ops held by this executor; `num` is
// forwarded to each dynamic-shape engine as its profile count.
void
ResetTrtOps
(
int
num
);
protected:
void
CreateOps
(
const
ProgramDesc
&
desc
,
int
block_id
,
bool
with_feed_fetch_ops
);
...
...
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
浏览文件 @
339c34e6
...
...
@@ -56,8 +56,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
bool
reserve_cpu_weights
=
false
;
if
(
argument
->
tensorrt_allow_build_at_runtime_valid
()
&&
argument
->
tensorrt_allow_build_at_runtime
())
{
bool
with_dynamic_shape
=
false
;
if
(
argument
->
Has
(
"max_input_shape"
)
&&
argument
->
Has
(
"min_input_shape"
)
&&
argument
->
Has
(
"optim_input_shape"
))
{
with_dynamic_shape
=
(
argument
->
max_input_shape
().
size
()
>
0
&&
argument
->
min_input_shape
().
size
()
>
0
&&
argument
->
optim_input_shape
().
size
()
>
0
);
}
with_dynamic_shape
=
with_dynamic_shape
||
(
argument
->
Has
(
"tensorrt_tuned_dynamic_shape"
)
&&
argument
->
tensorrt_tuned_dynamic_shape
());
if
(
with_dynamic_shape
)
{
reserve_cpu_weights
=
true
;
}
for
(
auto
&
var_name
:
all_vars
)
{
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
339c34e6
...
...
@@ -1344,6 +1344,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
std
::
lock_guard
<
std
::
mutex
>
lk
(
clone_mutex_
);
auto
*
x
=
new
AnalysisPredictor
(
config_
);
x
->
Init
(
scope_
,
inference_program_
);
x
->
executor_
->
ResetTrtOps
(
++
x
->
clone_num_
);
return
std
::
unique_ptr
<
PaddlePredictor
>
(
x
);
}
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
339c34e6
...
...
@@ -435,6 +435,7 @@ class AnalysisPredictor : public PaddlePredictor {
bool
status_is_cloned_
{
false
};
std
::
map
<
std
::
string
,
std
::
vector
<
std
::
vector
<
int32_t
>>>
shape_info_
;
int
clone_num_
{
1
};
};
}
// namespace paddle
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
339c34e6
...
...
@@ -42,7 +42,10 @@ void TensorRTEngine::InitNetwork() {
}
infer_builder_config_
.
reset
(
infer_builder_
->
createBuilderConfig
());
optim_profile_
=
infer_builder_
->
createOptimizationProfile
();
// optim_profile_ = infer_builder_->createOptimizationProfile();
optim_profiles_
.
resize
(
max_profile_num_
);
for
(
int
i
=
0
;
i
<
max_profile_num_
;
i
++
)
optim_profiles_
[
i
]
=
infer_builder_
->
createOptimizationProfile
();
}
void
TensorRTEngine
::
Execute
(
int
batch_size
,
std
::
vector
<
void
*>
*
buffers
,
...
...
@@ -199,35 +202,38 @@ void TensorRTEngine::FreezeNetwork() {
if
(
with_dynamic_shape_
)
{
#if IS_TRT_VERSION_GE(6000)
LOG
(
INFO
)
<<
"Run Paddle-TRT Dynamic Shape mode."
;
for
(
auto
&
input
:
min_input_shape_
)
{
for
(
int
i
=
0
;
i
<
max_profile_num_
;
i
++
)
{
for
(
auto
&
input
:
min_input_shape_
)
{
#if IS_TRT_VERSION_LT(7000)
// trt6 will check all_of input > 0
if
(
!
(
std
::
all_of
(
input
.
second
.
begin
(),
input
.
second
.
end
(),
[](
int
x
)
{
return
x
>
0
;
})
&&
std
::
all_of
(
max_input_shape_
[
input
.
first
].
begin
(),
max_input_shape_
[
input
.
first
].
end
(),
[](
int
x
)
{
return
x
>
0
;
})
&&
std
::
all_of
(
optim_input_shape_
[
input
.
first
].
begin
(),
optim_input_shape_
[
input
.
first
].
end
(),
[](
int
x
)
{
return
x
>
0
;
})))
{
continue
;
}
// trt6 will check all_of input > 0
if
(
!
(
std
::
all_of
(
input
.
second
.
begin
(),
input
.
second
.
end
(),
[](
int
x
)
{
return
x
>
0
;
})
&&
std
::
all_of
(
max_input_shape_
[
input
.
first
].
begin
(),
max_input_shape_
[
input
.
first
].
end
(),
[](
int
x
)
{
return
x
>
0
;
})
&&
std
::
all_of
(
optim_input_shape_
[
input
.
first
].
begin
(),
optim_input_shape_
[
input
.
first
].
end
(),
[](
int
x
)
{
return
x
>
0
;
})))
{
continue
;
}
#endif
VLOG
(
4
)
<<
"TRT dynamic_shape set "
<<
input
.
first
<<
" min: "
<<
Vec2Str
(
input
.
second
)
<<
", max: "
<<
Vec2Str
(
max_input_shape_
[
input
.
first
])
<<
", opt: "
<<
Vec2Str
(
optim_input_shape_
[
input
.
first
]);
optim_profile_
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kMIN
,
Vec2TRT_Dims
(
input
.
second
,
input
.
first
,
true
));
optim_profile_
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kMAX
,
Vec2TRT_Dims
(
max_input_shape_
[
input
.
first
],
input
.
first
,
true
));
optim_profile_
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kOPT
,
Vec2TRT_Dims
(
optim_input_shape_
[
input
.
first
],
input
.
first
,
true
));
VLOG
(
4
)
<<
"TRT dynamic_shape set "
<<
input
.
first
<<
" min: "
<<
Vec2Str
(
input
.
second
)
<<
", max: "
<<
Vec2Str
(
max_input_shape_
[
input
.
first
])
<<
", opt: "
<<
Vec2Str
(
optim_input_shape_
[
input
.
first
]);
optim_profiles_
[
i
]
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kMIN
,
Vec2TRT_Dims
(
input
.
second
,
input
.
first
,
true
));
optim_profiles_
[
i
]
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kMAX
,
Vec2TRT_Dims
(
max_input_shape_
[
input
.
first
],
input
.
first
,
true
));
optim_profiles_
[
i
]
->
setDimensions
(
input
.
first
.
c_str
(),
nvinfer1
::
OptProfileSelector
::
kOPT
,
Vec2TRT_Dims
(
optim_input_shape_
[
input
.
first
],
input
.
first
,
true
));
}
infer_builder_config_
->
addOptimizationProfile
(
optim_profiles_
[
i
]);
}
infer_builder_config_
->
addOptimizationProfile
(
optim_profile_
);
if
(
WithFp16
()
&&
disable_trt_plugin_fp16
())
{
LOG
(
INFO
)
<<
"NOTE: In order to achieve higher accuracy, you have "
"disabled the fp16 mode of TRT Plugin,
\n
"
...
...
@@ -237,7 +243,6 @@ void TensorRTEngine::FreezeNetwork() {
}
#endif
}
#if IS_TRT_VERSION_GE(8200)
infer_builder_config_
->
setProfilingVerbosity
(
nvinfer1
::
ProfilingVerbosity
::
kDETAILED
);
...
...
@@ -260,6 +265,13 @@ void TensorRTEngine::FreezeNetwork() {
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."
));
binding_num_
=
infer_engine_
->
getNbBindings
();
// reset status for dynamic shape clone
if
(
max_profile_num_
>
1
)
{
infer_context_
.
clear
();
cur_profile_num_
=
0
;
}
GetEngineInfo
();
}
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
339c34e6
...
...
@@ -253,10 +253,38 @@ class TensorRTEngine {
infer_engine_
,
platform
::
errors
::
InvalidArgument
(
"You should build engine first and then set the context."
));
// We may see trt warning: Profile 0 has been chosen by another
// IExecutionContext...
// It's ok. We will set it later.
infer_context_
[
tid
].
reset
(
infer_engine_
->
createExecutionContext
());
if
(
with_dynamic_shape_
)
{
// need new profile if it's not the first
if
(
cur_profile_num_
>
0
)
{
infer_context_
[
tid
]
->
setOptimizationProfile
(
cur_profile_num_
);
}
profile_index_
[
tid
]
=
cur_profile_num_
;
++
cur_profile_num_
;
}
}
return
infer_context_
[
tid
].
get
();
}
// Returns the optimization-profile index recorded for the calling thread.
// With a single profile (max_profile_num_ <= 1) the answer is always 0 and no
// lock is taken. Otherwise the lookup is done under mutex_; a thread with no
// recorded entry gets a value-initialized (0) entry inserted by operator[].
int GetProfileIndex() {
  if (max_profile_num_ <= 1) {
    return 0;
  }
  std::unique_lock<std::mutex> guard(mutex_);
  return profile_index_[std::this_thread::get_id()];
}
// Returns the offset of the calling thread's bindings inside the engine's
// binding table: (binding_num_ / max_profile_num_) bindings per profile,
// multiplied by the thread's profile index.
int GetBindingsOffset() {
  const int bindings_per_profile = binding_num_ / max_profile_num_;
  return bindings_per_profile * GetProfileIndex();
}
// Returns binding_num_, the value cached from infer_engine_->getNbBindings().
// NOTE(review): the call site describes this as the total over all profiles,
// not the per-profile count -- confirm against the TensorRT documentation.
int
GetNbBindings
()
{
return
binding_num_
;
}
void
ResetContext
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
const
std
::
thread
::
id
tid
=
std
::
this_thread
::
get_id
();
...
...
@@ -322,6 +350,7 @@ class TensorRTEngine {
"generating serialization file and doing inference are "
"consistent."
));
binding_num_
=
infer_engine_
->
getNbBindings
();
GetEngineInfo
();
}
...
...
@@ -540,6 +569,7 @@ class TensorRTEngine {
}
}
// Sets max_profile_num_, the number of optimization profiles this engine
// works with.
// NOTE(review): `num` is stored unchecked; max_profile_num_ is used as a
// divisor in GetBindingsOffset(), so callers presumably must pass num >= 1.
void
SetProfileNum
(
int
num
)
{
max_profile_num_
=
num
;
}
void
GetEngineInfo
()
{
#if IS_TRT_VERSION_GE(8200)
std
::
unique_ptr
<
nvinfer1
::
IEngineInspector
>
infer_inspector
(
...
...
@@ -571,6 +601,9 @@ class TensorRTEngine {
int
batch_size_
{
-
1
};
int
device_id_
;
int
max_profile_num_
{
1
};
int
cur_profile_num_
{
0
};
std
::
unordered_map
<
std
::
thread
::
id
,
int
>
profile_index_
;
ShapeMapType
min_input_shape_
;
ShapeMapType
max_input_shape_
;
ShapeMapType
optim_input_shape_
;
...
...
@@ -614,8 +647,9 @@ class TensorRTEngine {
// For dynamic shape
bool
with_dynamic_shape_
{
false
};
#if IS_TRT_VERSION_GE(6000)
int
binding_num_
;
infer_ptr
<
nvinfer1
::
IBuilderConfig
>
infer_builder_config_
;
nvinfer1
::
IOptimizationProfile
*
optim_profile
_
;
std
::
vector
<
nvinfer1
::
IOptimizationProfile
*>
optim_profiles
_
;
std
::
vector
<
std
::
unique_ptr
<
plugin
::
DynamicPluginTensorRT
>>
owned_pluginv2_
;
#endif
std
::
mutex
mutex_
;
...
...
paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
浏览文件 @
339c34e6
...
...
@@ -207,6 +207,87 @@ void TestTunedDynamic() {
check_func
(
test_predictor
.
get
());
}
void
TestDynamicClone
(
bool
with_dynamic
=
true
,
bool
delete_cache
=
true
,
bool
delete_conv_bn
=
false
)
{
std
::
string
model_dir
=
FLAGS_infer_model
+
"/conv_bn_swish_split_gelu/conv_bn_swish_split_gelu"
;
std
::
string
opt_cache_dir
=
model_dir
+
"/my_cache"
;
if
(
delete_cache
)
{
delete_cache_files
(
opt_cache_dir
);
}
AnalysisConfig
config
;
config
.
EnableUseGpu
(
100
,
0
);
std
::
string
buffer_prog
,
buffer_param
;
ReadBinaryFile
(
model_dir
+
"/model"
,
&
buffer_prog
);
ReadBinaryFile
(
model_dir
+
"/params"
,
&
buffer_param
);
config
.
SetModelBuffer
(
&
buffer_prog
[
0
],
buffer_prog
.
size
(),
&
buffer_param
[
0
],
buffer_param
.
size
());
config
.
SetOptimCacheDir
(
opt_cache_dir
);
config
.
SwitchUseFeedFetchOps
(
false
);
// Set the input's min, max, opt shape
config
.
EnableTensorRtEngine
(
1
<<
30
,
1
,
1
,
AnalysisConfig
::
Precision
::
kFloat32
,
false
,
false
);
if
(
delete_conv_bn
)
{
config
.
pass_builder
()
->
DeletePass
(
"conv_bn_fuse_pass"
);
}
if
(
with_dynamic
)
{
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
min_input_shape
=
{
{
"image"
,
{
1
,
1
,
3
,
3
}}};
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
=
{
{
"image"
,
{
1
,
1
,
10
,
10
}}};
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
opt_input_shape
=
{
{
"image"
,
{
1
,
1
,
3
,
3
}}};
config
.
SetTRTDynamicShapeInfo
(
min_input_shape
,
max_input_shape
,
opt_input_shape
);
}
auto
predictor
=
CreatePaddlePredictor
(
config
);
auto
input_names
=
predictor
->
GetInputNames
();
int
channels
=
1
;
int
height
=
3
;
int
width
=
3
;
int
input_num
=
channels
*
height
*
width
*
1
;
float
*
input
=
new
float
[
input_num
];
memset
(
input
,
0
,
input_num
*
sizeof
(
float
));
auto
input_t
=
predictor
->
GetInputTensor
(
input_names
[
0
]);
input_t
->
Reshape
({
1
,
channels
,
height
,
width
});
input_t
->
copy_from_cpu
(
input
);
ASSERT_TRUE
(
predictor
->
ZeroCopyRun
());
std
::
vector
<
float
>
out_data
;
auto
output_names
=
predictor
->
GetOutputNames
();
auto
output_t
=
predictor
->
GetOutputTensor
(
output_names
[
0
]);
std
::
vector
<
int
>
output_shape
=
output_t
->
shape
();
int
out_num
=
std
::
accumulate
(
output_shape
.
begin
(),
output_shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
out_data
.
resize
(
out_num
);
output_t
->
copy_to_cpu
(
out_data
.
data
());
auto
predictor2
=
predictor
->
Clone
();
auto
input_t2
=
predictor2
->
GetInputTensor
(
input_names
[
0
]);
input_t2
->
Reshape
({
1
,
channels
,
height
,
width
});
input_t2
->
copy_from_cpu
(
input
);
ASSERT_TRUE
(
predictor2
->
ZeroCopyRun
());
std
::
vector
<
float
>
out_data2
;
auto
output_t2
=
predictor2
->
GetOutputTensor
(
output_names
[
0
]);
std
::
vector
<
int
>
output_shape2
=
output_t2
->
shape
();
int
out_num2
=
std
::
accumulate
(
output_shape2
.
begin
(),
output_shape2
.
end
(),
1
,
std
::
multiplies
<
int
>
());
out_data2
.
resize
(
out_num2
);
output_t2
->
copy_to_cpu
(
out_data2
.
data
());
ASSERT_TRUE
(
out_data2
.
size
()
==
out_data
.
size
());
for
(
size_t
i
=
0
;
i
<
out_data
.
size
();
i
++
)
{
EXPECT_NEAR
(
out_data2
[
i
],
out_data
[
i
],
1e-5
);
}
}
// Invokes TestDynamic with `true`.
// NOTE(review): presumably selects the dynamic-shape path -- confirm against
// TestDynamic's definition.
TEST
(
AnalysisPredictor
,
trt_dynamic
)
{
TestDynamic
(
true
);
}
// Invokes TestDynamic with `false`.
// NOTE(review): presumably selects the static-shape path -- confirm against
// TestDynamic's definition.
TEST
(
AnalysisPredictor
,
trt_static
)
{
TestDynamic
(
false
);
}
TEST
(
AnalysisPredictor
,
trt_memory_serialize
)
{
...
...
@@ -218,6 +299,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
// Invokes TestDynamic2 with no arguments.
TEST
(
AnalysisPredictor
,
trt_dynamic2
)
{
TestDynamic2
();
}
// Invokes TestTunedDynamic with no arguments.
TEST
(
AnalysisPredictor
,
trt_tuned_dynamic
)
{
TestTunedDynamic
();
}
// Invokes TestDynamicClone with its default arguments (with_dynamic = true,
// delete_cache = true, delete_conv_bn = false).
TEST
(
AnalysisPredictor
,
trt_dynamic_clone
)
{
TestDynamicClone
();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
浏览文件 @
339c34e6
...
...
@@ -250,6 +250,23 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
}
// Builds `engine` from this op's serialized sub-block.
//
// The "subgraph" attribute is parsed into a BlockDesc and handed to the
// OpConverter singleton together with the op's "Xs" inputs, param_names_ and
// the "output_name_mapping" attribute.
//
// scope  - scope passed through to the converter.
// engine - TensorRT engine that receives the converted block.
void PrepareTRTEngine(const framework::Scope &scope,
                      TensorRTEngine *engine) const {
  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
               "kernel etc). This process may cost a lot of time.";

  // Rebuild the block description from its serialized form.
  framework::proto::BlockDesc subgraph_proto;
  subgraph_proto.ParseFromString(Attr<std::string>("subgraph"));
  framework::BlockDesc subgraph_desc(nullptr, &subgraph_proto);

  std::vector<std::string> input_names = Inputs("Xs");
  std::vector<std::string> output_names =
      Attr<std::vector<std::string>>("output_name_mapping");

  auto &converter =
      inference::Singleton<inference::tensorrt::OpConverter>::Global();
  converter.ConvertBlockToTRTEngine(&subgraph_desc, scope, input_names,
                                    param_names_, output_names, engine);
}
protected:
void
RunNativeImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
{
...
...
@@ -414,8 +431,19 @@ class TensorRTEngineOp : public framework::OperatorBase {
int
num_inputs
=
0
;
num_inputs
+=
runtime_input_names_
.
size
();
const
int
num_bindings
=
num_inputs
+
Outputs
(
"Ys"
).
size
();
std
::
vector
<
void
*>
buffers
(
num_bindings
);
// const int num_bindings = num_inputs + Outputs("Ys").size();
// std::vector<void *> buffers(num_bindings);
// This method returns the total over all profiles.
const
int
num_bindings
=
engine
->
GetNbBindings
();
std
::
vector
<
void
*>
buffers
(
num_bindings
,
nullptr
);
int
binding_offset
=
0
;
nvinfer1
::
IExecutionContext
*
trt_context
=
nullptr
;
if
(
engine
->
with_dynamic_shape
())
{
// Initialize context and get offset by profile index
trt_context
=
engine
->
context
();
binding_offset
=
engine
->
GetBindingsOffset
();
}
// Bind input tensor to TRT.
for
(
const
auto
&
x
:
runtime_input_names_
)
{
...
...
@@ -430,7 +458,10 @@ class TensorRTEngineOp : public framework::OperatorBase {
t
.
ShareDataWith
(
out
);
}
auto
t_shape
=
framework
::
vectorize
<
int64_t
>
(
t
.
dims
());
const
int
bind_index
=
engine
->
engine
()
->
getBindingIndex
(
x
.
c_str
());
// const int bind_index = engine->engine()->getBindingIndex(x.c_str());
// Get index of profile 0 first, then plus binding offset
const
int
bind_index
=
engine
->
engine
()
->
getBindingIndex
(
x
.
c_str
())
+
binding_offset
;
PADDLE_ENFORCE_LT
(
bind_index
,
num_bindings
,
platform
::
errors
::
InvalidArgument
(
...
...
@@ -474,7 +505,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
}
else
{
#if IS_TRT_VERSION_GE(6000)
auto
*
trt_context
=
engine
->
context
();
trt_context
->
setBindingDimensions
(
bind_index
,
inference
::
tensorrt
::
Vec2TRT_Dims
(
t_shape
,
x
,
true
));
#endif
...
...
@@ -500,7 +530,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
VLOG
(
4
)
<<
"TensorRT Engine Op Outputs:"
;
for
(
const
auto
&
y
:
Outputs
(
"Ys"
))
{
const
int
bind_index
=
engine
->
engine
()
->
getBindingIndex
(
output_maps
[
output_index
].
c_str
());
engine
->
engine
()
->
getBindingIndex
(
output_maps
[
output_index
].
c_str
())
+
binding_offset
;
std
::
vector
<
int
>
ddim
;
if
(
!
engine
->
with_dynamic_shape
())
{
...
...
@@ -511,7 +542,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
}
else
{
#if IS_TRT_VERSION_GE(6000)
auto
*
trt_context
=
engine
->
context
();
auto
dims
=
trt_context
->
getBindingDimensions
(
bind_index
);
int
nb_dims
=
dims
.
nbDims
;
for
(;
nb_dims
>
0
;
nb_dims
--
)
{
...
...
@@ -583,23 +613,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
return
trt_engine_
;
}
// Builds `engine` from this op's serialized sub-block.
//
// The "subgraph" attribute is parsed into a BlockDesc and handed to the
// OpConverter singleton together with the op's "Xs" inputs, param_names_ and
// the "output_name_mapping" attribute.
//
// scope  - scope passed through to the converter.
// engine - TensorRT engine that receives the converted block.
void PrepareTRTEngine(const framework::Scope &scope,
                      TensorRTEngine *engine) const {
  LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
               "kernel etc). This process may cost a lot of time.";

  // Rebuild the block description from its serialized form.
  framework::proto::BlockDesc subgraph_proto;
  subgraph_proto.ParseFromString(Attr<std::string>("subgraph"));
  framework::BlockDesc subgraph_desc(nullptr, &subgraph_proto);

  std::vector<std::string> input_names = Inputs("Xs");
  std::vector<std::string> output_names =
      Attr<std::vector<std::string>>("output_name_mapping");

  auto &converter =
      inference::Singleton<inference::tensorrt::OpConverter>::Global();
  converter.ConvertBlockToTRTEngine(&subgraph_desc, scope, input_names,
                                    param_names_, output_names, engine);
}
};
}
// namespace operators
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录