Commit bcbfbbd7 · MegEngine 天元 / MegEngine
Authored Mar 17, 2021 by Megvii Engine Team
Parent: 702ed9ee

fix(mgb): fix TensorRT runtime opr profiling

GitOrigin-RevId: 3545aa53b2ee215e64d22c89e94171fadb6b11b0
Showing 4 changed files with 4 additions and 154 deletions:

    src/tensorrt/impl/tensorrt_opr.cpp                      +2  -44
    src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h   +2   -2
    src/tensorrt/test/tensorrt.cpp                          +0  -44
    src/tensorrt/test/tensorrt_runtime.cpp                  +0  -64
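In short: the commit removes the MGB_ENABLE_JSON-only profiling path, in which TensorRTManager::exec looked up an opr_profile::OprProfileHolder in the graph's user data and stored per-layer times as JSON, and replaces it with an explicit use_trt_profiler argument. When the flag is set, a TensorRTProfiler is attached to the execution context and layer times are printed directly; the two tests that validated the old JSON output (Profile and RuntimeProfile) are deleted along with the m_has_profiler state flag.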
src/tensorrt/impl/tensorrt_opr.cpp

@@ -50,17 +50,6 @@ void TensorRTProfiler::print_layer_times() {
     printf("Total time: %4.3fms\n", total_time);
 }
 
-std::shared_ptr<json::Value> TensorRTProfiler::to_json() {
-    using namespace json;
-    auto prof_arr = Array::make();
-    for (auto&& rec : profile) {
-        auto&& item = Array::make();
-        item->add(String::make(rec.first));
-        item->add(Number::make(rec.second));
-        prof_arr->add(item);
-    }
-    return prof_arr;
-}
 #endif  // MGB_ENABLE_JSON
@@ -168,7 +157,7 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
 void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                            CompNode comp_node_check,
                            nvinfer1::ICudaEngine* engine,
-                           size_t batch) {
+                           size_t batch, bool use_trt_profiler) {
     auto comp_node = opr->comp_node();
     // ICudaEngine is bound to the currently active device
@@ -180,22 +169,11 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
                 comp_node_check.to_string().c_str(),
                 comp_node.to_string().c_str());
     }
-#if MGB_ENABLE_JSON
-    auto pf_holder_pair =
-            opr->owner_graph()
-                    ->options()
-                    .user_data.get_user_data<opr_profile::OprProfileHolder>();
-    if (m_has_profiler && !pf_holder_pair.second) {
-        m_context.reset();
-        m_has_profiler = false;
-    }
-#endif
     auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
     bool should_reinit_device_memory =
             !m_context || m_device_workspace_memory_ptr != workspace_ptr;
     if (!m_context) {
         m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
-        m_has_profiler = false;
     }
     m_trt_iobuf.resize(opr->input().size() + opr->output().size() - 1);
     bool is_trt_opr = false;
@@ -235,11 +213,7 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
     bool exec_success = false;
 
-#if MGB_ENABLE_JSON
-    if (!pf_holder_pair.second) {
-        mgb_assert(!m_has_profiler,
-                   "Invalid state of TensorRTRuntimeOpr: should not have "
-                   "profiler.");
+    if (!use_trt_profiler) {
 #if NV_TENSOR_RT_VERSION >= 6001
         if (is_trt_opr)
             exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
@@ -255,7 +229,6 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
     } else {
         TensorRTProfiler trt_profiler;
         m_context->setProfiler(&trt_profiler);
-        m_has_profiler = true;
         // TensorRT documentation stated that IExecutionContext->execute
         // "Synchronously execute inference on a batch", and it does not take a
         // cudaStream_t, we expect it do a device synchronize. But it seems like
@@ -272,24 +245,9 @@ void TensorRTManager::exec(cg::SingleCNOperatorNodeBase* opr,
         exec_success = m_context->execute(batch, m_trt_iobuf.data());
 #endif
         mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-        pf_holder_pair.first[0]->id2object_map[opr] = trt_profiler.to_json();
         printf("TRT profile info of opr %s:\n", opr->name().c_str());
         trt_profiler.print_layer_times();
     }
-#else
-#if NV_TENSOR_RT_VERSION >= 6001
-    if (is_trt_opr)
-        exec_success = m_context->enqueueV2(m_trt_iobuf.data(),
-                                            env.cuda_env().stream, nullptr);
-    else
-        exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                          env.cuda_env().stream, nullptr);
-#else
-    exec_success = m_context->enqueue(batch, m_trt_iobuf.data(),
-                                      env.cuda_env().stream, nullptr);
-#endif
-    mgb_assert(exec_success, "trt execution failed: opr=%s", opr->cname());
-#endif
 }
 
 /* ========================== TensorRTOpr ========================== */
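For context, the TensorRTProfiler attached via setProfiler() above implements TensorRT's nvinfer1::IProfiler interface: after a profiled execution, TensorRT calls reportLayerTime(layer_name, ms) once per layer, and print_layer_times() then dumps the accumulated records. Below is a minimal sketch of such a profiler; it assumes a pre-8.x TensorRT (where reportLayerTime is not yet noexcept), and the member names (profile, rec.first, rec.second) are inferred from the diff above, so treat it as illustrative rather than MegEngine's exact class.

#include <NvInfer.h>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct TensorRTProfilerSketch final : public nvinfer1::IProfiler {
    // (layer name, elapsed milliseconds) records, one per layer per run.
    std::vector<std::pair<std::string, float>> profile;

    // TensorRT invokes this once per layer after each profiled execution.
    void reportLayerTime(const char* layer_name, float ms) override {
        profile.emplace_back(layer_name, ms);
    }

    void print_layer_times() const {
        float total_time = 0.f;
        for (auto&& rec : profile) {
            std::printf("%s: %4.3fms\n", rec.first.c_str(), rec.second);
            total_time += rec.second;
        }
        std::printf("Total time: %4.3fms\n", total_time);
    }
};

Note also the dispatch in the non-profiled branch: under NV_TENSOR_RT_VERSION >= 6001 a TensorRT-built opr is launched asynchronously with enqueueV2(bindings, stream, nullptr), otherwise with the implicit-batch enqueue(batch, bindings, stream, nullptr); the profiled branch instead uses the synchronous execute(batch, bindings), matching the retained comment about the TensorRT documentation.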
src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h

@@ -50,11 +50,11 @@ class TensorRTManager {
     std::vector<void*> m_trt_iobuf;
     TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
     void* m_device_workspace_memory_ptr;
-    bool m_has_profiler;
 
 public:
     void exec(cg::SingleCNOperatorNodeBase* opr, CompNode comp_node_check,
-              nvinfer1::ICudaEngine* engine, size_t batch = 1);
+              nvinfer1::ICudaEngine* engine, size_t batch = 1,
+              bool use_trt_profiler = false);
     void clear_trt_context() { m_context.reset(); }
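Because both appended parameters carry defaults, every pre-existing call site keeps compiling with the old behaviour (batch 1, no profiling). A hypothetical caller, with manager, opr, comp_node and engine as placeholder names (only the exec() signature is from the commit):

// Placeholder names for illustration.
manager.exec(opr, comp_node, engine);           // batch = 1, profiling off
manager.exec(opr, comp_node, engine, 1, true);  // attach TensorRTProfiler and
                                                // print per-layer times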
src/tensorrt/test/tensorrt.cpp

@@ -28,50 +28,6 @@ using namespace mgb;
 using namespace nvinfer1;
 using namespace opr;
 
-TEST(TestOprTensorRT, Profile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-
-    auto p = net.create_trt_network(true);
-    auto y2 = TensorRTOpr::make(
-            TensorRTOpr::to_shared_ptr_builder(p.first),
-            TensorRTOpr::to_shared_ptr_network(p.second),
-            intl::TensorRTGraphFeatureBits::NCHW_FLOAT, {},
-            {net.x0, net.x1})[0];
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-        func->execute();
-        profiler.to_json()->writeto_fpath(
-                output_file("TestOprTensorRT.Profile.FromProfiler.json"));
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-        auto record_obj = *static_cast<json::Object*>(
-                prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr =
-                    *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
-
 TEST(TestOprTensorRT, Basic) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;
src/tensorrt/test/tensorrt_runtime.cpp

@@ -10,7 +10,6 @@
  */
 #include "megbrain/comp_node_env.h"
-#include "megbrain/plugin/profiler.h"
 #include "megbrain/test/autocheck.h"
 #include "megbrain/test/helper.h"
 #include "megbrain/test/megdnn_helper.h"

@@ -102,69 +101,6 @@ TEST(TestOprTensorRT, ConcatRuntimeBasic) {
     MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
 }
 
-TEST(TestOprTensorRT, RuntimeProfile) {
-    REQUIRE_GPU(1);
-    intl::ConcatConvTensorRTNetwork net;
-
-    SymbolVar y2;
-    {
-        auto p = net.create_trt_network(false);
-        TensorRTUniquePtr<INetworkDefinition> trt_net{p.second, {}};
-        TensorRTUniquePtr<IBuilder> builder{p.first, {}};
-        builder->setMaxBatchSize(5);
-#if NV_TENSOR_RT_VERSION >= 6001
-        TensorRTUniquePtr<IBuilderConfig> build_config{
-                builder->createBuilderConfig()};
-        auto cuda_engine =
-                builder->buildEngineWithConfig(*trt_net, *build_config);
-#else
-        auto cuda_engine = builder->buildCudaEngine(*trt_net);
-#endif
-        TensorRTUniquePtr<IHostMemory> mem{cuda_engine->serialize(), {}};
-
-        FILE* fout = fopen(output_file("trt_cuda_engine").c_str(), "wb");
-        auto wr = fwrite(mem->data(), 1, mem->size(), fout);
-        mgb_assert(wr == mem->size());
-        fclose(fout);
-
-        y2 = TensorRTRuntimeOpr::make(
-                TensorRTRuntimeOpr::to_shared_ptr_engine(cuda_engine), {},
-                {net.x0, net.x1})[0];
-    }
-
-    HostTensorND host_z1;
-    HostTensorND host_z2;
-    auto func = net.graph->compile({make_callback_copy(net.y, host_z1),
-                                    make_callback_copy(y2, host_z2)});
-    {
-        mgb::GraphProfiler profiler(net.graph.get());
-        func->execute();
-        profiler.to_json()->writeto_fpath(output_file(
-                "TestOprTensorRT.RuntimeProfile.FromProfiler.json"));
-        auto prof_obj = *static_cast<json::Object*>(profiler.to_json().get());
-        auto record_obj = *static_cast<json::Object*>(
-                prof_obj["opr_internal_pf"].get());
-        auto opr_prof_arr = *static_cast<json::Array*>(
-                record_obj[y2.node()->owner_opr()->id_str()].get());
-        for (auto item_arr : opr_prof_arr.get_impl()) {
-            auto layer_info_arr =
-                    *static_cast<json::Array*>(item_arr.get());
-            auto layer_time =
-                    *static_cast<json::Number*>(layer_info_arr[1].get());
-            mgb_assert(layer_time.get_impl() > 0, "Error occured in json.");
-        }
-        MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-    }
-    // Run it again after profiler is not in existance.
-    func->execute();
-    MGB_ASSERT_TENSOR_NEAR(host_z1, host_z2, 1e-4);
-}
-
 TEST(TestOprTensorRT, RuntimeChangeBatchSize) {
     REQUIRE_GPU(1);
     intl::SimpleTensorRTNetwork net;