Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
5b4f7c5d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
397
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
5b4f7c5d
编写于
8月 02, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
perf(interpreter): unwind ops with make_forward_graph
GitOrigin-RevId: 5fb8c85089f507d31a7e4d8552089660c068a4ad
上级
5798f6ce
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
71 addition
and
33 deletion
+71
-33
imperative/src/impl/interpreter/interpreter_impl.cpp
imperative/src/impl/interpreter/interpreter_impl.cpp
+67
-33
imperative/src/impl/interpreter/interpreter_impl.h
imperative/src/impl/interpreter/interpreter_impl.h
+4
-0
未找到文件。
imperative/src/impl/interpreter/interpreter_impl.cpp
浏览文件 @
5b4f7c5d
...
@@ -137,8 +137,11 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) {
...
@@ -137,8 +137,11 @@ TensorInfo* ChannelImpl::put_impl(const HostTensorND& value, bool no_cache) {
Handle
ChannelImpl
::
put
(
const
DeviceTensorND
&
data
,
const
HostTensorND
&
hvalue
)
{
Handle
ChannelImpl
::
put
(
const
DeviceTensorND
&
data
,
const
HostTensorND
&
hvalue
)
{
MGB_LOCK_GUARD
(
m_spin
);
MGB_LOCK_GUARD
(
m_spin
);
auto
&
state
=
get_channel_state
();
mgb_assert
(
check_available
(),
"Channel already closed"
);
mgb_assert
(
check_available
(),
"Channel already closed"
);
return
put_impl
(
data
,
hvalue
);
}
TensorInfo
*
ChannelImpl
::
put_impl
(
const
DeviceTensorND
&
data
,
const
HostTensorND
&
hvalue
)
{
auto
&
state
=
get_channel_state
();
state
.
scopes
.
push
(
"Put"
);
state
.
scopes
.
push
(
"Put"
);
auto
info
=
alloc
();
auto
info
=
alloc
();
RECORD_EVENT
(
TensorCommandEvent
,
info
->
id
,
TensorCommandEvent
::
Put
);
RECORD_EVENT
(
TensorCommandEvent
,
info
->
id
,
TensorCommandEvent
::
Put
);
...
@@ -335,6 +338,12 @@ SmallVector<Handle> ChannelImpl::apply_op(
...
@@ -335,6 +338,12 @@ SmallVector<Handle> ChannelImpl::apply_op(
const
SmallVector
<
Handle
>&
inputs
)
{
const
SmallVector
<
Handle
>&
inputs
)
{
MGB_LOCK_GUARD
(
m_spin
);
MGB_LOCK_GUARD
(
m_spin
);
mgb_assert
(
check_available
(),
"Channel already closed"
);
mgb_assert
(
check_available
(),
"Channel already closed"
);
return
apply_op_impl
(
std
::
move
(
op
),
inputs
);
}
SmallVector
<
Handle
>
ChannelImpl
::
apply_op_impl
(
std
::
shared_ptr
<
OpDef
>
op
,
const
SmallVector
<
Handle
>&
inputs
)
{
auto
&
state
=
get_channel_state
();
auto
&
state
=
get_channel_state
();
for
(
auto
i
:
inputs
)
{
for
(
auto
i
:
inputs
)
{
mgb_assert
(
m_valid_handle
.
find
(
i
)
!=
m_valid_handle
.
end
(),
mgb_assert
(
m_valid_handle
.
find
(
i
)
!=
m_valid_handle
.
end
(),
...
@@ -610,8 +619,12 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
...
@@ -610,8 +619,12 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
auto
&
state
=
get_worker_state
();
auto
&
state
=
get_worker_state
();
bool
profiling_device
=
Profiler
::
is_profiling
()
&&
Profiler
::
get_option
(
"profile_device"
,
0
);
bool
profiling_device
=
Profiler
::
is_profiling
()
&&
Profiler
::
get_option
(
"profile_device"
,
0
);
uint64_t
apply_id
=
cmd
.
id
;
uint64_t
apply_id
=
cmd
.
id
;
SmallVector
<
TensorPtr
>
tensor_inputs
;
struct
TensorWithDesc
{
SmallVector
<
MemoryDesc
>
input_memory_desc
;
TensorPtr
tensor
;
MemoryDesc
desc
;
};
SmallVector
<
TensorWithDesc
>
inputs
;
// SmallVector<TensorPtr> tensor_inputs;
if
(
state
.
options
.
enable_dtr_auto_drop
)
{
if
(
state
.
options
.
enable_dtr_auto_drop
)
{
m_dtr
.
pin
(
cmd
.
inputs
);
m_dtr
.
pin
(
cmd
.
inputs
);
}
}
...
@@ -621,33 +634,59 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
...
@@ -621,33 +634,59 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
}
m_dtr
.
update_used_time
(
i
);
m_dtr
.
update_used_time
(
i
);
}
}
tensor_inputs
.
reserve
(
cmd
.
inputs
.
size
());
// tensor_inputs.reserve(cmd.inputs.size());
inputs
.
reserve
(
cmd
.
inputs
.
size
());
// refcnt == 1, owners: [TensorInfo::ptr]
// refcnt == 1, owners: [TensorInfo::ptr]
for
(
auto
i
:
cmd
.
inputs
)
{
for
(
auto
i
:
cmd
.
inputs
)
{
mgb_assert
(
i
->
ptr
,
"Invalid input tensor ptr!"
);
mgb_assert
(
i
->
ptr
,
"Invalid input tensor ptr!"
);
mgb_assert
(
i
->
mem_desc
.
id
,
"Invalid input tensor mem desc!"
);
// refcnt ++, owners: [i->ptr, tensor_inputs]
// refcnt ++, owners: [i->ptr, tensor_inputs]
tensor_inputs
.
push_back
(
i
->
ptr
);
//
tensor_inputs.push_back(i->ptr);
input
_memory_desc
.
push_back
(
i
->
mem_desc
);
input
s
.
push_back
({
i
->
ptr
,
i
->
mem_desc
}
);
}
}
if
(
state
.
options
.
enable_dtr_auto_drop
&&
state
.
options
.
dtr_eviction_threshold
>
0
)
{
if
(
state
.
options
.
enable_dtr_auto_drop
&&
state
.
options
.
dtr_eviction_threshold
>
0
)
{
auto_evict
(
0
);
auto_evict
(
0
);
}
}
auto
[
outputs_mem_desc
,
tensor_outputs
,
workspaces
]
=
init_output_and_workspace
(
*
cmd
.
op
,
tensor_inputs
,
input_memory_desc
);
auto
apply_on_physical_tensor
=
[
&
](
auto
&&
self
,
const
OpDef
&
def
,
SmallVector
<
TensorWithDesc
>
inputs
)
->
SmallVector
<
TensorWithDesc
>
{
if
(
outputs_mem_desc
.
size
())
{
auto
apply_functor
=
[
&
](
std
::
shared_ptr
<
OpDef
>
op
,
SmallVector
<
TensorWithDesc
>
inputs
,
size_t
nr_outputs
)
->
SmallVector
<
TensorWithDesc
>
{
for
(
size_t
i
=
0
;
i
<
outputs_mem_desc
.
size
();
i
++
)
{
auto
opname
=
op
->
trait
()
->
make_name
(
*
op
);
if
(
cmd
.
outputs
[
i
])
{
auto
outputs
=
self
(
self
,
*
op
,
inputs
);
cmd
.
outputs
[
i
]
->
mem_desc
=
outputs_mem_desc
[
i
];
return
outputs
;
};
auto
const_functor
=
[
&
](
TensorPtr
value
)
->
TensorWithDesc
{
return
{
value
,
MemoryDesc
{
value
->
layout
(),
0
,
value
->
comp_node
(),
StorageIdentifier
::
make
()}};
};
if
(
def
.
trait
()
->
make_forward_graph
)
{
// apply recursivily
SmallVector
<
LogicalTensorDesc
>
input_descs
;
for
(
auto
&&
input
:
inputs
)
{
input_descs
.
push_back
({{{},
input
.
tensor
->
dtype
()},
input
.
tensor
->
comp_node
()});
}
auto
forward_graph
=
OpDef
::
make_forward_graph
(
def
,
input_descs
);
auto
outputs
=
forward_graph
.
apply
(
inputs
,
apply_functor
,
const_functor
);
return
outputs
;
}
}
SmallVector
<
TensorPtr
>
input_tensors
;
SmallVector
<
MemoryDesc
>
input_descs
;
// size_t next_mem_desc_id = 0;
for
(
auto
&&
input
:
inputs
)
{
input_tensors
.
push_back
(
input
.
tensor
);
input_descs
.
push_back
(
input
.
desc
);
}
}
auto
[
output_descs
,
output_tensors
,
workspaces
]
=
init_output_and_workspace
(
def
,
input_tensors
,
input_descs
);
if
(
!
output_descs
.
empty
())
{
OpDef
::
execute
(
def
,
input_tensors
,
output_tensors
,
workspaces
);
}
else
{
}
else
{
// fail to infer mem plan
output_tensors
=
OpDef
::
apply_on_physical_tensor
(
def
,
input_tensors
);
for
(
auto
&&
out
:
cmd
.
outputs
)
{
for
(
auto
&&
output_tensor
:
output_tensors
)
{
if
(
out
)
{
output_descs
.
push_back
(
MemoryDesc
{
output_tensor
->
layout
(),
0
,
output_tensor
->
comp_node
(),
StorageIdentifier
::
make
()});
out
->
mem_desc
.
id
=
StorageIdentifier
::
make
();
}
}
}
}
SmallVector
<
TensorWithDesc
>
outputs
;
for
(
auto
&&
[
output_tensor
,
output_desc
]
:
ranges
::
zip_view
(
output_tensors
,
output_descs
))
{
outputs
.
push_back
({
output_tensor
,
output_desc
});
}
}
return
outputs
;
};
RECORD_EVENT
(
OpExecuteEvent
,
apply_id
);
RECORD_EVENT
(
OpExecuteEvent
,
apply_id
);
// Begin profiling operator
// Begin profiling operator
SmallVector
<
std
::
pair
<
CompNode
,
uint64_t
>>
kernels
;
SmallVector
<
std
::
pair
<
CompNode
,
uint64_t
>>
kernels
;
...
@@ -686,20 +725,14 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
...
@@ -686,20 +725,14 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
}
// Apply op
// Apply op
// Here std::move is REQUIRED for removing duplicated references.
// Here std::move is REQUIRED for removing duplicated references.
if
(
outputs_mem_desc
.
size
())
{
auto
outputs
=
apply_on_physical_tensor
(
apply_on_physical_tensor
,
*
cmd
.
op
,
inputs
);
OpDef
::
execute
(
*
cmd
.
op
,
std
::
move
(
tensor_inputs
),
tensor_outputs
,
std
::
move
(
workspaces
));
}
else
{
tensor_outputs
=
OpDef
::
apply_on_physical_tensor
(
*
cmd
.
op
,
std
::
move
(
tensor_inputs
));
}
// After execute
// After execute
for
(
auto
&&
[
device
,
kernel_id
]
:
kernels
)
{
for
(
auto
&&
[
device
,
kernel_id
]
:
kernels
)
{
RECORD_EVENT
(
KernelExecuteFinishEvent
,
apply_id
,
kernel_id
,
Timer
::
record_event
(
device
));
RECORD_EVENT
(
KernelExecuteFinishEvent
,
apply_id
,
kernel_id
,
Timer
::
record_event
(
device
));
}
}
// End profiling operator
// End profiling operator
mgb_assert
(
tensor_
outputs
.
size
()
==
cmd
.
outputs
.
size
());
mgb_assert
(
outputs
.
size
()
==
cmd
.
outputs
.
size
());
for
(
size_t
i
=
0
;
i
<
tensor_
outputs
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
outputs
.
size
();
++
i
)
{
auto
output
=
cmd
.
outputs
[
i
];
auto
output
=
cmd
.
outputs
[
i
];
if
(
output
==
nullptr
)
{
if
(
output
==
nullptr
)
{
RECORD_EVENT
(
OpOutputEvent
,
0
);
RECORD_EVENT
(
OpOutputEvent
,
0
);
...
@@ -709,7 +742,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
...
@@ -709,7 +742,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
RECORD_EVENT
(
OpOutputFinishEvent
,
output
->
id
);
RECORD_EVENT
(
OpOutputFinishEvent
,
output
->
id
);
}
else
{
}
else
{
RECORD_EVENT
(
OpOutputEvent
,
output
->
id
);
RECORD_EVENT
(
OpOutputEvent
,
output
->
id
);
produce_tensor
(
output
,
tensor_outputs
[
i
]);
produce_tensor
(
output
,
outputs
[
i
].
tensor
);
output
->
mem_desc
=
outputs
[
i
].
desc
;
RECORD_EVENT
(
OpOutputFinishEvent
,
output
->
id
);
RECORD_EVENT
(
OpOutputFinishEvent
,
output
->
id
);
sample_on_device
(
output
->
desc
.
comp_node
,
false
);
sample_on_device
(
output
->
desc
.
comp_node
,
false
);
}
}
...
@@ -720,8 +754,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
...
@@ -720,8 +754,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
for
(
auto
i
:
cmd
.
inputs
)
{
for
(
auto
i
:
cmd
.
inputs
)
{
estimate_compute_time
+=
i
->
memory
;
estimate_compute_time
+=
i
->
memory
;
}
}
for
(
auto
i
:
tensor_
outputs
)
{
for
(
auto
i
:
outputs
)
{
estimate_compute_time
+=
i
->
blob
()
->
size
();
estimate_compute_time
+=
i
.
tensor
->
blob
()
->
size
();
}
}
m_dtr
.
estimate_timestamp
+=
estimate_compute_time
/
1e8
;
m_dtr
.
estimate_timestamp
+=
estimate_compute_time
/
1e8
;
for
(
auto
i
:
cmd
.
outputs
)
{
for
(
auto
i
:
cmd
.
outputs
)
{
...
@@ -751,7 +785,7 @@ void ChannelImpl::recompute(TensorInfo::ComputePath* path) {
...
@@ -751,7 +785,7 @@ void ChannelImpl::recompute(TensorInfo::ComputePath* path) {
}
}
}
}
bool
ChannelImpl
::
auto_evict
(
size_t
force_num
=
0
)
{
bool
ChannelImpl
::
auto_evict
(
size_t
force_num
)
{
auto
&
state
=
get_worker_state
();
auto
&
state
=
get_worker_state
();
if
(
!
m_dtr
.
comp_node
.
valid
())
{
if
(
!
m_dtr
.
comp_node
.
valid
())
{
return
false
;
return
false
;
...
@@ -884,7 +918,7 @@ void ChannelImpl::alloc_tensor_with_evict(TensorPtr x) {
...
@@ -884,7 +918,7 @@ void ChannelImpl::alloc_tensor_with_evict(TensorPtr x) {
set_log_level
(
pre_level
);
set_log_level
(
pre_level
);
mgb_log_warn
(
"reallocating all cuda memory to alleviate fragmentation, the performance may be affected"
);
mgb_log_warn
(
"reallocating all cuda memory to alleviate fragmentation, the performance may be affected"
);
set_log_level
(
LogLevel
::
NO_LOG
);
set_log_level
(
LogLevel
::
NO_LOG
);
BlobManager
::
inst
()
->
defrag
(
x
->
blob
()
->
comp_node
());
BlobManager
::
inst
()
->
defrag
(
x
->
comp_node
());
BlobManager
::
inst
()
->
alloc_direct
(
x
->
blob
().
get
(),
x
->
blob
()
->
size
());
BlobManager
::
inst
()
->
alloc_direct
(
x
->
blob
().
get
(),
x
->
blob
()
->
size
());
}
}
});
});
...
@@ -914,7 +948,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt
...
@@ -914,7 +948,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt
for
(
size_t
i
=
0
;
i
<
desc
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
desc
.
size
();
i
++
)
{
if
(
desc
[
i
].
id
->
is_sys_alloc
())
{
if
(
desc
[
i
].
id
->
is_sys_alloc
())
{
tensors
.
push_back
(
Tensor
::
make
(
desc
[
i
].
layout
,
desc
[
i
].
cn
));
tensors
.
push_back
(
Tensor
::
make
(
desc
[
i
].
layout
,
desc
[
i
].
cn
));
if
(
!
desc
[
i
].
layout
.
is_empty
()
&&
state
.
options
.
enable_dtr_auto_drop
)
{
if
(
state
.
options
.
enable_dtr_auto_drop
&&
!
desc
[
i
].
layout
.
is_empty
()
)
{
alloc_tensor_with_evict
(
tensors
.
back
());
alloc_tensor_with_evict
(
tensors
.
back
());
}
}
}
else
if
(
desc
[
i
].
id
->
is_from_other
())
{
}
else
if
(
desc
[
i
].
id
->
is_from_other
())
{
...
...
imperative/src/impl/interpreter/interpreter_impl.h
浏览文件 @
5b4f7c5d
...
@@ -85,8 +85,12 @@ private:
...
@@ -85,8 +85,12 @@ private:
void
detach_users
(
TensorInfo
*
);
void
detach_users
(
TensorInfo
*
);
TensorInfo
*
put_impl
(
const
HostTensorND
&
value
,
bool
no_cache
);
TensorInfo
*
put_impl
(
const
HostTensorND
&
value
,
bool
no_cache
);
TensorInfo
*
put_impl
(
const
DeviceTensorND
&
value
,
const
HostTensorND
&
hvalue
);
void
del_impl
(
Handle
);
void
del_impl
(
Handle
);
void
sync_impl
();
void
sync_impl
();
SmallVector
<
Handle
>
apply_op_impl
(
std
::
shared_ptr
<
OpDef
>
op
,
const
SmallVector
<
Handle
>&
inputs
);
TensorPtr
wait_tensor
(
TensorInfo
*
info
,
profiler
::
TensorProp
prop
);
TensorPtr
wait_tensor
(
TensorInfo
*
info
,
profiler
::
TensorProp
prop
);
void
notify_tensor_unsafe
(
TensorInfo
*
info
);
void
notify_tensor_unsafe
(
TensorInfo
*
info
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录