Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
defbc20e
MegEngine
项目概览
MegEngine 天元
/
MegEngine
接近 2 年 前同步成功
通知
414
Star
4708
Fork
583
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
defbc20e
编写于
8月 06, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(profiler): fix profiler in dtr
GitOrigin-RevId: 79b5a6b52615cbda6b4017cc9d34c39d62f09bca
上级
d4c71f92
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
46 additions
and
24 deletions
+46
-24
imperative/src/impl/interpreter/interpreter_impl.cpp
imperative/src/impl/interpreter/interpreter_impl.cpp
+11
-9
imperative/src/impl/interpreter/interpreter_impl.h
imperative/src/impl/interpreter/interpreter_impl.h
+3
-2
imperative/src/impl/profiler/chrome_timeline.cpp
imperative/src/impl/profiler/chrome_timeline.cpp
+2
-4
imperative/src/impl/profiler/events.h
imperative/src/impl/profiler/events.h
+1
-0
imperative/src/impl/profiler/states.h
imperative/src/impl/profiler/states.h
+29
-9
未找到文件。
imperative/src/impl/interpreter/interpreter_impl.cpp
浏览文件 @
defbc20e
...
...
@@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
void
ChannelImpl
::
regenerate
(
TensorInfo
*
dest
)
{
if
(
dest
->
evict_type
==
EvictType
::
DROP
)
{
auto
&&
path
=
dest
->
producer
;
m_apply_stack
.
push
({
ApplyOp
{
path
->
id
,
path
->
op
,
path
->
inputs
,
path
->
outputs
,
{}},
0
,
dest
});
m_apply_stack
.
push
({
ApplyOp
{
path
->
id
,
path
->
op
,
path
->
inputs
,
path
->
outputs
,
{}},
0
,
dest
,
"dtr"
});
if
(
!
m_applying
)
flush_apply_stack
();
}
else
if
(
dest
->
evict_type
==
EvictType
::
SWAP
)
{
MGB_RECORD_EVENT
(
TensorCommandEvent
,
dest
->
id
,
TensorCommandKind
::
ReGen
);
...
...
@@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
}
}
void
ChannelImpl
::
do_apply_op
(
const
ApplyOp
&
cmd
)
{
void
ChannelImpl
::
do_apply_op
(
const
ApplyOp
&
cmd
,
std
::
string
reason
)
{
using
namespace
ranges
;
using
namespace
ranges
::
views
;
auto
&
state
=
get_worker_state
();
...
...
@@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
return
outputs
;
};
MGB_RECORD_EVENT
(
OpExecuteEvent
,
apply_id
);
MGB_RECORD_EVENT
(
OpExecuteEvent
,
apply_id
,
{},
reason
);
// Begin profiling operator
SmallVector
<
std
::
pair
<
CompNode
,
uint64_t
>>
kernels
;
if
(
profiling_device
)
{
...
...
@@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
}
m_dtr
.
unpin
(
cmd
.
inputs
);
}
MGB_RECORD_EVENT
(
OpExecuteFinishEvent
,
apply_id
);
MGB_RECORD_EVENT
(
OpExecuteFinishEvent
,
apply_id
,
{},
reason
);
// End profiling operator
}
...
...
@@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() {
m_applying
=
true
;
auto
&
state
=
get_worker_state
();
while
(
!
m_apply_stack
.
empty
())
{
auto
&
[
cmd
,
idx
,
recomp
]
=
m_apply_stack
.
top
();
// cmd.inputs[0~idx-1] is in memory
auto
&
[
cmd
,
idx
,
recomp
,
reason
]
=
m_apply_stack
.
top
();
// cmd.inputs[0~idx-1] is in memory
if
(
idx
==
0
)
{
if
(
state
.
options
.
enable_dtr_auto_drop
)
{
m_dtr
.
pin
(
cmd
.
inputs
);
...
...
@@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() {
}
if
(
regen
)
continue
;
// the required input tensors are already in memory
auto
cmd_backup
=
cmd
;
auto
recomp_backup
=
recomp
;
auto
[
cmd_backup
,
recomp_backup
,
reason_backup
]
=
std
::
make_tuple
(
cmd
,
recomp
,
reason
);
m_apply_stack
.
pop
();
do_apply_op
(
cmd_backup
);
do_apply_op
(
cmd_backup
,
reason_backup
);
if
(
recomp_backup
)
{
MGB_RECORD_EVENT
(
TensorCommandFinishEvent
,
recomp_backup
->
id
,
TensorCommandKind
::
ReGen
);
for
(
auto
o
:
cmd_backup
.
outputs
)
{
...
...
@@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) {
sample_on_device
(
m_dtr
.
comp_node
,
false
);
auto
best
=
m_dtr
.
find_best_tensor
(
state
.
options
.
enable_dtr_sqrt_sampling
&&
!
force_num
);
if
(
!
best
)
{
MGB_RECORD_EVENT
(
AutoEvictFinishEvent
);
break
;
}
if
(
best
->
ptr
.
unique
()
&&
best
->
ptr
->
blob
().
unique
())
{
...
...
@@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
set_log_level
(
pre_level
);
mgb_log_warn
(
"reallocating all cuda memory to alleviate fragmentation, the performance may be affected"
);
set_log_level
(
LogLevel
::
NO_LOG
);
imperative_log_profile_begin
(
"defrag"
);
BlobManager
::
inst
()
->
defrag
(
x
->
comp_node
());
imperative_log_profile_end
(
"defrag"
);
BlobManager
::
inst
()
->
alloc_direct
(
x
,
x
->
size
());
}
});
...
...
@@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
return
;
}
}
m_apply_stack
.
push
({
cmd
,
0
,
nullptr
});
m_apply_stack
.
push
({
cmd
,
0
,
nullptr
,
"cmd"
});
flush_apply_stack
();
for
(
size_t
i
=
0
;
i
<
cmd
.
outputs
.
size
();
++
i
)
{
auto
output
=
cmd
.
outputs
[
i
];
...
...
imperative/src/impl/interpreter/interpreter_impl.h
浏览文件 @
defbc20e
...
...
@@ -104,8 +104,8 @@ private:
void
release_tensor
(
TensorInfo
*
dest
);
void
regenerate
(
TensorInfo
*
dest
);
void
do_apply_op
(
const
ApplyOp
&
cmd
);
void
flush_apply_stack
();
void
do_apply_op
(
const
ApplyOp
&
cmd
,
std
::
string
reason
);
std
::
tuple
<
SmallVector
<
MemoryDesc
>
,
SmallVector
<
TensorPtr
>
,
SmallVector
<
TensorPtr
>>
init_output_and_workspace
(
const
OpDef
&
def
,
...
...
@@ -150,7 +150,8 @@ private:
std
::
exception_ptr
m_worker_exc
;
std
::
function
<
void
(
std
::
string
,
std
::
string
)
>
m_profile_dump_callback
;
size_t
m_storage_id
=
0
;
std
::
stack
<
std
::
tuple
<
ApplyOp
,
size_t
,
TensorInfo
*>>
m_apply_stack
;
// TODO: use explicit struct
std
::
stack
<
std
::
tuple
<
ApplyOp
,
size_t
,
TensorInfo
*
,
std
::
string
>>
m_apply_stack
;
bool
m_applying
=
false
;
bool
m_closed
=
false
;
...
...
imperative/src/impl/profiler/chrome_timeline.cpp
浏览文件 @
defbc20e
...
...
@@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
ChromeTraceEvent
&
new_host_event
(
std
::
string
name
,
char
ph
)
{
return
trace_events
.
new_event
().
name
(
name
).
ph
(
ph
).
pid
(
pid
).
tid
(
to_tid
(
current
->
tid
)).
ts
(
since_start
(
current
->
time
));
}
;
}
ChromeTraceEvent
&
new_device_event
(
std
::
string
name
,
char
ph
,
CompNode
device
)
{
using
namespace
std
::
literals
::
chrono_literals
;
auto
time
=
since_start
(
to_device_time
(
current
->
time
,
device
));
return
trace_events
.
new_event
().
name
(
name
).
ph
(
ph
).
pid
(
pid
).
tid
(
to_tid
(
device
)).
ts
(
time
);
}
;
}
const
char
*
to_cstr
(
TensorCommandKind
kind
)
{
switch
(
kind
)
{
...
...
@@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
new_host_event
(
"OpDispatch"
,
'E'
).
args
(
current_op
->
detail
());
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
OpExecuteEvent
>
)
{
mgb_assert
(
event
.
op_id
!=
0
);
current_op
->
execute_begin
=
current
->
time
;
new_host_event
(
current_op
->
name
,
'B'
);
new_host_event
(
pid_str
,
't'
)
.
cat
(
"OpDispatch"
)
.
id
(
current_op
->
id
)
.
scope
(
pid_str
);
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
OpExecuteFinishEvent
>
)
{
current_op
->
execute_end
=
current
->
time
;
new_host_event
(
current_op
->
name
,
'E'
)
.
args
(
current_op
->
detail
());
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
KernelLaunchEvent
>
)
{
...
...
imperative/src/impl/profiler/events.h
浏览文件 @
defbc20e
...
...
@@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, {
// Field list for the operator-execution events. The call sites visible in
// this diff record both OpExecuteEvent and OpExecuteFinishEvent with
// (apply_id, {}, reason), so DEF_DUR_EVENT presumably generates that
// begin/finish pair from this single field list -- confirm against the macro
// definition.
DEF_DUR_EVENT
(
OpExecute
,
{
// Id of the operator application; call sites pass `apply_id` here.
uint64_t
op_id
;
// Presumably the devices the operator runs on; the call sites in this diff
// pass an empty list ({}) -- TODO confirm where it is populated.
SmallVector
<
CompNode
>
device_list
;
// Why the operator is being executed. The interpreter passes "cmd" for an
// operator coming from process_one_task and "dtr" for one pushed by
// regenerate() to recompute a dropped tensor.
std
::
string
reason
;
});
DEF_DUR_EVENT
(
KernelLaunch
,
{
...
...
imperative/src/impl/profiler/states.h
浏览文件 @
defbc20e
...
...
@@ -62,8 +62,13 @@ struct ProfileOperatorState {
CompNode
device
;
Trace
trace
;
profiler
::
HostTime
execute_begin
;
profiler
::
HostTime
execute_end
;
// One recorded run of an operator. The state handler shown further down
// appends an entry when an OpExecuteEvent is seen and fills in `end` on the
// matching OpExecuteFinishEvent, so an operator that is executed several
// times keeps one entry per run.
struct
Execution
{
// Copied from the `reason` field of the OpExecuteEvent; the
// KernelLaunchFinishEvent handler compares it against "dtr" when
// accumulating the "dtr_overhead_us" counter.
std
::
string
reason
;
// Host time (current->time) at which the OpExecuteEvent was handled.
profiler
::
HostTime
begin
;
// Host time (current->time) at which the OpExecuteFinishEvent was handled.
profiler
::
HostTime
end
;
};
SmallVector
<
Execution
>
executions
;
nlohmann
::
json
detail
()
{
nlohmann
::
json
args
;
...
...
@@ -285,7 +290,7 @@ public:
op
.
outputs
=
event
.
outputs
;
op
.
trace
=
event
.
trace
;
for
(
auto
&&
output
:
event
.
outputs
)
{
m_tensors
.
at
(
output
)
.
source
=
op
.
id
;
m_tensors
[
output
]
.
source
=
op
.
id
;
}
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
TensorDeclareEvent
>
)
{
auto
&
tensor
=
m_tensors
[
event
.
tensor_id
];
...
...
@@ -293,7 +298,7 @@ public:
tensor
.
id
=
event
.
tensor_id
;
tensor
.
name
=
event
.
name
;
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
TensorProduceEvent
>
)
{
auto
&
tensor
=
m_tensors
.
at
(
event
.
tensor_id
)
;
auto
&
tensor
=
m_tensors
[
event
.
tensor_id
]
;
if
(
!
m_device_tid_table
.
count
(
event
.
device
))
{
m_device_tid_table
[
event
.
device
]
=
{
m_device_tid_table
.
size
()
+
m_host_tid_table
.
size
()};
}
...
...
@@ -308,15 +313,24 @@ public:
using
T
=
std
::
decay_t
<
decltype
(
event
)
>
;
// update current_op/tensor
if
constexpr
(
is_op_event
<
T
>::
value
)
{
current_op
=
&
m_operators
.
at
(
event
.
op_id
);
current_op
=
&
m_operators
[
event
.
op_id
];
if
(
current_op
->
id
==
0
)
{
current_op
->
id
=
event
.
op_id
;
current_op
->
name
=
"UnknownOperator"
;
}
}
else
if
constexpr
(
is_tensor_event
<
T
>::
value
)
{
mgb_assert
(
m_tensors
.
count
(
event
.
tensor_id
)
!=
0
,
"tensor not found"
);
current_tensor
=
&
m_tensors
.
at
(
event
.
tensor_id
);
current_tensor
=
&
m_tensors
[
event
.
tensor_id
];
if
(
current_tensor
->
id
==
0
)
{
current_tensor
->
id
=
event
.
tensor_id
;
current_tensor
->
name
=
"UnknownTensor"
;
}
}
if
constexpr
(
std
::
is_same_v
<
T
,
OpExecuteEvent
>
)
{
current_op
->
execute_begin
=
current
->
time
;
current_op
->
executions
.
emplace_back
();
current_op
->
executions
.
back
().
reason
=
event
.
reason
;
current_op
->
executions
.
back
().
begin
=
current
->
time
;
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
OpExecuteFinishEvent
>
)
{
current_op
->
execute_end
=
current
->
time
;
current_op
->
executions
.
back
().
end
=
current
->
time
;
}
// update counters
if
constexpr
(
std
::
is_same_v
<
T
,
OpDispatchEvent
>
)
{
...
...
@@ -337,6 +351,12 @@ public:
}
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
WorkerExceptionEvent
>
)
{
inc_counter
(
"nr_exception"
,
1
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
KernelLaunchFinishEvent
>
)
{
auto
&
execution
=
current_op
->
executions
.
back
();
if
(
execution
.
reason
==
"dtr"
)
{
auto
overhead
=
to_device_time
(
current
->
time
,
event
.
device
)
-
to_device_time
(
execution
.
begin
,
event
.
device
);
inc_counter
(
"dtr_overhead_us"
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
overhead
).
count
());
}
}
// visit_event_impl
self
.
visit_event
(
event
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录