Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
d2bbbb30
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
d2bbbb30
编写于
12月 08, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mge/profiler): add overall model performance metrics
GitOrigin-RevId: 1394dedc265f898a79cca236475eb7d680c58950
上级
61f3b2e6
变更
11
显示空白变更内容
内联
并排
Showing
11 changed file
with
228 addition
and
6 deletion
+228
-6
imperative/python/megengine/utils/profiler.py
imperative/python/megengine/utils/profiler.py
+8
-0
imperative/python/src/tensor.cpp
imperative/python/src/tensor.cpp
+4
-0
imperative/src/impl/interpreter/commands.h
imperative/src/impl/interpreter/commands.h
+8
-1
imperative/src/impl/interpreter/interpreter_impl.cpp
imperative/src/impl/interpreter/interpreter_impl.cpp
+11
-0
imperative/src/impl/interpreter/interpreter_impl.h
imperative/src/impl/interpreter/interpreter_impl.h
+1
-0
imperative/src/impl/profiler.cpp
imperative/src/impl/profiler.cpp
+6
-0
imperative/src/impl/profiler/chrome_timeline.cpp
imperative/src/impl/profiler/chrome_timeline.cpp
+42
-0
imperative/src/impl/profiler/events.h
imperative/src/impl/profiler/events.h
+2
-0
imperative/src/impl/profiler/states.h
imperative/src/impl/profiler/states.h
+141
-5
imperative/src/include/megbrain/imperative/interpreter.h
imperative/src/include/megbrain/imperative/interpreter.h
+1
-0
imperative/src/include/megbrain/imperative/profiler.h
imperative/src/include/megbrain/imperative/profiler.h
+4
-0
未找到文件。
imperative/python/megengine/utils/profiler.py
浏览文件 @
d2bbbb30
...
...
@@ -18,6 +18,7 @@ from ..core._imperative_rt.core2 import (
set_python_backtrace_enabled
,
start_profile
,
stop_profile
,
stop_step
,
sync
,
)
from
..logger
import
get_logger
...
...
@@ -134,6 +135,13 @@ class Profiler(ContextDecorator):
_living_profilers
.
add
(
self
)
set_python_backtrace_enabled
(
self
.
_origin_enable_bt
)
def
step
(
self
):
global
_running_profiler
assert
_running_profiler
is
not
None
stop_step
()
return
self
def
dump
(
self
):
if
self
.
_dump_callback
is
not
None
:
if
not
os
.
path
.
exists
(
self
.
_path
):
...
...
imperative/python/src/tensor.cpp
浏览文件 @
d2bbbb30
...
...
@@ -966,6 +966,10 @@ void init_tensor(py::module m) {
results
=
nullptr
;
};
});
m
.
def
(
"stop_step"
,
[
channel
]()
{
imperative
::
Profiler
::
stop_step
();
channel
->
stop_step
();
});
m
.
def
(
"enable_cupti"
,
&
cupti
::
enable
);
m
.
def
(
"disable_cupti"
,
&
cupti
::
disable
);
m
.
def
(
"cupti_available"
,
&
cupti
::
available
);
...
...
imperative/src/impl/interpreter/commands.h
浏览文件 @
d2bbbb30
...
...
@@ -116,6 +116,13 @@ struct StopProfile {
const
char
*
get_name
()
const
{
return
"StopProfile"
;
}
};
struct
StopStep
{
template
<
typename
TFunctor
>
void
get_props
(
TFunctor
&&
functor
)
const
{}
const
char
*
get_name
()
const
{
return
"StopStep"
;
}
};
struct
PushScope
{
std
::
string
scope_name
;
...
...
@@ -162,7 +169,7 @@ struct StopRegen {
using
CommandData
=
std
::
variant
<
Put
,
ApplyOp
,
Del
,
GetValue
,
Drop
,
SetOption
,
StartProfile
,
StopProfile
,
PushScope
,
PopScope
,
StartRegen
,
StopRegen
>
;
StopStep
,
PushScope
,
PopScope
,
StartRegen
,
StopRegen
>
;
struct
Command
{
uint64_t
id
;
...
...
imperative/src/impl/interpreter/interpreter_impl.cpp
浏览文件 @
d2bbbb30
...
...
@@ -1339,6 +1339,8 @@ void ChannelImpl::process_one_task(Command& icmd) {
CompNode
::
foreach
(
[
&
](
CompNode
device
)
{
sample_on_device
(
device
,
true
);
});
MGB_RECORD_EVENT
(
StopProfileFinishEvent
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
StopStep
>
)
{
MGB_RECORD_EVENT
(
StopStepEvent
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
PushScope
>
)
{
MGB_RECORD_EVENT
(
ScopeEvent
,
cmd
.
scope_name
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
PopScope
>
)
{
...
...
@@ -1436,6 +1438,15 @@ void ChannelImpl::stop_profile() {
}
}
void
ChannelImpl
::
stop_step
()
{
MGB_LOCK_GUARD
(
m_spin
);
assert_available
();
mgb_assert
(
Profiler
::
is_profiling
()
==
true
,
"Profiler isn't profiling!"
);
m_worker
.
add_task
(
{
Profiler
::
next_id
(),
StopStep
{},
get_channel_state
().
stack_manager
.
dump
()});
}
void
ChannelImpl
::
push_scope
(
std
::
string
name
)
{
MGB_LOCK_GUARD
(
m_spin
);
assert_available
();
...
...
imperative/src/impl/interpreter/interpreter_impl.h
浏览文件 @
d2bbbb30
...
...
@@ -63,6 +63,7 @@ struct ChannelImpl : Interpreter::Channel, NonCopyableObj, NonMoveableObj {
void
start_profile
()
override
;
void
stop_profile
()
override
;
void
stop_step
()
override
;
void
push_scope
(
std
::
string
)
override
;
void
pop_scope
(
std
::
string
)
override
;
...
...
imperative/src/impl/profiler.cpp
浏览文件 @
d2bbbb30
...
...
@@ -14,6 +14,7 @@
#include "./op_trait.h"
#include "./profiler/formats.h"
#include "./profiler/states.h"
namespace
mgb
{
namespace
imperative
{
...
...
@@ -54,6 +55,11 @@ void Profiler::stop_profile() {
sm_profiling
=
false
;
}
void
Profiler
::
stop_step
()
{
mgb_assert
(
sm_profiling
);
MGB_RECORD_EVENT
(
profiler
::
StopStepEvent
);
}
auto
Profiler
::
get_thread_dict
()
->
thread_dict_t
{
thread_dict_t
thread_dict
;
for
(
auto
&&
[
tid
,
profiler
]
:
sm_profilers
)
{
...
...
imperative/src/impl/profiler/chrome_timeline.cpp
浏览文件 @
d2bbbb30
...
...
@@ -151,6 +151,12 @@ public:
return
m_content
.
back
();
}
ChromeTraceEvent
&
new_event_front
()
{
ChromeTraceEvent
new_event
;
m_content
.
insert
(
m_content
.
begin
(),
new_event
);
return
m_content
.
front
();
}
std
::
string
&
metadata
(
std
::
string
key
)
{
return
m_metadata
[
key
];
}
nlohmann
::
json
to_json
()
const
{
...
...
@@ -196,6 +202,12 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
since_start
(
time
));
}
ChromeTraceEvent
&
new_event_front
(
std
::
string
name
,
char
ph
,
size_t
tid
,
profiler
::
HostTime
time
)
{
return
trace_events
.
new_event_front
().
name
(
name
).
ph
(
ph
).
pid
(
pid
).
tid
(
tid
).
ts
(
since_start
(
time
));
}
ChromeTraceEvent
&
new_host_event
(
std
::
string
name
,
char
ph
)
{
return
trace_events
.
new_event
()
.
name
(
name
)
...
...
@@ -205,6 +217,17 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
.
ts
(
since_start
(
current
->
time
));
}
ChromeTraceEvent
&
new_host_event
(
std
::
string
name
,
char
ph
,
int
tid
,
profiler
::
Duration
time
)
{
return
trace_events
.
new_event
().
name
(
name
).
ph
(
ph
).
pid
(
pid
).
tid
(
tid
).
ts
(
time
);
}
ChromeTraceEvent
&
new_host_event_front
(
std
::
string
name
,
char
ph
,
int
tid
,
profiler
::
Duration
time
)
{
return
trace_events
.
new_event_front
().
name
(
name
).
ph
(
ph
).
pid
(
pid
).
tid
(
tid
).
ts
(
time
);
}
ChromeTraceEvent
&
new_cupti_event
(
std
::
string
name
,
char
ph
,
cupti
::
stream_t
stream
,
cupti
::
time_point
timestamp
)
{
...
...
@@ -363,6 +386,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
new_host_event
(
"StopProfile"
,
'B'
);
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
StopProfileFinishEvent
>
)
{
new_host_event
(
"StopProfile"
,
'E'
);
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
StopStepEvent
>
)
{
new_host_event
(
"StopStep"
,
'i'
);
}
else
if
constexpr
(
std
::
is_same_v
<
TEvent
,
CustomEvent
>
)
{
new_host_event
(
event
.
title
,
'B'
);
if
(
event
.
device
.
valid
())
{
...
...
@@ -498,6 +523,23 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> {
void
dump_chrome_timeline
(
std
::
string
filename
,
Profiler
::
bundle_t
result
)
{
ChromeTimelineEventVisitor
visitor
{};
visitor
.
process_events
(
result
);
auto
gpu_usage_ratio
=
(
double
)(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
visitor
.
gpu_usage_time
()).
count
()
*
100
)
/
(
double
)(
visitor
.
get_total_train_time
().
count
());
visitor
.
new_host_event
(
"gpu_usgae_ratio"
,
'C'
,
0
,
visitor
.
last_kernel_finish
())
.
arg
(
"value"
,
gpu_usage_ratio
);
visitor
.
new_host_event_front
(
"gpu_usgae_ratio"
,
'C'
,
0
,
std
::
chrono
::
microseconds
(
0
))
.
arg
(
"value"
,
gpu_usage_ratio
);
auto
train_time_ratio
=
(
double
)(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
visitor
.
gpu_usage_time_with_gap
()).
count
()
*
100
)
/
(
double
)(
visitor
.
get_total_train_time
().
count
());
visitor
.
new_host_event
(
"train_time_ratio"
,
'C'
,
0
,
visitor
.
last_kernel_finish
())
.
arg
(
"value"
,
train_time_ratio
);
visitor
.
new_host_event_front
(
"train_time_ratio"
,
'C'
,
0
,
std
::
chrono
::
microseconds
(
0
))
.
arg
(
"value"
,
train_time_ratio
);
visitor
.
name_threads
(
result
.
thread_dict
);
auto
trace_events
=
std
::
move
(
visitor
.
trace_events
);
trace_events
.
metadata
(
"localTime"
)
=
...
...
imperative/src/impl/profiler/events.h
浏览文件 @
d2bbbb30
...
...
@@ -172,6 +172,8 @@ DEF_DUR_EVENT(HostToDevice, {
void
*
device_ptr
;
});
DEF_EVENT
(
StopStep
,
{
CompNode
device
;
});
// cupti events
DEF_EVENT
(
CUPTITimestamp
,
{
cupti
::
clock
::
time_point
timestamp
;
});
...
...
imperative/src/impl/profiler/states.h
浏览文件 @
d2bbbb30
#pragma once
#include <algorithm>
#include <chrono>
#include <ctime>
#include <any>
#include <set>
#include <sstream>
...
...
@@ -187,6 +191,18 @@ private:
std
::
unordered_map
<
std
::
string
,
int64_t
>
m_counter_table
;
std
::
optional
<
std
::
pair
<
profiler
::
HostTime
,
cupti
::
time_point
>>
m_cupti_timestamp
=
{};
// Record the start and end time of all kernels
std
::
vector
<
std
::
vector
<
profiler
::
HostTime
>>
m_kernel_start_finish_time
;
// The sum of all kernel execution times
profiler
::
Duration
m_gpu_usage_time
=
std
::
chrono
::
microseconds
(
0
);
// The sum of all kernel execution times, including the gap time between kernels
// within a step
profiler
::
Duration
m_gpu_usage_time_with_gap
=
std
::
chrono
::
microseconds
(
0
);
// Record the end time of each step
std
::
vector
<
profiler
::
HostTime
>
m_step_finish_time
;
// Record the start time of the first kernel and the end time of the last kernel in
// each step
std
::
vector
<
std
::
vector
<
profiler
::
HostTime
>>
m_step_first_last_kernel
;
protected:
Profiler
::
Record
*
current
;
...
...
@@ -288,9 +304,9 @@ public:
SampleDeviceEvent
,
SampleDeviceFinishEvent
,
WorkerExceptionEvent
,
ShapeInferEvent
,
SyncEvent
,
SyncFinishEvent
,
StartProfileEvent
,
StartProfileFinishEvent
,
StopProfileEvent
,
StopProfileFinishEvent
,
TensorCommandEvent
,
TensorCommandFinishEvent
,
AutoEvict
Event
,
AutoEvict
FinishEvent
,
CustomEvent
,
CustomFinishEvent
,
RecordDevice
Event
,
ScopeEvent
,
ScopeFinishEvent
,
HostToDeviceEvent
,
StopStepEvent
,
TensorCommandEvent
,
TensorCommandFinish
Event
,
AutoEvict
Event
,
AutoEvictFinishEvent
,
CustomEvent
,
CustomFinish
Event
,
RecordDeviceEvent
,
ScopeEvent
,
ScopeFinishEvent
,
HostToDeviceEvent
,
HostToDeviceFinishEvent
,
CUPTITimestampEvent
,
CUPTIKernelLaunchEvent
,
CUPTIKernelLaunchFinishEvent
,
CUPTIKernelExecuteEvent
,
CUPTIMemcpyLaunchEvent
,
CUPTIMemcpyLaunchFinishEvent
,
CUPTIMemcpyEvent
,
...
...
@@ -335,6 +351,15 @@ public:
}
});
// record step end time
for_each_entry
([
&
](
auto
&&
event
)
{
using
T
=
std
::
decay_t
<
decltype
(
event
)
>
;
if
constexpr
(
std
::
is_same_v
<
T
,
StopStepEvent
>
)
{
auto
step_time
=
current
->
time
;
m_step_finish_time
.
push_back
(
to_device_time
(
step_time
,
event
.
device
));
}
});
// register host threads
for_each_entry
([
&
](
auto
&&
event
)
{
if
(
!
m_host_tid_table
.
count
(
current
->
tid
))
{
...
...
@@ -443,9 +468,17 @@ public:
inc_counter
(
"nr_exception"
,
1
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
KernelLaunchFinishEvent
>
)
{
auto
&
execution
=
current_op
->
executions
.
back
();
if
(
execution
.
reason
==
"dtr"
)
{
auto
overhead
=
to_device_time
(
current
->
time
,
event
.
device
)
-
to_device_time
(
execution
.
begin
,
event
.
device
);
std
::
vector
<
profiler
::
HostTime
>
current_kernel_start_finish
;
current_kernel_start_finish
.
emplace_back
(
to_device_time
(
execution
.
begin
,
event
.
device
));
current_kernel_start_finish
.
emplace_back
(
to_device_time
(
current
->
time
,
event
.
device
));
m_kernel_start_finish_time
.
emplace_back
(
current_kernel_start_finish
);
if
(
execution
.
reason
==
"dtr"
)
{
inc_counter
(
"dtr_overhead_us"
,
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
...
...
@@ -463,6 +496,109 @@ public:
}
});
}
profiler
::
Duration
last_kernel_finish
()
{
if
(
m_kernel_start_finish_time
.
size
()
==
0
)
{
return
profiler
::
Duration
::
zero
();
}
return
m_kernel_start_finish_time
.
back
()[
1
]
-
m_start_time
;
}
// get GPU busy time (union of calculation time and communication time)
profiler
::
Duration
gpu_usage_time
()
{
if
(
m_kernel_start_finish_time
.
size
()
==
0
)
{
return
profiler
::
Duration
::
zero
();
}
std
::
sort
(
m_kernel_start_finish_time
.
begin
(),
m_kernel_start_finish_time
.
end
(),
[
&
](
std
::
vector
<
profiler
::
HostTime
>
kernel1
,
std
::
vector
<
profiler
::
HostTime
>
kernel2
)
{
if
(
kernel1
[
0
]
!=
kernel2
[
0
])
{
return
kernel1
[
0
]
<
kernel2
[
0
];
}
return
kernel1
[
1
]
<
kernel2
[
1
];
});
HostTime
current_start
=
profiler
::
HostTime
::
min
();
HostTime
current_end
=
profiler
::
HostTime
::
min
();
for
(
size_t
i
=
0
;
i
<
m_kernel_start_finish_time
.
size
();
++
i
)
{
if
(
current_start
==
profiler
::
HostTime
::
min
())
{
current_start
=
m_kernel_start_finish_time
[
i
][
0
];
current_end
=
m_kernel_start_finish_time
[
i
][
1
];
}
else
if
(
current_end
<
m_kernel_start_finish_time
[
i
][
0
])
{
m_gpu_usage_time
+=
profiler
::
Duration
(
current_end
-
current_start
);
current_start
=
m_kernel_start_finish_time
[
i
][
0
];
current_end
=
m_kernel_start_finish_time
[
i
][
1
];
}
else
if
(
current_end
>
m_kernel_start_finish_time
[
i
][
0
])
{
current_end
=
max
(
current_end
,
m_kernel_start_finish_time
[
i
][
1
]);
}
}
m_gpu_usage_time
+=
profiler
::
Duration
(
current_end
-
current_start
);
return
m_gpu_usage_time
;
}
// compared to gpu_usage_time, this method adds gap time between kernels in the same
// step
profiler
::
Duration
gpu_usage_time_with_gap
()
{
if
(
m_step_finish_time
.
empty
())
{
return
profiler
::
Duration
::
zero
();
}
std
::
sort
(
m_step_finish_time
.
begin
(),
m_step_finish_time
.
end
(),
[
&
](
HostTime
time1
,
HostTime
time2
)
{
return
time1
<
time2
;
});
std
::
sort
(
m_kernel_start_finish_time
.
begin
(),
m_kernel_start_finish_time
.
end
(),
[
&
](
std
::
vector
<
profiler
::
HostTime
>
kernel1
,
std
::
vector
<
profiler
::
HostTime
>
kernel2
)
{
if
(
kernel1
[
0
]
!=
kernel2
[
0
])
{
return
kernel1
[
0
]
<
kernel2
[
0
];
}
return
kernel1
[
1
]
<
kernel2
[
1
];
});
int
cur_step
=
0
;
auto
kernel_num
=
m_kernel_start_finish_time
.
size
();
for
(
size_t
i
=
0
;
i
<
kernel_num
;
++
i
)
{
// Record the start time of the first kernel and the end time of the last
// kernel of the current step
std
::
vector
<
profiler
::
HostTime
>
step_begin_end_time
;
step_begin_end_time
.
emplace_back
(
m_kernel_start_finish_time
[
i
][
0
]);
size_t
j
=
i
;
while
(
j
<
kernel_num
&&
m_kernel_start_finish_time
[
j
][
0
]
<
m_step_finish_time
[
cur_step
+
1
])
{
++
j
;
}
step_begin_end_time
.
emplace_back
(
m_kernel_start_finish_time
[
j
-
1
][
1
]);
mgb_assert
(
step_begin_end_time
.
size
()
==
2
,
"step_begin_end_time.size() should be exactly 2!"
);
m_step_first_last_kernel
.
emplace_back
(
step_begin_end_time
);
i
=
j
-
1
;
++
cur_step
;
}
for
(
size_t
i
=
0
;
i
<
m_step_first_last_kernel
.
size
();
++
i
)
{
m_gpu_usage_time_with_gap
+=
profiler
::
Duration
(
m_step_first_last_kernel
[
i
][
1
]
-
m_step_first_last_kernel
[
i
][
0
]);
}
return
m_gpu_usage_time_with_gap
;
}
// from the start time of the first kernel of the first step to the end time of the
// last kernel of the last step
std
::
chrono
::
microseconds
get_total_train_time
()
{
if
(
m_kernel_start_finish_time
.
size
()
==
0
)
{
return
std
::
chrono
::
microseconds
(
1
);
}
return
std
::
chrono
::
duration_cast
<
std
::
chrono
::
microseconds
>
(
m_kernel_start_finish_time
.
back
()[
1
]
-
m_kernel_start_finish_time
.
front
()[
0
]);
}
};
}
// namespace mgb::imperative::profiler
imperative/src/include/megbrain/imperative/interpreter.h
浏览文件 @
d2bbbb30
...
...
@@ -61,6 +61,7 @@ struct Interpreter {
virtual
void
start_profile
()
=
0
;
virtual
void
stop_profile
()
=
0
;
virtual
void
stop_step
()
=
0
;
virtual
void
push_scope
(
std
::
string
name
)
=
0
;
virtual
void
pop_scope
(
std
::
string
name
)
=
0
;
...
...
imperative/src/include/megbrain/imperative/profiler.h
浏览文件 @
d2bbbb30
...
...
@@ -37,6 +37,7 @@ class Timer {
public:
using
Time
=
profiler
::
Time
;
static
profiler
::
Time
record_host
();
static
profiler
::
Time
record_device_time
(
CompNode
device
);
static
std
::
shared_ptr
<
CompNode
::
Event
>
record_device
(
CompNode
device
);
};
...
...
@@ -77,6 +78,7 @@ private:
std
::
atomic
<
Status
>
m_status
=
Running
;
static
std
::
vector
<
entry_t
>
sm_records
;
static
std
::
vector
<
profiler
::
HostTime
>
sm_step_time
;
static
options_t
sm_profile_options
;
static
std
::
mutex
sm_mutex
;
// assume std::thread::id is unique
...
...
@@ -187,6 +189,8 @@ public:
static
void
stop_profile
();
static
void
stop_step
();
static
thread_dict_t
get_thread_dict
();
static
void
dump_profile
(
std
::
string
basename
,
std
::
string
format
,
bundle_t
result
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录