Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
1d64792b
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
1d64792b
编写于
5月 15, 2021
作者:
M
Megvii Engine Team
提交者:
huangxinda
7月 19, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor(profiler): detach profiler from interpreter
GitOrigin-RevId: f3954728d1dd8e93e2eb5a94ee5f3a030a54fb5a
上级
f2027b8d
变更
15
展开全部
隐藏空白更改
内联
并排
Showing
15 changed file
with
1003 addition
and
716 deletion
+1003
-716
imperative/python/megengine/utils/profiler.py
imperative/python/megengine/utils/profiler.py
+171
-41
imperative/python/src/tensor.cpp
imperative/python/src/tensor.cpp
+17
-2
imperative/python/test/integration/test_profiler.py
imperative/python/test/integration/test_profiler.py
+10
-6
imperative/src/impl/interpreter/commands.h
imperative/src/impl/interpreter/commands.h
+7
-8
imperative/src/impl/interpreter/interpreter_impl.cpp
imperative/src/impl/interpreter/interpreter_impl.cpp
+278
-195
imperative/src/impl/interpreter/interpreter_impl.h
imperative/src/impl/interpreter/interpreter_impl.h
+103
-19
imperative/src/impl/interpreter/profiler.h
imperative/src/impl/interpreter/profiler.h
+0
-93
imperative/src/impl/interpreter/tensor_info.h
imperative/src/impl/interpreter/tensor_info.h
+28
-34
imperative/src/impl/profiler.cpp
imperative/src/impl/profiler.cpp
+39
-28
imperative/src/impl/profiler/chrome_timeline.cpp
imperative/src/impl/profiler/chrome_timeline.cpp
+0
-145
imperative/src/impl/profiler/events.h
imperative/src/impl/profiler/events.h
+186
-0
imperative/src/impl/profiler/formats.h
imperative/src/impl/profiler/formats.h
+5
-15
imperative/src/impl/profiler/states.h
imperative/src/impl/profiler/states.h
+3
-0
imperative/src/include/megbrain/imperative/interpreter.h
imperative/src/include/megbrain/imperative/interpreter.h
+2
-2
imperative/src/include/megbrain/imperative/profiler.h
imperative/src/include/megbrain/imperative/profiler.h
+154
-128
未找到文件。
imperative/python/megengine/utils/profiler.py
浏览文件 @
1d64792b
...
...
@@ -7,9 +7,14 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import
json
from
contextlib
import
contextmanager
import
os
import
re
from
contextlib
import
ContextDecorator
,
contextmanager
from
functools
import
wraps
from
typing
import
List
from
weakref
import
WeakSet
from
..
import
_atexit
from
..core._imperative_rt.core2
import
(
pop_scope
,
push_scope
,
...
...
@@ -17,9 +22,13 @@ from ..core._imperative_rt.core2 import (
stop_profile
,
sync
,
)
from
..logger
import
get_logger
_running_profiler
=
None
_living_profilers
=
WeakSet
()
class
Profiler
:
class
Profiler
(
ContextDecorator
):
r
"""
Profile graph execution in imperative mode.
...
...
@@ -35,9 +44,10 @@ class Profiler:
from megengine.utils.profiler import Profiler
# With Learnable Parameters
profiler = Profiler()
for iter in range(0, 10):
# Only profile record of last iter would be saved
with
Profiler("profile")
:
with
profiler
:
# your code here
# Then open the profile file in chrome timeline window
...
...
@@ -45,46 +55,105 @@ class Profiler:
CHROME_TIMELINE
=
"chrome_timeline.json"
COMMAND
=
1
<<
0
OPERATOR
=
1
<<
1
TENSOR_LIFETIME
=
1
<<
2
TENSOR_PROP
=
1
<<
3
SYNC
=
1
<<
4
SCOPE
=
1
<<
5
ALL
=
(
1
<<
6
)
-
1
valid_options
=
{
"sample_rate"
:
0
,
"profile_device"
:
1
,
"num_tensor_watch"
:
10
}
valid_formats
=
{
"chrome_timeline.json"
,
"memory_flow.svg"
}
def
__init__
(
self
,
path
:
str
=
"profile"
,
format
:
str
=
CHROME_TIMELINE
,
*
,
topic
=
OPERATOR
|
SCOPE
,
align_time
=
True
,
show_operator_name
=
True
format
:
str
=
"chrome_timeline.json"
,
formats
:
List
[
str
]
=
None
,
**
kwargs
)
->
None
:
self
.
_path
=
path
self
.
_format
=
format
self
.
_options
=
{
"topic"
:
int
(
topic
),
"align_time"
:
int
(
align_time
),
"show_operator_name"
:
int
(
show_operator_name
),
}
if
not
formats
:
formats
=
[
format
]
def
__enter__
(
self
):
assert
not
isinstance
(
formats
,
str
),
"formats excepts list, got str"
for
format
in
formats
:
assert
format
in
Profiler
.
valid_formats
,
"unsupported format {}"
.
format
(
format
)
self
.
_path
=
path
self
.
_formats
=
formats
self
.
_options
=
{}
for
opt
,
optval
in
Profiler
.
valid_options
.
items
():
self
.
_options
[
opt
]
=
int
(
kwargs
.
pop
(
opt
,
optval
))
self
.
_pid
=
"<PID>"
@
property
def
path
(
self
):
if
len
(
self
.
_formats
)
==
0
:
format
=
"<FORMAT>"
elif
len
(
self
.
_formats
)
==
1
:
format
=
self
.
_formats
[
0
]
else
:
format
=
"{"
+
","
.
join
(
self
.
_formats
)
+
"}"
return
self
.
format_path
(
self
.
_path
,
self
.
_pid
,
format
)
@
property
def
directory
(
self
):
return
self
.
_path
@
property
def
formats
(
self
):
return
list
(
self
.
_formats
)
def
start
(
self
):
global
_running_profiler
assert
_running_profiler
is
None
_running_profiler
=
self
self
.
_pid
=
os
.
getpid
()
start_profile
(
self
.
_options
)
return
self
def
__exit__
(
self
,
val
,
tp
,
trace
):
stop_profile
(
self
.
_path
,
self
.
_format
)
# dump is async, so it's necessary to sync interpreter
def
stop
(
self
):
global
_running_profiler
assert
_running_profiler
is
self
_running_profiler
=
None
sync
()
self
.
_dump_callback
=
stop_profile
()
self
.
_pid
=
os
.
getpid
()
_living_profilers
.
add
(
self
)
def
dump
(
self
):
if
self
.
_dump_callback
is
not
None
:
if
not
os
.
path
.
exists
(
self
.
_path
):
os
.
makedirs
(
self
.
_path
)
if
not
os
.
path
.
isdir
(
self
.
_path
):
get_logger
().
warning
(
"{} is not a directory, cannot write profiling results"
.
format
(
self
.
_path
)
)
return
for
format
in
self
.
_formats
:
path
=
self
.
format_path
(
self
.
_path
,
self
.
_pid
,
format
)
get_logger
().
info
(
"process {} generating {}"
.
format
(
self
.
_pid
,
format
))
self
.
_dump_callback
(
path
,
format
)
get_logger
().
info
(
"profiling results written to {}"
.
format
(
path
))
self
.
_dump_callback
=
None
_living_profilers
.
remove
(
self
)
def
format_path
(
self
,
path
,
pid
,
format
):
return
os
.
path
.
join
(
path
,
"{}.{}"
.
format
(
pid
,
format
))
def
__enter__
(
self
):
self
.
start
()
def
__exit__
(
self
,
val
,
tp
,
trace
):
self
.
stop
()
def
__call__
(
self
,
func
):
def
wrapper
(
*
args
,
**
kwargs
):
with
self
:
return
func
(
*
args
,
**
kwargs
)
func
=
super
().
__call__
(
func
)
func
.
__profiler__
=
self
return
func
return
wrapper
def
__del__
(
self
):
self
.
dump
()
@
contextmanager
...
...
@@ -94,16 +163,77 @@ def scope(name):
pop_scope
(
name
)
profile
=
Profiler
def
profile
(
*
args
,
**
kwargs
):
if
len
(
args
)
==
1
and
len
(
kwargs
)
==
0
and
callable
(
args
[
0
]):
return
Profiler
()(
args
[
0
])
return
Profiler
(
*
args
,
**
kwargs
)
def
merge_trace_events
(
directory
:
str
):
names
=
filter
(
lambda
x
:
re
.
match
(
r
"\d+\.chrome_timeline\.json"
,
x
),
os
.
listdir
(
directory
)
)
def
load_trace_events
(
name
):
with
open
(
os
.
path
.
join
(
directory
,
name
),
"r"
,
encoding
=
"utf-8"
)
as
f
:
return
json
.
load
(
f
)
def
find_metadata
(
content
):
if
isinstance
(
content
,
dict
):
assert
"traceEvents"
in
content
content
=
content
[
"traceEvents"
]
if
len
(
content
)
==
0
:
return
None
assert
content
[
0
][
"name"
]
==
"Metadata"
return
content
[
0
][
"args"
]
contents
=
list
(
map
(
load_trace_events
,
names
))
metadata_list
=
list
(
map
(
find_metadata
,
contents
))
min_local_time
=
min
(
map
(
lambda
x
:
x
[
"localTime"
],
filter
(
lambda
x
:
x
is
not
None
,
metadata_list
))
)
events
=
[]
for
content
,
metadata
in
zip
(
contents
,
metadata_list
):
local_events
=
content
[
"traceEvents"
]
if
len
(
local_events
)
==
0
:
continue
local_time
=
metadata
[
"localTime"
]
time_shift
=
local_time
-
min_local_time
for
event
in
local_events
:
if
"ts"
in
event
:
event
[
"ts"
]
=
int
(
event
[
"ts"
]
+
time_shift
)
events
.
extend
(
filter
(
lambda
x
:
x
[
"name"
]
!=
"Metadata"
,
local_events
))
result
=
{
"traceEvents"
:
events
,
}
path
=
os
.
path
.
join
(
directory
,
"merge.chrome_timeline.json"
)
with
open
(
path
,
"w"
)
as
f
:
json
.
dump
(
result
,
f
,
ensure_ascii
=
False
,
separators
=
(
","
,
":"
))
get_logger
().
info
(
"profiling results written to {}"
.
format
(
path
))
def
is_profiling
():
return
_running_profiler
is
not
None
def
_stop_current_profiler
():
global
_running_profiler
if
_running_profiler
is
not
None
:
_running_profiler
.
stop
()
living_profilers
=
[
*
_living_profilers
]
for
profiler
in
living_profilers
:
profiler
.
dump
()
def
merge_trace_events
(
sources
:
List
[
str
],
target
:
str
):
names
=
list
(
map
(
lambda
x
:
x
+
".chrome_timeline.json"
,
sources
))
result
=
[]
for
name
in
names
:
with
open
(
name
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
content
=
json
.
load
(
f
)
for
entry
in
content
:
result
.
append
(
entry
)
with
open
(
target
+
".chrome_timeline.json"
,
"w"
)
as
f
:
json
.
dump
(
result
,
f
,
ensure_ascii
=
False
,
indent
=
4
)
_atexit
(
_stop_current_profiler
)
imperative/python/src/tensor.cpp
浏览文件 @
1d64792b
...
...
@@ -13,6 +13,7 @@
#include "megbrain/common.h"
#include "megbrain/imperative/ops/utility.h"
#include "megbrain/imperative/ops/backward_graph.h"
#include "megbrain/imperative/profiler.h"
#include "megbrain/opr/io.h"
#include "./tensor.h"
...
...
@@ -927,9 +928,23 @@ void init_tensor(py::module m) {
m
.
def
(
"pop_scope"
,
[](
std
::
string
name
)
{
interpreter_for_py
->
pop_scope
(
name
);
});
m
.
def
(
"start_profile"
,
[](
std
::
unordered_map
<
std
::
string
,
int
>
option
)
{
return
interpreter_for_py
->
start_profile
(
option
);
});
[](
imperative
::
Profiler
::
options_t
options
)
{
interpreter_for_py
->
sync
();
imperative
::
Profiler
::
load_options
(
std
::
move
(
options
));
imperative
::
Profiler
::
start_profile
();
interpreter_for_py
->
start_profile
();
});
m
.
def
(
"stop_profile"
,
[](
std
::
string
basename
,
std
::
string
format
)
{
interpreter_for_py
->
stop_profile
(
basename
,
format
);
});
[]()
->
std
::
function
<
void
(
std
::
string
,
std
::
string
)
>
{
interpreter_for_py
->
stop_profile
();
interpreter_for_py
->
sync
();
imperative
::
Profiler
::
stop_profile
();
auto
results
=
imperative
::
Profiler
::
collect
();
auto
options
=
imperative
::
Profiler
::
get_options
();
return
[
results
=
std
::
move
(
results
),
options
=
std
::
move
(
options
)](
std
::
string
basename
,
std
::
string
format
){
imperative
::
Profiler
::
dump_profile
(
basename
,
format
,
results
,
options
);
};
});
m
.
def
(
"sync"
,
[]()
{
interpreter_for_py
->
sync
();
...
...
imperative/python/test/integration/test_profiler.py
浏览文件 @
1d64792b
...
...
@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
import
json
import
os
import
tempfile
import
pytest
...
...
@@ -28,15 +29,18 @@ class Simple(Module):
def
test_profiler
():
profile_prefix
=
"pytest_profile"
tempdir
=
tempfile
.
NamedTemporaryFile
()
profile_prefix
=
tempdir
.
name
profile_format
=
"chrome_timeline.json"
profile_path
=
"{}.{}"
.
format
(
profile_prefix
,
profile_format
)
with
Profiler
(
profile_prefix
,
format
=
profile_format
):
with
scope
(
"my_scope"
):
oup
=
Simple
()(
tensor
([
1.23
],
dtype
=
"float32"
))
profile_path
=
os
.
path
.
join
(
profile_prefix
,
"{}.{}"
.
format
(
os
.
getpid
(),
profile_format
)
)
with
option
(
"enable_host_compute"
,
0
):
with
Profiler
(
profile_prefix
,
format
=
profile_format
):
with
scope
(
"my_scope"
):
oup
=
Simple
()(
tensor
([
1.23
],
dtype
=
"float32"
))
with
open
(
profile_path
,
"r"
)
as
f
:
events
=
json
.
load
(
f
)
os
.
remove
(
profile_path
)
prev_ts
=
{}
scope_count
=
0
for
event
in
events
:
...
...
imperative/src/impl/interpreter/commands.h
浏览文件 @
1d64792b
...
...
@@ -13,11 +13,14 @@
#include <string>
#include <variant>
#include <unordered_set>
#include "megbrain/tensor.h"
#include "megbrain/imperative/op_def.h"
#include "megbrain/imperative/utils/to_string.h"
#include "./tensor_info.h"
namespace
mgb
::
imperative
{
namespace
interpreter
::
intl
{
...
...
@@ -43,7 +46,7 @@ struct Put {
};
struct
ApplyOp
{
uint64_t
id
;
uint64_t
id
;
//used by profiler to identify unique apply
std
::
shared_ptr
<
OpDef
>
op
;
SmallVector
<
TensorInfo
*>
inputs
;
SmallVector
<
TensorInfo
*>
outputs
;
...
...
@@ -143,7 +146,7 @@ struct SetOption {
};
struct
StartProfile
{
InterpreterProfiler
*
profiler
;
std
::
unordered_set
<
TensorInfo
*>
capture_tensors
;
template
<
typename
TFunctor
>
void
get_props
(
TFunctor
&&
functor
)
const
{}
...
...
@@ -154,14 +157,10 @@ struct StartProfile {
};
struct
StopProfile
{
std
::
string
basename
;
std
::
string
format
;
std
::
unordered_set
<
TensorInfo
*>
escape_tensors
;
template
<
typename
TFunctor
>
void
get_props
(
TFunctor
&&
functor
)
const
{
functor
(
"basename"
,
basename
);
functor
(
"format"
,
format
);
}
void
get_props
(
TFunctor
&&
functor
)
const
{}
const
char
*
get_name
()
const
{
return
"StopProfile"
;
...
...
imperative/src/impl/interpreter/interpreter_impl.cpp
浏览文件 @
1d64792b
此差异已折叠。
点击以展开。
imperative/src/impl/interpreter/interpreter_impl.h
浏览文件 @
1d64792b
...
...
@@ -24,10 +24,10 @@
#include "megbrain/imperative/profiler.h"
#include "./commands.h"
#include "./events.h"
#include "./tensor_info.h"
#include "./option_manager.h"
#include "./profiler.h"
#include "../profiler/events.h"
namespace
mgb
::
imperative
::
interpreter
::
intl
{
...
...
@@ -37,7 +37,6 @@ struct InterpreterImpl : Interpreter {
std
::
unique_ptr
<
Channel
>
create_channel
()
override
;
};
struct
ChannelImpl
:
Interpreter
::
Channel
{
ChannelImpl
();
~
ChannelImpl
()
override
;
...
...
@@ -67,19 +66,27 @@ struct ChannelImpl : Interpreter::Channel {
size_t
get_option
(
std
::
string
name
)
override
;
void
set_option
(
std
::
string
name
,
size_t
value
)
override
;
void
start_profile
(
std
::
unordered_map
<
std
::
string
,
int
>
option
)
override
;
void
stop_profile
(
std
::
string
basename
,
std
::
string
format
)
override
;
void
start_profile
()
override
;
void
stop_profile
()
override
;
void
push_scope
(
std
::
string
)
override
;
void
pop_scope
(
std
::
string
)
override
;
private:
struct
WorkQueue
;
struct
State
;
TensorInfo
*
alloc
();
void
init
(
TensorInfo
*
,
LogicalTensorDesc
desc
);
void
free
(
TensorInfo
*
);
void
real_free
(
TensorInfo
*
);
void
recursive_free
(
TensorInfo
*
);
void
do_drop
(
TensorInfo
*
,
bool
);
void
detach_users
(
TensorInfo
*
);
TensorInfo
*
put_impl
(
const
HostTensorND
&
value
,
bool
no_cache
);
TensorPtr
wait_tensor
(
TensorInfo
*
info
,
profiler
::
TensorProp
prop
);
void
notify_tensor_unsafe
(
TensorInfo
*
info
);
void
process_one_task
(
IdentifiedCommand
&
);
void
check_worker_exc_unsafe
();
...
...
@@ -105,24 +112,31 @@ private:
bool
check_available
();
void
push_scope
(
std
::
string
,
State
&
);
void
pop_scope
(
std
::
string
,
State
&
);
void
assert_in_channel
();
void
assert_in_worker
();
std
::
thread
::
id
get_worker_tid
();
void
sync_device_scope
(
CompNode
device
);
template
<
typename
TCommand
>
void
enqueue_command
(
TCommand
&&
cmd
)
{
m_buffer
.
enqueue
(
Command
{
std
::
forward
<
TCommand
>
(
cmd
)});
}
void
sample_on_device
(
CompNode
device
,
bool
force
);
// valid => status != Deleted
std
::
unordered_set
<
TensorInfo
*>
collect_valid_tensors
();
std
::
mutex
m_mutex
;
std
::
condition_variable
m_cv
;
MemPool
<
TensorInfo
>
m_pool
;
std
::
unordered_set
<
Handle
>
m_valid_handle
;
TensorInfo
*
m_waitee
=
nullptr
;
uint64_t
m_waitee_id
=
0
;
std
::
exception_ptr
m_worker_exc
;
std
::
atomic_uint64_t
m_last_id
=
0
;
std
::
function
<
void
(
std
::
string
,
std
::
string
)
>
m_profile_dump_callback
;
bool
m_closed
=
false
;
...
...
@@ -191,27 +205,98 @@ private:
//! level 0: both sync.
int
m_async_level
=
2
;
struct
State
{
OptionManager
options
;
std
::
vector
<
std
::
string
>
scopes
;
std
::
unique_ptr
<
InterpreterProfiler
>
profiler
;
struct
Scope
{
std
::
string
name
;
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
Scope
>>
children
;
size_t
version
=
0
;
size_t
parent_version
=
0
;
size_t
tensor_count
=
0
;
Scope
*
active_child
=
nullptr
;
Scope
*
parent
=
nullptr
;
Scope
*
enter
(
std
::
string
name
)
{
auto
&
child
=
children
[
name
];
if
(
!
child
)
{
child
=
std
::
make_unique
<
Scope
>
();
child
->
name
=
name
;
child
->
parent
=
this
;
}
if
(
version
!=
child
->
parent_version
)
{
child
->
version
=
0
;
child
->
parent_version
=
version
;
}
else
{
child
->
version
++
;
}
child
->
tensor_count
=
0
;
return
active_child
=
child
.
get
();
}
State
()
{
profiler
=
std
::
make_unique
<
InterpreterProfiler
>
();
Scope
*
exit
(
std
::
string
name
)
{
mgb_assert
(
this
->
name
==
name
,
"scope name mismatch"
);
parent
->
active_child
=
nullptr
;
return
parent
;
}
};
struct
ChannelState
:
State
{};
class
ScopeManager
{
private:
Scope
m_root
;
Scope
*
m_current_scope
=
&
m_root
;
public:
class
ScopeGuard
{
private:
ScopeManager
*
m_manager
;
std
::
string
m_name
;
public:
ScopeGuard
(
ScopeManager
*
manager
,
std
::
string
name
)
:
m_manager
{
manager
},
m_name
{
name
}
{
m_manager
->
push
(
m_name
);
}
~
ScopeGuard
()
{
m_manager
->
pop
(
m_name
);
}
};
void
push
(
std
::
string
name
)
{
m_current_scope
=
m_current_scope
->
enter
(
name
);
}
void
pop
(
std
::
string
name
)
{
m_current_scope
=
m_current_scope
->
exit
(
name
);
}
std
::
string
next_tensor_name
()
{
std
::
string
builder
;
Scope
*
scope
=
&
m_root
;
while
(
true
)
{
builder
.
append
(
scope
->
name
);
if
(
scope
->
version
!=
0
)
{
builder
.
append
(
ssprintf
(
"(%ld)"
,
scope
->
version
));
}
if
(
scope
!=
&
m_root
)
{
builder
.
append
(
"."
);
}
if
(
scope
->
active_child
==
nullptr
)
{
builder
.
append
(
ssprintf
(
":%%%ld"
,
scope
->
tensor_count
++
));
break
;
}
else
{
scope
=
scope
->
active_child
;
}
}
return
builder
;
}
};
struct
WorkerState
:
State
{
struct
State
{
std
::
thread
::
id
tid
;
CompNode
::
UnorderedMap
<
std
::
vector
<
std
::
string
>>
device_scope_map
;
OptionManager
options
;
};
struct
ChannelState
:
State
{
ScopeManager
scopes
;
};
struct
WorkerState
:
State
{};
ChannelState
m_channel_state
;
WorkerState
m_worker_state
;
/*!
* \brief A framework of dynamic sublienar memory optimization
*
...
...
@@ -327,7 +412,6 @@ private:
// assert thread id when call get_xxx_state to avoid misuse
ChannelState
&
get_channel_state
();
WorkerState
&
get_worker_state
();
};
}
// namespace mgb::imperative::interpreter::intl
imperative/src/impl/interpreter/profiler.h
已删除
100644 → 0
浏览文件 @
f2027b8d
/**
* \file imperative/src/impl/interpreter/profiler.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megbrain/imperative/profiler.h"
#include "./commands.h"
#include "./events.h"
#include "./option_manager.h"
namespace
mgb
::
imperative
::
interpreter
::
intl
{
class
InterpreterProfiler
:
public
Profiler
<
CommandEnqueueEvent
,
CommandExecuteEvent
,
CommandFinishEvent
,
OpExecuteEvent
,
OpExecuteFinishEvent
,
KernelExecuteEvent
,
KernelExecuteFinishEvent
,
TensorDeclareEvent
,
TensorProduceEvent
,
TensorEraseEvent
,
TensorGetPropEvent
,
TensorWaitPropEvent
,
TensorNotifyPropEvent
,
TensorWaitPropFinishEvent
,
SyncEvent
,
SyncFinishEvent
,
ScopeEvent
,
ScopeFinishEvent
,
DeviceScopeEvent
,
DeviceScopeFinishEvent
>
{
public:
enum
Topic
{
Command
=
0b000001
,
Operator
=
0b000010
,
TensorLifetime
=
0b000100
,
TensorProp
=
0b001000
,
Sync
=
0b010000
,
Scope
=
0b100000
,
};
struct
Option
{
Topic
topic
;
bool
align_time
;
bool
show_operator_name
;
static
Option
from_dict
(
std
::
unordered_map
<
std
::
string
,
int
>
dict
)
{
Option
option
;
option
.
topic
=
Topic
(
dict
.
at
(
"topic"
));
option
.
align_time
=
bool
(
dict
.
at
(
"align_time"
));
option
.
show_operator_name
=
bool
(
dict
.
at
(
"show_operator_name"
));
return
option
;
}
};
Option
get_option
()
const
{
return
m_option
;
}
void
set_option
(
const
Option
&
option
)
{
m_option
=
option
;
}
static
Mask
topic_to_mask
(
Topic
topic
)
{
Mask
result
;
if
(
topic
&
Command
)
{
result
|=
mask_of
<
CommandEnqueueEvent
,
CommandExecuteEvent
,
CommandFinishEvent
>
();
}
if
(
topic
&
Operator
)
{
result
|=
mask_of
<
OpExecuteEvent
,
OpExecuteFinishEvent
>
();
result
|=
mask_of
<
KernelExecuteEvent
,
KernelExecuteFinishEvent
>
();
}
if
(
topic
&
TensorLifetime
)
{
result
|=
mask_of
<
TensorDeclareEvent
,
TensorProduceEvent
,
TensorEraseEvent
>
();
}
if
(
topic
&
TensorProp
)
{
result
|=
mask_of
<
TensorGetPropEvent
,
TensorWaitPropEvent
,
TensorNotifyPropEvent
,
TensorWaitPropFinishEvent
>
();
}
if
(
topic
&
Sync
)
{
result
|=
mask_of
<
SyncEvent
,
SyncFinishEvent
>
();
}
if
(
topic
&
Scope
)
{
result
|=
mask_of
<
ScopeEvent
,
ScopeFinishEvent
>
();
result
|=
mask_of
<
DeviceScopeEvent
,
DeviceScopeFinishEvent
>
();
}
return
result
;
}
private:
Option
m_option
;
};
}
imperative/src/impl/interpreter/tensor_info.h
浏览文件 @
1d64792b
...
...
@@ -27,19 +27,19 @@ enum EvictType {
/*!
* \brief an identifier to specify a component of evicted tensors
*
*
* Each component tracks the sum of the compute costs of its elements, with the
* union of two components having the sum of each constituent cost.
*/
struct
DsuNode
{
DsuNode
(
double
_t
)
:
t
(
_t
)
{}
std
::
shared_ptr
<
DsuNode
>
parent
;
bool
is_root
()
{
return
!
bool
(
parent
);
}
double
t
;
};
...
...
@@ -47,25 +47,33 @@ struct TensorInfo;
using
TensorInfoPtr
=
std
::
shared_ptr
<
TensorInfo
>
;
struct
TensorInfo
{
enum
Prop
{
Device
,
Shape
,
DType
,
DevValue
,
HostValue
enum
Status
{
InvalidStatus
,
Allocated
,
Produced
,
Swapped
,
Dropped
,
Deleted
,
};
uint64_t
id
;
uint64_t
id
=
-
1
;
std
::
string
name
;
// Most attrs of TensorInfo, except `ptr` and `h_value`,
// were visited read and written in main thread.
// Lock interpreter when visiting `ptr`.
TensorPtr
ptr
;
LogicalTensorDesc
desc
;
double
compute_time
;
size_t
memory
;
double
last_used_time
;
// FIXME: broken by drop
bool
value_fetched
=
false
;
bool
invalid
=
false
;
bool
allow_delete
=
false
;
EvictType
evict_type
=
NONE
;
// Status should be only modified in worker thread
Status
status
=
InvalidStatus
;
// Used by HostCompute and Memory Swap.
// HostCompute and Swap does not happen in one thread.
// Maybe a barrier is needed.
HostTensorND
h_value
;
// reserved for auto drop
...
...
@@ -74,6 +82,10 @@ struct TensorInfo {
size_t
ref_cnt
=
0
;
std
::
shared_ptr
<
DsuNode
>
dsu_ptr
;
// Not reference count, inc when used as input
size_t
ptr_use_count
=
0
;
// Used by `Drop` action
struct
ComputePath
{
uint64_t
id
;
std
::
shared_ptr
<
OpDef
>
op
;
...
...
@@ -111,7 +123,7 @@ struct TensorInfo {
return
path
;
}
}
*
producer
=
nullptr
;
double
eval_func
(
double
cost
,
double
free_mem
,
double
cur_time
,
double
param_cost
,
double
param_mem
,
double
param_time
,
double
param_recompute_times
)
{
return
pow
(
cost
+
1e-3
,
param_cost
)
*
pow
(
param_recompute_times
,
(
double
)
recompute_times
)
...
...
@@ -126,20 +138,24 @@ struct TensorInfo {
--
pinned
;
}
void
detach_producer
()
{
// returns true if producer is deleted
bool
detach_producer
()
{
if
(
!
producer
)
{
return
;
return
false
;
}
auto
output
=
std
::
find
(
producer
->
outputs
.
begin
(),
producer
->
outputs
.
end
(),
this
);
mgb_assert
(
output
!=
producer
->
outputs
.
end
());
*
output
=
nullptr
;
bool
deleted
=
false
;
if
(
producer
->
ref_cnt
()
==
0
)
{
for
(
auto
*
input
:
producer
->
unique_inputs
)
{
input
->
users
.
erase
(
std
::
find
(
input
->
users
.
begin
(),
input
->
users
.
end
(),
producer
));
}
delete
producer
;
deleted
=
true
;
}
producer
=
nullptr
;
return
deleted
;
}
bool
size_exceeds_thd
(
size_t
thd
)
{
...
...
@@ -150,26 +166,4 @@ struct TensorInfo {
};
}
template
<
>
struct
ToStringTrait
<
interpreter
::
intl
::
TensorInfo
::
Prop
>
{
using
TensorInfo
=
interpreter
::
intl
::
TensorInfo
;
std
::
string
operator
()(
TensorInfo
::
Prop
prop
)
const
{
switch
(
prop
)
{
case
TensorInfo
::
DType
:
return
"dtype"
;
case
TensorInfo
::
DevValue
:
return
"dev_value"
;
case
TensorInfo
::
Device
:
return
"device"
;
case
TensorInfo
::
HostValue
:
return
"host_value"
;
case
TensorInfo
::
Shape
:
return
"shape"
;
default:
return
"unknown"
;
}
}
};
}
imperative/src/impl/profiler.cpp
浏览文件 @
1d64792b
...
...
@@ -22,47 +22,58 @@
#include "./event_pool.h"
#include "./op_trait.h"
#include "./profiler/formats.h"
namespace
mgb
{
namespace
imperative
{
namespace
{
DeviceTimer
::
SharedEvent
alloc_recorded_event
(
CompNode
device
)
{
auto
event
=
EventPool
::
with_timer
().
alloc_shared
(
device
);
event
->
record
();
return
event
;
uint64_t
Timer
::
get_nsecs
()
{
using
namespace
std
::
chrono
;
auto
finish
=
steady_clock
::
now
();
auto
duration
=
duration_cast
<
nanoseconds
>
(
finish
-
m_start
);
return
duration
.
count
();
}
}
// namespace
DeviceTimer
::
SharedEvent
DeviceTimer
::
get_device_time
(
CompNode
device
)
{
return
alloc_recorded_event
(
device
);
uint64_t
Timer
::
get_started_at
()
{
return
m_started_at
;
}
SmallVector
<
DeviceTimer
::
SharedEvent
>
DeviceTimer
::
get_all
(
SmallVector
<
CompNode
>
device_list
)
{
SmallVector
<
DeviceTimer
::
SharedEvent
>
results
;
for
(
auto
&&
device
:
device_list
)
{
results
.
push_back
(
alloc_recorded_event
(
device
));
}
return
results
;
void
Timer
::
reset
()
{
using
namespace
std
::
chrono
;
m_start
=
steady_clock
::
now
();
auto
now_ns
=
duration_cast
<
nanoseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
());
m_started_at
=
now_ns
.
count
();
}
double
HostTimer
::
get_msecs
()
{
using
namespace
std
::
chrono
;
auto
finish
=
steady_clock
::
now
();
auto
duration
=
duration_cast
<
microseconds
>
(
finish
-
m_start
);
return
(
double
)
duration
.
count
()
/
1e3
;
std
::
shared_ptr
<
CompNode
::
Event
>
Timer
::
record_event
(
CompNode
device
)
{
auto
event
=
EventPool
::
with_timer
().
alloc_shared
(
device
);
event
->
record
();
return
event
;
}
double
HostTimer
::
get_started_at
()
{
return
m_started_at
;
Profiler
::
options_t
Profiler
::
sm_profile_options
;
std
::
mutex
Profiler
::
sm_mutex
;
std
::
unordered_map
<
std
::
thread
::
id
,
Profiler
*>
Profiler
::
sm_profilers
;
Timer
Profiler
::
sm_timer
;
std
::
atomic_uint64_t
Profiler
::
sm_last_id
=
0
;
bool
Profiler
::
sm_profiling
=
false
;
thread_local
std
::
unique_ptr
<
Profiler
>
Profiler
::
tm_profiler
=
std
::
make_unique
<
Profiler
>
();
std
::
atomic_size_t
Profiler
::
sm_preferred_capacity
;
auto
Profiler
::
get_thread_dict
()
->
thread_dict_t
{
MGB_LOCK_GUARD
(
sm_mutex
);
thread_dict_t
thread_dict
;
for
(
auto
&&
[
tid
,
profiler
]
:
sm_profilers
)
{
thread_dict
[
tid
]
=
profiler
->
m_thread_name
;
}
return
thread_dict
;
}
void
HostTimer
::
reset
(
)
{
using
namespace
std
::
chrono
;
m_start
=
steady_clock
::
now
();
auto
now_us
=
duration_cast
<
microseconds
>
(
std
::
chrono
::
system_clock
::
now
().
time_since_epoch
());
m_started_at
=
(
double
)(
now_us
.
count
())
/
1e3
;
void
Profiler
::
dump_profile
(
std
::
string
basename
,
std
::
string
format
,
results_t
results
,
options_t
options
)
{
auto
thread_dict
=
get_thread_dict
()
;
{
mgb_log_error
(
"unsupported profiling format %s"
,
format
.
c_str
());
}
}
}
// namespace imperative
...
...
imperative/src/impl/profiler/chrome_timeline.cpp
已删除
100644 → 0
浏览文件 @
f2027b8d
#include <string>
#include <memory>
#include "megbrain/utils/json.h"
namespace
mgb
{
namespace
imperative
{
class
ChromeTraceEvent
{
public:
ChromeTraceEvent
&
name
(
std
::
string
name
)
{
m_name
=
std
::
move
(
name
);
return
*
this
;
}
ChromeTraceEvent
&
tid
(
uint64_t
tid
)
{
m_tid
=
std
::
move
(
tid
);
return
*
this
;
}
ChromeTraceEvent
&
cat
(
std
::
string
cat
)
{
m_cat
=
std
::
move
(
cat
);
return
*
this
;
}
ChromeTraceEvent
&
pid
(
uint64_t
pid
)
{
m_pid
=
pid
;
return
*
this
;
}
ChromeTraceEvent
&
id
(
uint64_t
id
)
{
m_id
=
id
;
return
*
this
;
}
ChromeTraceEvent
&
idx
(
uint64_t
idx
)
{
m_idx
=
idx
;
return
*
this
;
}
ChromeTraceEvent
&
ts
(
double
ts
)
{
m_ts
=
ts
;
return
*
this
;
}
ChromeTraceEvent
&
dur
(
double
dur
)
{
m_dur
=
dur
;
return
*
this
;
}
ChromeTraceEvent
&
ph
(
char
ph
)
{
m_ph
=
ph
;
return
*
this
;
}
ChromeTraceEvent
&
bp
(
char
bp
)
{
m_bp
=
bp
;
return
*
this
;
}
ChromeTraceEvent
&
args
(
std
::
shared_ptr
<
json
::
Object
>
args
)
{
m_args
=
std
::
move
(
args
);
return
*
this
;
}
ChromeTraceEvent
&
arg
(
std
::
string
key
,
std
::
string
value
)
{
if
(
!
m_args
)
{
m_args
=
json
::
Object
::
make
();
}
(
*
m_args
)[
key
]
=
json
::
String
::
make
(
value
);
return
*
this
;
}
ChromeTraceEvent
&
arg
(
std
::
string
key
,
double
value
)
{
if
(
!
m_args
)
{
m_args
=
json
::
Object
::
make
();
}
(
*
m_args
)[
key
]
=
json
::
Number
::
make
(
value
);
return
*
this
;
}
ChromeTraceEvent
&
arg
(
std
::
string
key
,
std
::
shared_ptr
<
json
::
Value
>
value
)
{
if
(
!
m_args
)
{
m_args
=
json
::
Object
::
make
();
}
(
*
m_args
)[
key
]
=
value
;
return
*
this
;
}
std
::
shared_ptr
<
json
::
Object
>
to_json
()
const
{
auto
result
=
json
::
Object
::
make
();
auto
prop_str
=
[
&
](
auto
key
,
auto
value
)
{
if
(
value
.
empty
())
{
return
;
}
(
*
result
)[
key
]
=
json
::
String
::
make
(
value
);
};
auto
prop_num
=
[
&
](
auto
key
,
auto
value
)
{
if
(
!
value
)
{
return
;
}
(
*
result
)[
key
]
=
json
::
Number
::
make
(
value
.
value
());
};
auto
prop_char
=
[
&
](
auto
key
,
auto
value
)
{
if
(
!
value
)
{
return
;
}
(
*
result
)[
key
]
=
json
::
String
::
make
(
std
::
string
{}
+
value
.
value
());
};
prop_str
(
"name"
,
m_name
);
prop_num
(
"tid"
,
m_tid
);
prop_str
(
"cat"
,
m_cat
);
prop_num
(
"pid"
,
m_pid
);
prop_num
(
"id"
,
m_id
);
prop_num
(
"idx"
,
m_idx
);
prop_num
(
"ts"
,
m_ts
);
prop_num
(
"dur"
,
m_dur
);
prop_char
(
"ph"
,
m_ph
);
prop_char
(
"bp"
,
m_bp
);
if
(
m_args
)
{
(
*
result
)[
"args"
]
=
m_args
;
}
return
result
;
}
private:
std
::
string
m_name
;
std
::
string
m_cat
;
std
::
optional
<
uint64_t
>
m_tid
;
std
::
optional
<
uint64_t
>
m_pid
;
std
::
optional
<
uint64_t
>
m_id
;
std
::
optional
<
uint64_t
>
m_idx
;
std
::
optional
<
double
>
m_ts
;
std
::
optional
<
double
>
m_dur
;
std
::
optional
<
char
>
m_ph
;
std
::
optional
<
char
>
m_bp
;
std
::
shared_ptr
<
json
::
Object
>
m_args
;
};
class
ChromeTraceEventList
{
public:
ChromeTraceEvent
&
new_event
()
{
m_content
.
emplace_back
();
return
m_content
.
back
();
}
std
::
shared_ptr
<
json
::
Array
>
to_json
()
const
{
auto
result
=
json
::
Array
::
make
();
for
(
auto
&&
event
:
m_content
)
{
result
->
add
(
event
.
to_json
());
}
return
result
;
}
private:
std
::
vector
<
ChromeTraceEvent
>
m_content
;
};
}
// namespace imperative
}
// namespace mgb
imperative/src/impl/
interpret
er/events.h
→
imperative/src/impl/
profil
er/events.h
浏览文件 @
1d64792b
...
...
@@ -11,65 +11,176 @@
#pragma once
#include "./commands.h"
#include "./tensor_info.h"
#include "megbrain/utils/small_vector.h"
namespace
mgb
::
imperative
::
interpreter
::
intl
{
#include "../op_trait.h"
namespace
mgb
::
imperative
::
profiler
{
enum
class
TensorProp
{
InvalidProp
,
Device
,
Shape
,
DType
,
DevValue
,
HostValue
,
};
using
OpParams
=
std
::
unordered_map
<
std
::
string
,
std
::
string
>
;
}
namespace
mgb
::
imperative
{
template
<
>
struct
ToStringTrait
<
profiler
::
TensorProp
>
{
using
TensorProp
=
profiler
::
TensorProp
;
std
::
string
operator
()(
TensorProp
prop
)
const
{
switch
(
prop
)
{
case
TensorProp
::
DType
:
return
"dtype"
;
case
TensorProp
::
DevValue
:
return
"dev_value"
;
case
TensorProp
::
Device
:
return
"device"
;
case
TensorProp
::
HostValue
:
return
"host_value"
;
case
TensorProp
::
Shape
:
return
"shape"
;
default:
return
"unknown"
;
}
}
};
}
namespace
mgb
::
imperative
::
profiler
{
#define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__;
#define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__;
DEF_EVENT
(
Command
,
{
IdentifiedCommand
icmd
;
DEF_EVENT
(
OpDispatch
,
{
uint64_t
op_id
;
std
::
string
op_name
;
std
::
function
<
OpParams
()
>
op_params
;
SmallVector
<
uint64_t
>
inputs
;
SmallVector
<
uint64_t
>
outputs
;
});
DEF_DUR_EVENT
(
OpInput
,
{
uint64_t
tensor_id
;
TensorShape
shape
;
});
DEF_DUR_EVENT
(
OpDel
,
{
uint64_t
tensor_id
;
TensorShape
shape
;
});
DEF_DUR_EVENT
(
OpOutput
,
{
uint64_t
tensor_id
;
TensorShape
shape
;
});
DEF_EVENT
(
CommandEnqueue
,
:
CommandEvent
{});
DEF_EVENT
(
CommandExecute
,
:
CommandEvent
{});
DEF_EVENT
(
CommandFinish
,
:
CommandEvent
{});
DEF_DUR_EVENT
(
OpExecute
,
{
uint64_t
id
;
std
::
shared_ptr
<
OpDef
>
op
;
SmallVector
<
uint64_t
>
inputs
;
SmallVector
<
uint64_t
>
outputs
;
uint64_t
op_id
;
});
DEF_DUR_EVENT
(
OpPostExecute
,
{
uint64_t
op_id
;
});
DEF_DUR_EVENT
(
KernelExecute
,
{
uint64_t
id
;
std
::
shared_ptr
<
OpDef
>
op
;
SmallVector
<
uint64_t
>
inputs
;
SmallVector
<
uint64_t
>
outputs
;
uint64_t
op_id
;
uint64_t
kernel_id
;
std
::
shared_ptr
<
CompNode
::
Event
>
event
;
});
DEF_EVENT
(
TensorDeclare
,
{
uint64_t
tensor_id
;
std
::
string
name
;
});
DEF_EVENT
(
TensorProduce
,
{
uint64_t
tensor_id
;
TensorLayout
layout
;
CompNode
device
;
void
*
ptr
;
});
DEF_EVENT
(
TensorUsage
,
{
uint64_t
tensor_id
;
});
DEF_EVENT
(
TensorRelease
,
{
uint64_t
tensor_id
;
});
DEF_EVENT
(
TensorErase
,
{
uint64_t
tensor_id
;
size_t
use_count
;
});
DEF_EVENT
(
TensorGetProp
,
{
uint64_t
tensor_id
;
TensorInfo
::
Prop
prop
;
std
::
string
prop_desc
;
TensorProp
prop
;
});
DEF_EVENT
(
TensorNotifyProp
,
{
uint64_t
tensor_id
;
uint64_t
wait_id
;
TensorProp
prop
;
});
DEF_DUR_EVENT
(
TensorWaitProp
,
{
DEF_EVENT
(
TensorWaitProp
,
{
uint64_t
tensor_id
;
TensorInfo
::
Prop
prop
;
std
::
string
prop_desc
;
uint64_t
wait_id
;
TensorProp
prop
;
});
DEF_EVENT
(
TensorNotifyProp
,
{
DEF_EVENT
(
TensorWaitPropFinish
,
{
uint64_t
tensor_id
;
TensorInfo
::
Prop
prop
;
std
::
string
prop_desc
;
uint64_t
wait_id
;
TensorProp
prop
;
bool
notified
;
});
DEF_DUR_EVENT
(
Sync
,
{});
DEF_DUR_EVENT
(
SampleDevice
,
{
CompNode
device
;
size_t
total_memory
;
size_t
free_memory
;
});
DEF_EVENT
(
WorkerException
,
{});
DEF_EVENT
(
ShapeInfer
,
{
bool
success
;
});
DEF_DUR_EVENT
(
Scope
,
{
std
::
string
name
;
});
DEF_DUR_EVENT
(
DeviceScope
,
{
std
::
string
name
;
std
::
shared_ptr
<
CompNode
::
Event
>
event
;
});
DEF_DUR_EVENT
(
Sync
,
{});
DEF_DUR_EVENT
(
StartProfile
,
{
size_t
capture_count
;
});
DEF_DUR_EVENT
(
StopProfile
,
{
size_t
escape_count
;
});
DEF_DUR_EVENT
(
TensorCommand
,
{
enum
Kind
{
Put
,
Del
,
SwapIn
,
SwapOut
,
Drop
,
ReGen
,
RecFree
,
GetValue
};
uint64_t
tensor_id
;
Kind
kind
;
});
#undef DEF_EVENT
#undef DEF_DUR_EVENT
}
imperative/src/impl/
interpreter/profiler.cpp
→
imperative/src/impl/
profiler/formats.h
浏览文件 @
1d64792b
/**
* \file imperative/src/impl/interpreter/profiler.
cpp
* \file imperative/src/impl/interpreter/profiler.
h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
...
...
@@ -9,22 +9,12 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#
include "./profiler.h"
#
pragma once
#include <sstream>
#include <cinttypes>
#include <unordered_set>
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
#include <unistd.h>
#elif defined(_WIN32)
#include <process.h>
#else
#error Unsupported platform
#endif
#include "../op_trait.h"
namespace
mgb
::
imperative
::
interpreter
::
intl
{
#include "megbrain/imperative/profiler.h"
namespace
mgb
::
imperative
::
profiler
{
}
imperative/src/impl/profiler/states.h
浏览文件 @
1d64792b
...
...
@@ -6,6 +6,8 @@
#include "megbrain/tensor.h"
#include "./events.h"
namespace
mgb
::
imperative
::
profiler
{
struct
ProfileDeviceState
{
...
...
@@ -53,6 +55,7 @@ struct ProfileStaticsState {
struct
ProfileOperatorState
{
uint64_t
id
;
std
::
string
name
;
OpParams
params
;
SmallVector
<
uint64_t
>
inputs
;
SmallVector
<
uint64_t
>
outputs
;
CompNode
device
;
...
...
imperative/src/include/megbrain/imperative/interpreter.h
浏览文件 @
1d64792b
...
...
@@ -47,8 +47,8 @@ struct Interpreter {
virtual
size_t
get_option
(
std
::
string
name
)
=
0
;
virtual
void
set_option
(
std
::
string
name
,
size_t
value
)
=
0
;
virtual
void
start_profile
(
std
::
unordered_map
<
std
::
string
,
int
>
option
)
=
0
;
virtual
void
stop_profile
(
std
::
string
basename
,
std
::
string
format
)
=
0
;
virtual
void
start_profile
()
=
0
;
virtual
void
stop_profile
()
=
0
;
virtual
void
push_scope
(
std
::
string
name
)
=
0
;
virtual
void
pop_scope
(
std
::
string
name
)
=
0
;
...
...
imperative/src/include/megbrain/imperative/profiler.h
浏览文件 @
1d64792b
...
...
@@ -17,6 +17,9 @@
#include <fstream>
#include <chrono>
#include <bitset>
#include <deque>
#include <any>
#include <typeindex>
#include "megbrain/comp_node.h"
#include "megbrain/graph/event.h"
...
...
@@ -29,165 +32,188 @@
namespace
mgb
{
namespace
imperative
{
class
DeviceTimer
{
public:
using
SharedEvent
=
std
::
shared_ptr
<
CompNode
::
Event
>
;
DeviceTimer
()
=
default
;
SharedEvent
get_device_time
(
CompNode
device
);
SmallVector
<
SharedEvent
>
get_all
(
SmallVector
<
CompNode
>
device_list
);
};
class
HostTimer
{
class
Timer
{
public:
void
reset
();
double
get_msecs
();
double
get_started_at
();
uint64_t
get_nsecs
();
uint64_t
get_started_at
();
static
std
::
shared_ptr
<
CompNode
::
Event
>
record_event
(
CompNode
device
);
private:
decltype
(
std
::
chrono
::
steady_clock
::
now
())
m_start
;
double
m_started_at
;
uint64_t
m_started_at
;
};
class
Profiler
Base
{
class
Profiler
{
public:
using
Host
=
std
::
thread
::
id
;
using
Device
=
CompNode
;
struct
HostInstant
{
Host
tid
;
double
time
;
void
wait
()
const
{}
struct
Record
{
uint64_t
id
;
uint64_t
time
;
//in ns
std
::
any
data
;
};
struct
DeviceInstant
{
double
before
;
std
::
shared_ptr
<
CompNode
::
Event
>
event
;
double
after
;
void
wait
()
const
{
event
->
host_wait
();
}
enum
Status
:
uint8_t
{
Running
=
0
,
Recording
=
1
,
Collecting
=
2
,
};
using
ProfileCollector
=
std
::
function
<
void
(
std
::
thread
::
id
,
Record
)
>
;
using
option_t
=
uint64_t
;
using
options_t
=
std
::
unordered_map
<
std
::
string
,
option_t
>
;
using
result_t
=
std
::
pair
<
std
::
thread
::
id
,
Record
>
;
using
results_t
=
std
::
vector
<
result_t
>
;
using
thread_dict_t
=
std
::
unordered_map
<
std
::
thread
::
id
,
std
::
string
>
;
private:
std
::
thread
::
id
m_thread_id
;
std
::
vector
<
Record
>
m_records
;
std
::
atomic
<
Status
>
m_status
=
Running
;
uint64_t
m_last_time
=
0
;
std
::
string
m_thread_name
;
static
options_t
sm_profile_options
;
static
std
::
mutex
sm_mutex
;
static
std
::
unordered_map
<
std
::
thread
::
id
,
Profiler
*>
sm_profilers
;
static
Timer
sm_timer
;
static
std
::
atomic_uint64_t
sm_last_id
;
static
std
::
atomic_size_t
sm_preferred_capacity
;
static
bool
sm_profiling
;
static
constexpr
bool
sm_debug
=
false
;
thread_local
static
std
::
unique_ptr
<
Profiler
>
tm_profiler
;
public:
Profiler
()
{
m_thread_id
=
std
::
this_thread
::
get_id
();
MGB_LOCK_GUARD
(
sm_mutex
);
if
(
sm_profilers
.
size
()
==
0
)
{
reset
();
}
mgb_assert
(
sm_profilers
.
count
(
m_thread_id
)
==
0
);
sm_profilers
[
m_thread_id
]
=
this
;
}
~
Profiler
()
{
MGB_LOCK_GUARD
(
sm_mutex
);
mgb_assert
(
sm_profilers
.
count
(
m_thread_id
)
==
1
);
sm_profilers
.
erase
(
m_thread_id
);
}
public:
static
Profiler
&
get_instance
()
{
return
*
tm_profiler
;
}
using
Instant
=
std
::
variant
<
HostInstant
,
DeviceInstant
>
;
static
void
reset
()
{
mgb_assert
(
sm_profilers
.
size
()
==
0
,
"profiler already running"
);
sm_timer
.
reset
();
}
template
<
typename
TEvent
>
struct
EventRecord
{
Instant
instant
;
TEvent
data
;
static
uint64_t
next_id
()
{
return
sm_last_id
++
;
}
const
HostInstant
&
host
()
const
{
return
std
::
get
<
HostInstant
>
(
instant
);
template
<
typename
T
,
typename
...
TArgs
>
static
uint64_t
record
(
TArgs
&&
...
args
)
{
auto
&
profiler
=
get_instance
();
auto
last_time
=
profiler
.
m_last_time
;
if
constexpr
(
sm_debug
)
{
Status
expected
=
Running
;
mgb_assert
(
profiler
.
m_status
.
compare_exchange_strong
(
expected
,
Recording
));
}
const
DeviceInstant
&
device
()
const
{
return
std
::
get
<
DeviceInstant
>
(
instant
);
uint64_t
id
=
next_id
();
uint64_t
time
=
sm_timer
.
get_nsecs
();
time
=
std
::
max
(
time
,
last_time
+
2000
);
profiler
.
m_last_time
=
time
;
profiler
.
m_records
.
push_back
({
id
,
time
,
T
{
std
::
forward
<
TArgs
>
(
args
)...}});
if
constexpr
(
sm_debug
)
{
Status
expected
=
Recording
;
mgb_assert
(
profiler
.
m_status
.
compare_exchange_strong
(
expected
,
Running
));
}
return
id
;
}
void
wait
()
const
{
std
::
visit
([
&
](
const
auto
&
instant
){
instant
.
wait
();
},
instant
);
static
results_t
collect
()
{
MGB_LOCK_GUARD
(
sm_mutex
);
if
constexpr
(
sm_debug
)
{
for
(
auto
&&
[
tid
,
profiler
]
:
sm_profilers
)
{
Status
expected
=
Running
;
mgb_assert
(
profiler
->
m_status
.
compare_exchange_strong
(
expected
,
Collecting
));
}
}
};
protected:
HostInstant
record_host
()
{
return
{
std
::
this_thread
::
get_id
(),
m_host_timer
.
get_msecs
()};
std
::
vector
<
std
::
pair
<
std
::
thread
::
id
,
Record
>>
profile_data
;
for
(
auto
&&
[
tid
,
profiler
]
:
sm_profilers
)
{
sm_preferred_capacity
=
std
::
max
(
sm_preferred_capacity
.
load
(),
profiler
->
m_records
.
size
());
for
(
auto
&
record
:
profiler
->
m_records
)
{
profile_data
.
push_back
({
tid
,
std
::
move
(
record
)});
}
profiler
->
m_records
.
clear
();
profiler
->
m_records
.
reserve
(
sm_preferred_capacity
);
}
std
::
sort
(
profile_data
.
begin
(),
profile_data
.
end
(),
[](
auto
&
lhs
,
auto
&
rhs
){
return
lhs
.
second
.
id
<
rhs
.
second
.
id
;
});
if
constexpr
(
sm_debug
)
{
for
(
auto
&&
[
tid
,
profiler
]
:
sm_profilers
)
{
Status
expected
=
Collecting
;
mgb_assert
(
profiler
->
m_status
.
compare_exchange_strong
(
expected
,
Running
));
}
}
return
profile_data
;
}
DeviceInstant
record_device
(
Device
device
)
{
auto
before
=
m_host_timer
.
get_msecs
();
auto
event
=
m_device_timer
.
get_device_time
(
device
);
auto
after
=
m_host_timer
.
get_msecs
();
return
{
before
,
event
,
after
};
static
option_t
get_option
(
std
::
string
key
,
option_t
default_val
)
{
if
(
!
sm_profile_options
.
count
(
key
))
{
return
default_val
;
}
return
sm_profile_options
.
at
(
key
);
}
protected:
std
::
atomic_int64_t
m_last_id
=
0
;
HostTimer
m_host_timer
;
DeviceTimer
m_device_timer
;
Spinlock
m_lock
;
};
static
void
load_options
(
options_t
options
)
{
sm_profile_options
=
std
::
move
(
options
);
}
template
<
typename
...
TEvents
>
class
Profiler
:
public
ProfilerBase
{
public:
using
Record
=
std
::
variant
<
EventRecord
<
TEvents
>
...
>
;
using
Mask
=
std
::
bitset
<
sizeof
...(
TEvents
)
>
;
static
options_t
get_options
()
{
return
sm_profile_options
;
}
struct
Data
{
std
::
vector
<
Record
>
records
;
double
started_at
;
};
static
bool
is_profiling
()
{
return
sm_profiling
;
}
template
<
typename
TEvent
,
size_t
index
=
0
>
static
constexpr
size_t
index_of
()
{
if
constexpr
(
index
==
std
::
variant_size_v
<
Record
>
)
{
return
index
;
}
else
if
constexpr
(
std
::
is_same_v
<
EventRecord
<
TEvent
>
,
std
::
variant_alternative_t
<
index
,
Record
>>
)
{
return
index
;
}
else
{
return
index_of
<
TEvent
,
index
+
1
>
();
}
};
static
void
start_profile
()
{
mgb_assert
(
!
sm_profiling
);
sm_profiling
=
true
;
}
template
<
typename
...
TEvents2
>
static
Mask
mask_of
()
{
return
Mask
{}
|
(
Mask
{}.
set
(
index_of
<
TEvents2
>
())
|
...)
;
static
void
stop_profile
()
{
mgb_assert
(
sm_profiling
);
sm_profiling
=
false
;
}
enum
Status
{
NotStarted
,
Profiling
,
Stopped
};
static
thread_dict_t
get_thread_dict
();
static
void
dump_profile
(
std
::
string
basename
,
std
::
string
format
,
results_t
results
,
options_t
options
);
};
class
ProfileDataCollector
{
public:
template
<
typename
TEvent
,
typename
...
TArgs
>
void
record_host
(
TArgs
&&
...
args
)
{
MGB_LOCK_GUARD
(
m_lock
);
if
(
!
m_event_mask
.
test
(
index_of
<
TEvent
>
()))
{
return
;
}
mgb_assert
(
m_status
!=
Stopped
,
"record after stop"
);
auto
instant
=
HostInstant
{
std
::
this_thread
::
get_id
(),
m_host_timer
.
get_msecs
()};
m_record_list
.
emplace_back
(
EventRecord
<
TEvent
>
{
std
::
move
(
instant
),
{
std
::
forward
<
TArgs
>
(
args
)...}});
template
<
typename
T
>
using
SubCollector
=
std
::
function
<
void
(
uint64_t
,
std
::
thread
::
id
,
uint64_t
,
T
)
>
;
private:
std
::
unordered_map
<
std
::
type_index
,
SubCollector
<
std
::
any
>>
m_collectors
;
public:
template
<
typename
T
>
ProfileDataCollector
&
handle
(
SubCollector
<
T
>
collector
)
{
auto
erased
=
[
collector
](
uint64_t
id
,
std
::
thread
::
id
tid
,
uint64_t
time
,
std
::
any
data
){
collector
(
id
,
tid
,
time
,
std
::
any_cast
<
T
>
(
std
::
move
(
data
)));
};
m_collectors
[
typeid
(
T
)]
=
erased
;
return
*
this
;
}
template
<
typename
TEvent
,
typename
...
TArgs
>
void
record_device
(
Device
device
,
TArgs
&&
...
args
)
{
MGB_LOCK_GUARD
(
m_lock
);
if
(
!
m_event_mask
.
test
(
index_of
<
TEvent
>
()))
{
void
operator
()(
uint64_t
id
,
std
::
thread
::
id
tid
,
uint64_t
time
,
std
::
any
event
)
{
std
::
type_index
type
=
event
.
type
();
if
(
m_collectors
.
count
(
type
)
==
0
)
{
return
;
}
mgb_assert
(
m_status
!=
Stopped
,
"record after stop"
);
auto
before
=
m_host_timer
.
get_msecs
();
auto
event
=
m_device_timer
.
get_device_time
(
device
);
auto
after
=
m_host_timer
.
get_msecs
();
auto
instant
=
DeviceInstant
{
before
,
event
,
after
};
m_record_list
.
emplace_back
(
EventRecord
<
TEvent
>
{
std
::
move
(
instant
),
{
std
::
forward
<
TArgs
>
(
args
)...}});
}
// unsafe
bool
is_profiling
()
{
return
m_status
==
Profiling
;
}
void
start
(
Mask
mask
)
{
MGB_LOCK_GUARD
(
m_lock
);
mgb_assert
(
m_status
==
NotStarted
,
"profiler already started"
);
m_status
=
Profiling
;
m_event_mask
=
mask
;
m_host_timer
.
reset
();
}
Data
stop
()
{
MGB_LOCK_GUARD
(
m_lock
);
mgb_assert
(
m_status
==
Profiling
,
"profiler not active"
);
m_status
=
Stopped
;
for
(
auto
&&
record
:
m_record_list
)
{
std
::
visit
([
&
](
const
auto
&
record
){
record
.
wait
();
},
record
);
}
auto
records
=
std
::
move
(
m_record_list
);
return
{
records
,
m_host_timer
.
get_started_at
()
};
auto
&
handler
=
m_collectors
.
at
(
type
);
handler
(
id
,
tid
,
time
,
std
::
move
(
event
));
}
protected:
std
::
vector
<
Record
>
m_record_list
;
Mask
m_event_mask
;
std
::
atomic
<
Status
>
m_status
=
NotStarted
;
};
}
// namespace imperative
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录