Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
b36b5bd8
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
395
Star
4704
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b36b5bd8
编写于
6月 01, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor(mgb): check input when profiling
GitOrigin-RevId: 1d722dd7418a903d5de4df96f97a3b6033c41aff
上级
6c9b3a58
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
115 addition
and
44 deletion
+115
-44
imperative/python/megengine/__init__.py
imperative/python/megengine/__init__.py
+0
-8
imperative/python/megengine/core/_config.py
imperative/python/megengine/core/_config.py
+30
-0
src/core/impl/system.cpp
src/core/impl/system.cpp
+1
-1
src/rdnn/impl/algo_chooser.cpp
src/rdnn/impl/algo_chooser.cpp
+42
-16
src/rdnn/impl/profiler.cpp
src/rdnn/impl/profiler.cpp
+42
-19
未找到文件。
imperative/python/megengine/__init__.py
浏览文件 @
b36b5bd8
...
...
@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
from
.core._imperative_rt.common
import
(
get_supported_sm_versions
as
_get_supported_sm_versions
,
)
from
.core._imperative_rt.utils
import
_set_fork_exec_path_for_timed_func
from
.config
import
*
from
.device
import
*
from
.logger
import
enable_debug_log
,
get_logger
,
set_log_file
,
set_log_level
...
...
@@ -118,13 +117,6 @@ def _check_sm_version():
_check_sm_version
()
_set_fork_exec_path_for_timed_func
(
sys
.
executable
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"utils"
,
"_timed_func_fork_exec_entry.py"
),
)
del
_set_fork_exec_path_for_timed_func
_exit_handlers
=
[]
...
...
imperative/python/megengine/core/_config.py
浏览文件 @
b36b5bd8
...
...
@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
__compute_mode
=
"default"
_benchmark_kernel
=
False
_deterministic_kernel
=
False
_benchmark_with_subprocess
=
False
__all__
=
[
"benchmark_kernel"
,
"benchmark_with_subprocess"
,
"deterministic_kernel"
,
"async_level"
,
"disable_memory_forwarding"
,
...
...
@@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool):
_deterministic_kernel
=
option
@
property
def
benchmark_with_subprocess
(
mod
):
r
"""Whether or not to run possible algorithms on a real device to find the best one. The default option is false,
which means a heuristic is used to choose the fastest algorithm.
Examples:
.. code-block::
import megengine as mge
mge.config.benchmark_with_subprocess = True
"""
return
_benchmark_with_subprocess
@
benchmark_with_subprocess
.
setter
def
benchmark_with_subprocess
(
mod
,
option
:
bool
):
if
option
:
import
sys
from
._imperative_rt.utils
import
_set_fork_exec_path_for_timed_func
_set_fork_exec_path_for_timed_func
(
sys
.
executable
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"../utils"
,
"_timed_func_fork_exec_entry.py"
),
)
@
property
def
async_level
(
mod
)
->
int
:
r
"""Get or set config whether raise error exactly when invoking op. The default level is 2,
...
...
src/core/impl/system.cpp
浏览文件 @
b36b5bd8
...
...
@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
return
iter
->
second
.
direct_call
(
param
);
if
(
!
m_fork_exec_impl
)
{
mgb_log_
warn
(
mgb_log_
debug
(
"timeout is set, but no fork_exec_impl not given; "
"timeout would be ignored"
);
return
iter
->
second
.
direct_call
(
param
);
...
...
src/rdnn/impl/algo_chooser.cpp
浏览文件 @
b36b5bd8
...
...
@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
auto
&&
search_items
=
flatten_search_space
<
Opr
>
(
*
this
,
circular_deps_checker
);
FOREACH_OPR_TYPE_DISPATCH
(
search_items
,
{
auto
&&
megdnn_opr
=
opr
::
intl
::
create_megdnn_opr
<
_Opr
>
(
m_cn
);
// skip different sub opr, for example:
// skip matmul algo when profiling convolution
if
(
m_dnn_opr
->
get_opr_type
()
!=
megdnn_opr
->
get_opr_type
())
continue
;
megdnn_opr
->
param
()
=
Algorithm
::
deserialize_read_pod
<
typename
_Opr
::
Param
>
(
_item
.
param
);
typename
AlgoChooser
<
_Opr
>::
AlgoChooserHelper
sub_helper
(
...
...
@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
// result, retrive_from_cache = true, allow_log = true
typename
AlgoChooser
<
Opr
>::
ImplExecutionPolicy
policy
;
construct_execution_policy
(
selected_strategy
,
policy
);
if
(
policy
.
algo
.
valid
())
return
policy
;
return
choose_by_heuristic
(
selected_strategy
);
MIDOUT_E
}
...
...
@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
::
MegDNNOpr2Typename
<
Opr
>::
name
,
layouts_str
.
c_str
(),
Algorithm
::
attribute_str
(
target_attr
.
first
).
c_str
(),
Algorithm
::
attribute_str
(
target_attr
.
second
).
c_str
());
mgb_log_
warn
(
mgb_log_
debug
(
"No algo get from cache for %s. This may caused by "
"mismatch with model and cache file or imcomplete "
"cache file. ex. profiling with version1, but "
...
...
@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
if
(
!
rst
.
valid
())
return
None
;
// subprocess will return dbl_max when memory limit is not satisfied
if
(
rst
.
val
().
time
==
std
::
numeric_limits
<
double
>::
max
())
return
None
;
std
::
string
algo_desc
;
serialize_write_pod
(
policy
.
algo
,
algo_desc
);
return
AlgoChooserProfileCache
::
ResultEntry
{
...
...
@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto
&&
rst
=
get_profile_result_from_cache
(
selected_strategy
);
// rst.first.valid() means the cache already holds a valid algorithm for the current opr, so just return;
// otherwise we need to profile
// (returning early avoids re-profiling in fastrun)
if
(
rst
.
first
.
valid
())
return
;
AlgoChooserProfileCache
::
Result
prof_rst
;
...
...
@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
std
::
string
layouts_str
=
AlgoChooser
::
format_fixlayouts
(
m_fastrun_layouts
);
double
cur_timeout
=
0
;
size_t
data_size
=
0
;
for
(
auto
ly
:
m_fastrun_layouts
)
data_size
+=
ly
.
span
().
dist_byte
();
auto
workspace_limit
=
m_desc
.
get_workspace_limit
(
m_cn
,
m_execution_policy
.
workspace_limit
);
RealTimer
timer
;
...
...
@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
ImplExecutionPolicy
policy
;
policy
.
algo
=
algo
.
desc
;
// skip the naive algo; an attribute cannot be used to identify the naive algo, so compare
// the algo name instead
if
(
algo
.
desc
.
name
.
compare
(
"NAIVE"
)
==
0
)
{
continue
;
}
//! check negative attribute : skip negative attribute
auto
palgo
=
m_dnn_opr
->
get_algorithm_from_desc
(
policy
.
algo
);
if
(
palgo
->
contain_attribute_any
(
target_attr
.
second
))
{
...
...
@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
//! check workspace limit
construct_execution_policy
(
selected_strategy
,
policy
);
mgb_assert
(
policy
.
algo
.
valid
(),
"construct execution policy must success when profiling"
);
if
(
get_workspace_size_bytes
(
policy
)
>
workspace_limit
)
{
// this will fail
// when constructing a matmul algorithm for a convolution opr
if
(
!
policy
.
algo
.
valid
())
continue
;
size_t
workspace_needed
=
get_workspace_size_bytes
(
policy
);
if
(
data_size
+
workspace_needed
>
m_desc
.
get_workspace_limit
(
m_cn
,
m_execution_policy
.
workspace_limit
))
{
continue
;
}
...
...
@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
})
// megbrain uncaught exception
MGB_CATCH
(...,
{
mgb_log_
warn
(
"caught exception during %s"
,
msg
.
c_str
());
mgb_log_
debug
(
"caught exception during %s"
,
msg
.
c_str
());
continue
;
})
if
(
!
cur_rst
.
valid
())
{
...
...
@@ -982,12 +1006,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
"workspace limite requirement(%zu)"
,
::
MegDNNOpr2Typename
<
Opr
>::
name
,
layouts_str
.
c_str
(),
Algorithm
::
attribute_str
(
target_attr
.
second
).
c_str
(),
workspace_limit
);
mgb_assert
(
!
prof_rst
.
empty
(),
"%s"
,
msg
.
c_str
());
// allowed to have empty profile result for current opr
// append some previous profiled results
if
(
rst
.
second
.
valid
())
prof_rst
.
insert
(
prof_rst
.
end
(),
rst
.
second
.
val
().
begin
(),
rst
.
second
.
val
().
end
());
if
(
!
prof_rst
.
empty
())
{
FixedTensorLayouts
incache_layouts
=
m_incache_layouts
;
typename
Opr
::
Param
origin_param
=
m_dnn_opr
->
param
();
AlgoChooserProfileCache
::
Key
cache_key
{
...
...
@@ -996,6 +1021,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
AlgoChooserProfileCache
cache
(
m_cn
,
profile_name
(
m_dnn_opr
).
c_str
());
cache
.
put
(
cache_key
,
prof_rst
);
}
MIDOUT_E
}
...
...
src/rdnn/impl/profiler.cpp
浏览文件 @
b36b5bd8
...
...
@@ -245,9 +245,18 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
megdnn
::
Algorithm
*
algo
=
megdnn_opr
->
get_algorithm_from_desc
(
megdnn_opr
->
execution_policy
().
algo
);
mgb_assert
(
algo
);
#if !MGB_BUILD_SLIM_SERVING
#if MGB_CUDA || MGB_ROCM
// if tot_size > workspace_limit, then skip the current algo and return double_max
// this check is needed because when profiling an algo with a subprocess,
// the child process would occupy some cuda memory for initialization
// this check is more accurate than the previous one
size_t
workspace_limit
=
std
::
max
(
cn
.
get_free_mem
(),
cn
.
get_max_block_size_available
());
auto
align
=
cn
.
get_mem_addr_alignment
();
size_t
tot_size
=
align
;
for
(
int
i
=
0
;
i
<
arity
;
++
i
)
{
...
...
@@ -257,9 +266,13 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
tot_size
+=
layout
.
span
().
high_byte
+
align
;
}
tot_size
+=
param
.
workspace
;
DeviceTensorStorage
storage
{
cn
};
storage
.
ensure_size
(
tot_size
);
if
(
tot_size
>
workspace_limit
)
{
mgb_log_debug
(
"current memory is not enouugh when profiling algo %s
\n
"
,
algo
->
name
());
return
TResult
::
from_pod
(
Result
{
std
::
numeric_limits
<
double
>::
max
()});
}
#endif
#endif
// allocate input and output memory
std
::
array
<
DeviceTensorND
,
arity_in
>
inp_val
;
...
...
@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
});
ev_end
->
record
();
megdnn
::
Algorithm
*
algo
=
megdnn_opr
->
get_algorithm_from_desc
(
megdnn_opr
->
execution_policy
().
algo
);
mgb_assert
(
algo
);
double
next_report_time
=
0.5
;
while
(
!
ev_end
->
finished
())
{
if
(
timer
.
get_secs
()
>=
next_report_time
)
{
#if MGB_ENABLE_GETENV
mgb_log_debug
(
"profiling
conv
algo %s already took %.3f/%.3f secs"
"profiling algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) "
,
algo
->
name
(),
timer
.
get_secs
(),
param
.
actual_timeout
);
#else
mgb_log_debug
(
"profiling
conv
algo %s already took %.3f/%.3f secs"
,
algo
->
name
(),
"profiling algo %s already took %.3f/%.3f secs"
,
algo
->
name
(),
timer
.
get_secs
(),
param
.
actual_timeout
);
#endif
next_report_time
=
timer
.
get_secs
()
+
1
;
...
...
@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std
::
this_thread
::
sleep_for
(
1000us
);
#endif
}
DeviceTensorStorage
storage
;
for
(
int
i
=
0
;
i
<
arity_in
;
++
i
)
{
inp_val
[
i
].
reset
(
storage
,
TensorLayout
{});
}
for
(
int
i
=
0
;
i
<
arity_out
;
++
i
)
{
out_val
[
i
].
reset
(
storage
,
TensorLayout
{});
}
for
(
size_t
i
=
0
;
i
<
preprocessed_layout
.
size
();
i
++
)
{
flt_val
[
i
].
reset
(
storage
,
TensorLayout
{});
}
mdn_workspace
=
megdnn
::
Workspace
{};
workspace
.
reset
(
storage
,
TensorLayout
{});
// release all free blocks owned by child process,
// in order to avoid main process running out of memory
cn
.
try_coalesce_all_free_memory
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录