提交 b36b5bd8 编写于 作者: M Megvii Engine Team

refactor(mgb): check input when profiling

GitOrigin-RevId: 1d722dd7418a903d5de4df96f97a3b6033c41aff
上级 6c9b3a58
...@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync ...@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import ( from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions, get_supported_sm_versions as _get_supported_sm_versions,
) )
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import * from .config import *
from .device import * from .device import *
from .logger import enable_debug_log, get_logger, set_log_file, set_log_level from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
...@@ -118,13 +117,6 @@ def _check_sm_version(): ...@@ -118,13 +117,6 @@ def _check_sm_version():
_check_sm_version() _check_sm_version()
_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
)
del _set_fork_exec_path_for_timed_func
_exit_handlers = [] _exit_handlers = []
......
...@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import ( ...@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
__compute_mode = "default" __compute_mode = "default"
_benchmark_kernel = False _benchmark_kernel = False
_deterministic_kernel = False _deterministic_kernel = False
_benchmark_with_subprocess = False
__all__ = [ __all__ = [
"benchmark_kernel", "benchmark_kernel",
"benchmark_with_subprocess",
"deterministic_kernel", "deterministic_kernel",
"async_level", "async_level",
"disable_memory_forwarding", "disable_memory_forwarding",
...@@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool): ...@@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool):
_deterministic_kernel = option _deterministic_kernel = option
@property
def benchmark_with_subprocess(mod):
    r"""Whether the fork-exec entry for timed (profiling) functions has been requested.

    The default option is false. Assigning ``True`` registers the current
    Python interpreter together with ``utils/_timed_func_fork_exec_entry.py``
    through ``_set_fork_exec_path_for_timed_func``.

    Examples:
        .. code-block::

           import megengine as mge
           mge.config.benchmark_with_subprocess = True
    """
    return _benchmark_with_subprocess


@benchmark_with_subprocess.setter
def benchmark_with_subprocess(mod, option: bool):
    # Without the ``global`` declaration and the assignment below, the getter
    # would keep returning the initial value no matter what was assigned.
    global _benchmark_with_subprocess
    if option:
        # Imported lazily so the native helper is only touched when the
        # feature is actually requested.
        import os
        import sys

        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func

        _set_fork_exec_path_for_timed_func(
            sys.executable,
            os.path.join(
                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
            ),
        )
    # Record the flag only after registration succeeded, so a failing
    # registration never leaves the flag claiming the feature is on.
    # NOTE(review): assigning False only updates the flag; nothing visible here
    # unregisters a previously set fork-exec path -- confirm whether one exists.
    _benchmark_with_subprocess = option
@property @property
def async_level(mod) -> int: def async_level(mod) -> int:
r"""Get or set config whether raise error exactly when invoking op. The default level is 2, r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
......
...@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker { ...@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
return iter->second.direct_call(param); return iter->second.direct_call(param);
if (!m_fork_exec_impl) { if (!m_fork_exec_impl) {
mgb_log_warn( mgb_log_debug(
"timeout is set, but no fork_exec_impl not given; " "timeout is set, but no fork_exec_impl not given; "
"timeout would be ignored"); "timeout would be ignored");
return iter->second.direct_call(param); return iter->second.direct_call(param);
......
...@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp ...@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker); auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
FOREACH_OPR_TYPE_DISPATCH(search_items, { FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
// skip different sub opr, for example:
// skip matmul algo when profiling convolution
if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
continue;
megdnn_opr->param() = megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param); Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
...@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp ...@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
// result, retrive_from_cache = true, allow_log = true // result, retrive_from_cache = true, allow_log = true
typename AlgoChooser<Opr>::ImplExecutionPolicy policy; typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
construct_execution_policy(selected_strategy, policy); construct_execution_policy(selected_strategy, policy);
return policy; if (policy.algo.valid())
return policy;
return choose_by_heuristic(selected_strategy);
MIDOUT_E MIDOUT_E
} }
...@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy( ...@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(), Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str()); Algorithm::attribute_str(target_attr.second).c_str());
mgb_log_warn( mgb_log_debug(
"No algo get from cache for %s. This may caused by " "No algo get from cache for %s. This may caused by "
"mismatch with model and cache file or imcomplete " "mismatch with model and cache file or imcomplete "
"cache file. ex. profiling with version1, but " "cache file. ex. profiling with version1, but "
...@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper: ...@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
if (!rst.valid()) if (!rst.valid())
return None; return None;
// subprocess will return dbl_max when memory limit is not satisfied
if (rst.val().time == std::numeric_limits<double>::max())
return None;
std::string algo_desc; std::string algo_desc;
serialize_write_pod(policy.algo, algo_desc); serialize_write_pod(policy.algo, algo_desc);
return AlgoChooserProfileCache::ResultEntry{ return AlgoChooserProfileCache::ResultEntry{
...@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto&& rst = get_profile_result_from_cache(selected_strategy); auto&& rst = get_profile_result_from_cache(selected_strategy);
// rst.first.valid means there exists valid algorithms for current opr, just return // rst.first.valid means there exists valid algorithms for current opr, just return
// otherwise need to profile // otherwise need to profile
// in order to avoid reprofile in fastrun
if (rst.first.valid()) if (rst.first.valid())
return; return;
AlgoChooserProfileCache::Result prof_rst; AlgoChooserProfileCache::Result prof_rst;
...@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts); std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
double cur_timeout = 0; double cur_timeout = 0;
size_t data_size = 0;
for (auto ly : m_fastrun_layouts)
data_size += ly.span().dist_byte();
auto workspace_limit = auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
RealTimer timer; RealTimer timer;
...@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
ImplExecutionPolicy policy; ImplExecutionPolicy policy;
policy.algo = algo.desc; policy.algo = algo.desc;
// skip the naive algo; it cannot be identified by attribute, so compare the
// algorithm name instead
if (algo.desc.name.compare("NAIVE") == 0) {
continue;
}
//! check negative attribute : skip negative attribute //! check negative attribute : skip negative attribute
auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
if (palgo->contain_attribute_any(target_attr.second)) { if (palgo->contain_attribute_any(target_attr.second)) {
...@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
//! check workspace limit //! check workspace limit
construct_execution_policy(selected_strategy, policy); construct_execution_policy(selected_strategy, policy);
mgb_assert( // this will failed
policy.algo.valid(), // when construct matmul algorithm for convolution opr
"construct execution policy must success when profiling"); if (!policy.algo.valid())
if (get_workspace_size_bytes(policy) > workspace_limit) { continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (data_size + workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue; continue;
} }
...@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
}) })
// megbrain uncatched exception // megbrain uncatched exception
MGB_CATCH(..., { MGB_CATCH(..., {
mgb_log_warn("caught exception during %s", msg.c_str()); mgb_log_debug("caught exception during %s", msg.c_str());
continue; continue;
}) })
if (!cur_rst.valid()) { if (!cur_rst.valid()) {
...@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
"workspace limite requirement(%zu)", "workspace limite requirement(%zu)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit); Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); // allowed to have empty profile result for current opr
// append some previous profiled results // append some previous profiled results
if (rst.second.valid()) if (rst.second.valid())
prof_rst.insert( prof_rst.insert(
prof_rst.end(), rst.second.val().begin(), rst.second.val().end()); prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
FixedTensorLayouts incache_layouts = m_incache_layouts; if (!prof_rst.empty()) {
typename Opr::Param origin_param = m_dnn_opr->param(); FixedTensorLayouts incache_layouts = m_incache_layouts;
AlgoChooserProfileCache::Key cache_key{ typename Opr::Param origin_param = m_dnn_opr->param();
incache_layouts.data(), incache_layouts.size(), &origin_param, AlgoChooserProfileCache::Key cache_key{
sizeof(origin_param)}; incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst); AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
}
MIDOUT_E MIDOUT_E
} }
......
...@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
} }
}); });
{ megdnn::Algorithm* algo =
// first allocate a whole chunk to avoid memory fragmentation (here we megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
// rely on memory allocator to reuse memory) mgb_assert(algo);
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align; #if !MGB_BUILD_SLIM_SERVING
for (int i = 0; i < arity; ++i) { #if MGB_CUDA || MGB_ROCM
tot_size += layouts[i].span().high_byte + align; // if tot_size > workspace_limit, then skip current algo, return double_max
} // this assertion is needed because when profiling algo with subprocess,
for (const auto& layout : preprocessed_layout) { // child process would occupy some cuda memory for initialization
tot_size += layout.span().high_byte + align; // this assertion is the most accurate than before
} size_t workspace_limit =
tot_size += param.workspace; std::max(cn.get_free_mem(), cn.get_max_block_size_available());
DeviceTensorStorage storage{cn}; auto align = cn.get_mem_addr_alignment();
storage.ensure_size(tot_size); size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
if (tot_size > workspace_limit) {
mgb_log_debug(
"current memory is not enouugh when profiling algo %s\n", algo->name());
return TResult::from_pod(Result{std::numeric_limits<double>::max()});
} }
#endif
#endif
// allocate input and output memory // allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val; std::array<DeviceTensorND, arity_in> inp_val;
...@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}); });
ev_end->record(); ev_end->record();
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5; double next_report_time = 0.5;
while (!ev_end->finished()) { while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) { if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV #if MGB_ENABLE_GETENV
mgb_log_debug( mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs" "profiling algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout); algo->name(), timer.get_secs(), param.actual_timeout);
#else #else
mgb_log_debug( mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(), "profiling algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout); timer.get_secs(), param.actual_timeout);
#endif #endif
next_report_time = timer.get_secs() + 1; next_report_time = timer.get_secs() + 1;
...@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std::this_thread::sleep_for(1000us); std::this_thread::sleep_for(1000us);
#endif #endif
} }
DeviceTensorStorage storage;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].reset(storage, TensorLayout{});
}
for (int i = 0; i < arity_out; ++i) {
out_val[i].reset(storage, TensorLayout{});
}
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i].reset(storage, TensorLayout{});
}
mdn_workspace = megdnn::Workspace{};
workspace.reset(storage, TensorLayout{});
// release all free blocks owned by child process, // release all free blocks owned by child process,
// in order to avoid main process running out of memory // in order to avoid main process running out of memory
cn.try_coalesce_all_free_memory(); cn.try_coalesce_all_free_memory();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册