提交 b36b5bd8 编写于 作者: M Megvii Engine Team

refactor(mgb): check input when profiling

GitOrigin-RevId: 1d722dd7418a903d5de4df96f97a3b6033c41aff
上级 6c9b3a58
......@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import *
from .device import *
from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
......@@ -118,13 +117,6 @@ def _check_sm_version():
_check_sm_version()
_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
)
del _set_fork_exec_path_for_timed_func
_exit_handlers = []
......
......@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
__compute_mode = "default"
_benchmark_kernel = False
_deterministic_kernel = False
_benchmark_with_subprocess = False
__all__ = [
"benchmark_kernel",
"benchmark_with_subprocess",
"deterministic_kernel",
"async_level",
"disable_memory_forwarding",
......@@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool):
_deterministic_kernel = option
@property
def benchmark_with_subprocess(mod):
    r"""Whether to run kernel profiling (fastrun) in a forked subprocess.
    The default option is false, which means profiling runs in the current
    process.

    Setting this to ``True`` registers the fork/exec entry script used to
    time candidate algorithms in a child process (see the setter), so that
    profiling can be executed with a timeout isolated from the main process.

    Examples:

        .. code-block::

            import megengine as mge
            mge.config.benchmark_with_subprocess = True
    """
    return _benchmark_with_subprocess
@benchmark_with_subprocess.setter
def benchmark_with_subprocess(mod, option: bool):
    r"""Enable (or disable) running kernel profiling in a forked subprocess.

    :param option: ``True`` registers the fork/exec entry script used to run
        timed profiling functions in a child process. ``False`` only records
        the flag; a previously registered fork/exec path is not removed.
    """
    global _benchmark_with_subprocess
    if option:
        import sys

        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func

        # Point the C++ TimedFuncInvoker at the python entry script that the
        # forked child process will exec for each timed profiling call.
        _set_fork_exec_path_for_timed_func(
            sys.executable,
            os.path.join(
                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
            ),
        )
    # Record the flag so the getter reflects the configured value; the
    # original code never updated it, leaving the getter stuck at False.
    _benchmark_with_subprocess = option
@property
def async_level(mod) -> int:
r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
......
......@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
return iter->second.direct_call(param);
if (!m_fork_exec_impl) {
mgb_log_warn(
mgb_log_debug(
"timeout is set, but no fork_exec_impl not given; "
"timeout would be ignored");
return iter->second.direct_call(param);
......
......@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
// skip different sub opr, for example:
// skip matmul algo when profiling convolution
if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
continue;
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
......@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
// result, retrive_from_cache = true, allow_log = true
typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
construct_execution_policy(selected_strategy, policy);
return policy;
if (policy.algo.valid())
return policy;
return choose_by_heuristic(selected_strategy);
MIDOUT_E
}
......@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
mgb_log_warn(
mgb_log_debug(
"No algo get from cache for %s. This may caused by "
"mismatch with model and cache file or imcomplete "
"cache file. ex. profiling with version1, but "
......@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
if (!rst.valid())
return None;
// subprocess will return dbl_max when the memory limit is not satisfied
if (rst.val().time == std::numeric_limits<double>::max())
return None;
std::string algo_desc;
serialize_write_pod(policy.algo, algo_desc);
return AlgoChooserProfileCache::ResultEntry{
......@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto&& rst = get_profile_result_from_cache(selected_strategy);
// rst.first.valid() means a valid algorithm already exists in the cache for
// the current opr, so return directly instead of profiling again;
// this avoids re-profiling during fastrun
if (rst.first.valid())
return;
AlgoChooserProfileCache::Result prof_rst;
......@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
double cur_timeout = 0;
size_t data_size = 0;
for (auto ly : m_fastrun_layouts)
data_size += ly.span().dist_byte();
auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
RealTimer timer;
......@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
ImplExecutionPolicy policy;
policy.algo = algo.desc;
// skip the naive algo; it cannot be identified via its attributes, so we
// fall back to comparing the algorithm name (strcmp) instead
if (algo.desc.name.compare("NAIVE") == 0) {
continue;
}
//! check negative attribute : skip negative attribute
auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
if (palgo->contain_attribute_any(target_attr.second)) {
......@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
//! check workspace limit
construct_execution_policy(selected_strategy, policy);
mgb_assert(
policy.algo.valid(),
"construct execution policy must success when profiling");
if (get_workspace_size_bytes(policy) > workspace_limit) {
// this may fail when constructing a matmul
// algorithm for a convolution opr
if (!policy.algo.valid())
continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (data_size + workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue;
}
......@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
})
// megbrain uncatched exception
MGB_CATCH(..., {
mgb_log_warn("caught exception during %s", msg.c_str());
mgb_log_debug("caught exception during %s", msg.c_str());
continue;
})
if (!cur_rst.valid()) {
......@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
"workspace limite requirement(%zu)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
// allowed to have empty profile result for current opr
// append some previous profiled results
if (rst.second.valid())
prof_rst.insert(
prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
if (!prof_rst.empty()) {
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
}
MIDOUT_E
}
......
......@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
#if !MGB_BUILD_SLIM_SERVING
#if MGB_CUDA || MGB_ROCM
// if tot_size > workspace_limit, skip the current algo and return double_max.
// this check is needed because when profiling an algo in a subprocess, the
// child process occupies some extra cuda memory for its own initialization,
// so measuring the free memory here is more accurate than checking up front
size_t workspace_limit =
std::max(cn.get_free_mem(), cn.get_max_block_size_available());
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
if (tot_size > workspace_limit) {
mgb_log_debug(
"current memory is not enouugh when profiling algo %s\n", algo->name());
return TResult::from_pod(Result{std::numeric_limits<double>::max()});
}
#endif
#endif
// allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val;
......@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
});
ev_end->record();
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5;
while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV
mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs"
"profiling algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout);
#else
mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
"profiling algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout);
#endif
next_report_time = timer.get_secs() + 1;
......@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std::this_thread::sleep_for(1000us);
#endif
}
DeviceTensorStorage storage;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].reset(storage, TensorLayout{});
}
for (int i = 0; i < arity_out; ++i) {
out_val[i].reset(storage, TensorLayout{});
}
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i].reset(storage, TensorLayout{});
}
mdn_workspace = megdnn::Workspace{};
workspace.reset(storage, TensorLayout{});
// release all free blocks owned by child process,
// in order to avoid main process running out of memory
cn.try_coalesce_all_free_memory();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册