提交 b36b5bd8 编写于 作者: M Megvii Engine Team

refactor(mgb): check input when profiling

GitOrigin-RevId: 1d722dd7418a903d5de4df96f97a3b6033c41aff
上级 6c9b3a58
......@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import *
from .device import *
from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
......@@ -118,13 +117,6 @@ def _check_sm_version():
_check_sm_version()
_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
)
del _set_fork_exec_path_for_timed_func
_exit_handlers = []
......
......@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
__compute_mode = "default"
_benchmark_kernel = False
_deterministic_kernel = False
_benchmark_with_subprocess = False
__all__ = [
"benchmark_kernel",
"benchmark_with_subprocess",
"deterministic_kernel",
"async_level",
"disable_memory_forwarding",
......@@ -71,6 +73,34 @@ def deterministic_kernel(mod, option: bool):
_deterministic_kernel = option
@property
def benchmark_with_subprocess(mod):
    r"""Whether to run kernel profiling (fastrun) in a forked subprocess.
    The default option is false, which means profiling runs in the current
    process.

    Setting this to ``True`` registers the fork/exec entry script used to
    time candidate algorithms in a child process (see the setter), so that
    profiling can be executed with a timeout isolated from the main process.

    Examples:

        .. code-block::

            import megengine as mge
            mge.config.benchmark_with_subprocess = True
    """
    return _benchmark_with_subprocess
@benchmark_with_subprocess.setter
def benchmark_with_subprocess(mod, option: bool):
    r"""Enable (or disable) running kernel profiling in a forked subprocess.

    :param option: ``True`` registers the fork/exec entry script used to run
        timed profiling functions in a child process. ``False`` only records
        the flag; a previously registered fork/exec path is not removed.
    """
    global _benchmark_with_subprocess
    if option:
        import sys

        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func

        # Point the C++ TimedFuncInvoker at the python entry script that the
        # forked child process will exec for each timed profiling call.
        _set_fork_exec_path_for_timed_func(
            sys.executable,
            os.path.join(
                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
            ),
        )
    # Record the flag so the getter reflects the configured value; the
    # original code never updated it, leaving the getter stuck at False.
    _benchmark_with_subprocess = option
@property
def async_level(mod) -> int:
r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
......
......@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
return iter->second.direct_call(param);
if (!m_fork_exec_impl) {
mgb_log_warn(
mgb_log_debug(
"timeout is set, but no fork_exec_impl not given; "
"timeout would be ignored");
return iter->second.direct_call(param);
......
......@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
// skip different sub opr, for example:
// skip matmul algo when profiling convolution
if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
continue;
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
......@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
// result, retrive_from_cache = true, allow_log = true
typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
construct_execution_policy(selected_strategy, policy);
return policy;
if (policy.algo.valid())
return policy;
return choose_by_heuristic(selected_strategy);
MIDOUT_E
}
......@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
mgb_log_warn(
mgb_log_debug(
"No algo get from cache for %s. This may caused by "
"mismatch with model and cache file or imcomplete "
"cache file. ex. profiling with version1, but "
......@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
if (!rst.valid())
return None;
// subprocess will return dbl_max when the memory limit is not satisfied
if (rst.val().time == std::numeric_limits<double>::max())
return None;
std::string algo_desc;
serialize_write_pod(policy.algo, algo_desc);
return AlgoChooserProfileCache::ResultEntry{
......@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto&& rst = get_profile_result_from_cache(selected_strategy);
// rst.first.valid() means a valid algorithm already exists in the cache for
// the current opr, so return directly instead of profiling again;
// this avoids re-profiling during fastrun
if (rst.first.valid())
return;
AlgoChooserProfileCache::Result prof_rst;
......@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
double cur_timeout = 0;
size_t data_size = 0;
for (auto ly : m_fastrun_layouts)
data_size += ly.span().dist_byte();
auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
RealTimer timer;
......@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
ImplExecutionPolicy policy;
policy.algo = algo.desc;
// skip the naive algo; it cannot be identified via its attributes, so we
// fall back to comparing the algorithm name (strcmp) instead
if (algo.desc.name.compare("NAIVE") == 0) {
continue;
}
//! check negative attribute : skip negative attribute
auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
if (palgo->contain_attribute_any(target_attr.second)) {
......@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
//! check workspace limit
construct_execution_policy(selected_strategy, policy);
mgb_assert(
policy.algo.valid(),
"construct execution policy must success when profiling");
if (get_workspace_size_bytes(policy) > workspace_limit) {
// this may fail when constructing a matmul
// algorithm for a convolution opr
if (!policy.algo.valid())
continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (data_size + workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue;
}
......@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
})
// megbrain uncatched exception
MGB_CATCH(..., {
mgb_log_warn("caught exception during %s", msg.c_str());
mgb_log_debug("caught exception during %s", msg.c_str());
continue;
})
if (!cur_rst.valid()) {
......@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
"workspace limite requirement(%zu)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
// allowed to have empty profile result for current opr
// append some previous profiled results
if (rst.second.valid())
prof_rst.insert(
prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
if (!prof_rst.empty()) {
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
}
MIDOUT_E
}
......
......@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
#if !MGB_BUILD_SLIM_SERVING
#if MGB_CUDA || MGB_ROCM
// if tot_size > workspace_limit, skip the current algo and return double_max.
// this check is needed because when profiling an algo in a subprocess, the
// child process occupies some extra cuda memory for its own initialization,
// so measuring the free memory here is more accurate than checking up front
size_t workspace_limit =
std::max(cn.get_free_mem(), cn.get_max_block_size_available());
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
if (tot_size > workspace_limit) {
mgb_log_debug(
"current memory is not enouugh when profiling algo %s\n", algo->name());
return TResult::from_pod(Result{std::numeric_limits<double>::max()});
}
#endif
#endif
// allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val;
......@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
});
ev_end->record();
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5;
while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV
mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs"
"profiling algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout);
#else
mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
"profiling algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout);
#endif
next_report_time = timer.get_secs() + 1;
......@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std::this_thread::sleep_for(1000us);
#endif
}
DeviceTensorStorage storage;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].reset(storage, TensorLayout{});
}
for (int i = 0; i < arity_out; ++i) {
out_val[i].reset(storage, TensorLayout{});
}
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i].reset(storage, TensorLayout{});
}
mdn_workspace = megdnn::Workspace{};
workspace.reset(storage, TensorLayout{});
// release all free blocks owned by child process,
// in order to avoid main process running out of memory
cn.try_coalesce_all_free_memory();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册