diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py
index 9ef038c78c8eb5c6f7d08479453b15ceb3e269ac..093f22aaed5d4dfaee477dfba6d2459e64ae083e 100644
--- a/imperative/python/megengine/__init__.py
+++ b/imperative/python/megengine/__init__.py
@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
 from .core._imperative_rt.common import (
     get_supported_sm_versions as _get_supported_sm_versions,
 )
-from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
 from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
@@ -118,13 +117,6 @@ def _check_sm_version():
 
 _check_sm_version()
 
-_set_fork_exec_path_for_timed_func(
-    sys.executable,
-    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
-)
-
-del _set_fork_exec_path_for_timed_func
-
 _exit_handlers = []
 
 
diff --git a/imperative/python/megengine/core/_config.py b/imperative/python/megengine/core/_config.py
index e756b572ba0c1fe29ca8eacbdd5952ee6faa0b51..00a740b8614ee67b1dd4dbc485f3224ff11b56eb 100644
--- a/imperative/python/megengine/core/_config.py
+++ b/imperative/python/megengine/core/_config.py
@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
 __compute_mode = "default"
 _benchmark_kernel = False
 _deterministic_kernel = False
+_benchmark_with_subprocess = False
 
 __all__ = [
     "benchmark_kernel",
+    "benchmark_with_subprocess",
     "deterministic_kernel",
     "async_level",
     "disable_memory_forwarding",
@@ -71,6 +73,44 @@ def deterministic_kernel(mod, option: bool):
     _deterministic_kernel = option
 
 
+@property
+def benchmark_with_subprocess(mod):
+    r"""Whether or not to register the fork-exec entry script, so that timed functions
+    (for example algorithm profiling) can be run in a subprocess. The default option
+    is false, which means this option registers no entry script.
+
+    Note:
+        Setting the option back to false only updates the stored value; the setter does
+        not unregister an entry script that was registered earlier.
+
+    Examples:
+        .. code-block::
+
+           import megengine as mge
+           mge.config.benchmark_with_subprocess = True
+    """
+    return _benchmark_with_subprocess
+
+
+@benchmark_with_subprocess.setter
+def benchmark_with_subprocess(mod, option: bool):
+    global _benchmark_with_subprocess
+    if option:
+        # imported lazily, only needed when the option is switched on
+        import sys
+        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func
+
+        # the entry script is located in the sibling ``utils`` directory
+        _set_fork_exec_path_for_timed_func(
+            sys.executable,
+            os.path.join(
+                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
+            ),
+        )
+    # remember the value (after a successful registration) so the getter reports it
+    _benchmark_with_subprocess = option
+
+
 @property
 def async_level(mod) -> int:
     r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
diff --git a/src/core/impl/system.cpp b/src/core/impl/system.cpp
index 99d21b4a07dd873e4c704ff571299ed8e1378a98..b1677e7ad51515a9fa1160351d1bc3d5622d09b3 100644
--- a/src/core/impl/system.cpp
+++ b/src/core/impl/system.cpp
@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
             return iter->second.direct_call(param);
 
         if (!m_fork_exec_impl) {
-            mgb_log_warn(
+            mgb_log_debug(
                     "timeout is set, but no fork_exec_impl not given; "
                     "timeout would be ignored");
             return iter->second.direct_call(param);
diff --git a/src/rdnn/impl/algo_chooser.cpp b/src/rdnn/impl/algo_chooser.cpp
index ba85f948fcb54ed26ebbf3222f869cec0ed2261c..7c90f6250f83db24e4af32c261df86af26bb3601 100644
--- a/src/rdnn/impl/algo_chooser.cpp
+++ b/src/rdnn/impl/algo_chooser.cpp
@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
         auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
         FOREACH_OPR_TYPE_DISPATCH(search_items, {
             auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
+            // skip different sub opr, for example:
+            // skip matmul algo when profiling convolution
+            if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
+                continue;
             megdnn_opr->param() =
                     Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
             typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
     // result, retrive_from_cache = true, allow_log = true
     typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
     construct_execution_policy(selected_strategy, policy);
-    return policy;
+    if (policy.algo.valid())
+        return policy;
+    return choose_by_heuristic(selected_strategy);
     MIDOUT_E
 }
 
@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
                             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
                             Algorithm::attribute_str(target_attr.first).c_str(),
                             Algorithm::attribute_str(target_attr.second).c_str());
-                    mgb_log_warn(
+                    mgb_log_debug(
                             "No algo get from cache for %s. This may caused by "
                             "mismatch with model and cache file or imcomplete "
                             "cache file. ex. profiling with version1, but "
@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
     if (!rst.valid())
         return None;
 
+    // subprocess will return dbl_max when memory limit is not satisfied
+    if (rst.val().time == std::numeric_limits<double>::max())
+        return None;
+
     std::string algo_desc;
     serialize_write_pod(policy.algo, algo_desc);
     return AlgoChooserProfileCache::ResultEntry{
@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     auto&& rst = get_profile_result_from_cache(selected_strategy);
     // rst.first.valid means there exists valid algorithms for current opr, just return
     // otherwise need to profile
+    // in order to avoid reprofile in fastrun
     if (rst.first.valid())
         return;
     AlgoChooserProfileCache::Result prof_rst;
@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
     double cur_timeout = 0;
 
+    size_t data_size = 0;
+    for (auto ly : m_fastrun_layouts)
+        data_size += ly.span().dist_byte();
+
     auto workspace_limit =
             m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
     RealTimer timer;
@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         ImplExecutionPolicy policy;
         policy.algo = algo.desc;
 
+        // skip the naive algo; it cannot be identified by an attribute, so compare
+        // the algo name instead
+        if (algo.desc.name.compare("NAIVE") == 0) {
+            continue;
+        }
+
         //! check negative attribute : skip negative attribute
         auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
         if (palgo->contain_attribute_any(target_attr.second)) {
@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
 
         //! check workspace limit
         construct_execution_policy(selected_strategy, policy);
-        mgb_assert(
-                policy.algo.valid(),
-                "construct execution policy must success when profiling");
-        if (get_workspace_size_bytes(policy) > workspace_limit) {
+        // this can fail, e.g. when constructing a matmul algorithm for a
+        // convolution opr, so skip the algo instead of asserting
+        if (!policy.algo.valid())
+            continue;
+        size_t workspace_needed = get_workspace_size_bytes(policy);
+        if (data_size + workspace_needed >
+            m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
             continue;
         }
 
@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         })
         // megbrain uncatched exception
         MGB_CATCH(..., {
-            mgb_log_warn("caught exception during %s", msg.c_str());
+            mgb_log_debug("caught exception during %s", msg.c_str());
             continue;
         })
         if (!cur_rst.valid()) {
@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
             "workspace limite requirement(%zu)",
             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
-    mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
+    // allowed to have empty profile result for current opr
 
     // append some previous profiled results
     if (rst.second.valid())
         prof_rst.insert(
                 prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
-    FixedTensorLayouts incache_layouts = m_incache_layouts;
-    typename Opr::Param origin_param = m_dnn_opr->param();
-    AlgoChooserProfileCache::Key cache_key{
-            incache_layouts.data(), incache_layouts.size(), &origin_param,
-            sizeof(origin_param)};
-
-    AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
-    cache.put(cache_key, prof_rst);
+    if (!prof_rst.empty()) {
+        FixedTensorLayouts incache_layouts = m_incache_layouts;
+        typename Opr::Param origin_param = m_dnn_opr->param();
+        AlgoChooserProfileCache::Key cache_key{
+                incache_layouts.data(), incache_layouts.size(), &origin_param,
+                sizeof(origin_param)};
+
+        AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
+        cache.put(cache_key, prof_rst);
+    }
     MIDOUT_E
 }
 
diff --git a/src/rdnn/impl/profiler.cpp b/src/rdnn/impl/profiler.cpp
index b0b1c2741fcb44401224bd423b04aae26a79d469..accf586efdc2beb5699bccfb317d2789496c4b5f 100644
--- a/src/rdnn/impl/profiler.cpp
+++ b/src/rdnn/impl/profiler.cpp
@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         }
     });
 
-    {
-        // first allocate a whole chunk to avoid memory fragmentation (here we
-        // rely on memory allocator to reuse memory)
-        auto align = cn.get_mem_addr_alignment();
-        size_t tot_size = align;
-        for (int i = 0; i < arity; ++i) {
-            tot_size += layouts[i].span().high_byte + align;
-        }
-        for (const auto& layout : preprocessed_layout) {
-            tot_size += layout.span().high_byte + align;
-        }
-        tot_size += param.workspace;
-        DeviceTensorStorage storage{cn};
-        storage.ensure_size(tot_size);
+    megdnn::Algorithm* algo =
+            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
+    mgb_assert(algo);
+
+#if !MGB_BUILD_SLIM_SERVING
+#if MGB_CUDA || MGB_ROCM
+    // if tot_size > workspace_limit, skip the current algo and return double max.
+    // this check is needed because, when profiling an algo with a subprocess,
+    // the child process occupies some cuda memory for its own initialization,
+    // so checking here is more accurate than the check done before
+    size_t workspace_limit =
+            std::max(cn.get_free_mem(), cn.get_max_block_size_available());
+    auto align = cn.get_mem_addr_alignment();
+    size_t tot_size = align;
+    for (int i = 0; i < arity; ++i) {
+        tot_size += layouts[i].span().high_byte + align;
+    }
+    for (const auto& layout : preprocessed_layout) {
+        tot_size += layout.span().high_byte + align;
+    }
+    tot_size += param.workspace;
+    if (tot_size > workspace_limit) {
+        mgb_log_debug(
+                "current memory is not enouugh when profiling algo %s\n", algo->name());
+        return TResult::from_pod(Result{std::numeric_limits<double>::max()});
     }
+#endif
+#endif
 
     // allocate input and output memory
     std::array<DeviceTensorND, arity_in> inp_val;
@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
             });
     ev_end->record();
 
-    megdnn::Algorithm* algo =
-            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
-    mgb_assert(algo);
     double next_report_time = 0.5;
     while (!ev_end->finished()) {
         if (timer.get_secs() >= next_report_time) {
 #if MGB_ENABLE_GETENV
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs"
+                    "profiling algo %s already took %.3f/%.3f secs"
                     " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                     algo->name(), timer.get_secs(), param.actual_timeout);
 #else
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
+                    "profiling algo %s already took %.3f/%.3f secs", algo->name(),
                     timer.get_secs(), param.actual_timeout);
 #endif
             next_report_time = timer.get_secs() + 1;
@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         std::this_thread::sleep_for(1000us);
 #endif
     }
+
+    DeviceTensorStorage storage;
+    for (int i = 0; i < arity_in; ++i) {
+        inp_val[i].reset(storage, TensorLayout{});
+    }
+    for (int i = 0; i < arity_out; ++i) {
+        out_val[i].reset(storage, TensorLayout{});
+    }
+    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
+        flt_val[i].reset(storage, TensorLayout{});
+    }
+    mdn_workspace = megdnn::Workspace{};
+    workspace.reset(storage, TensorLayout{});
     // release all free blocks owned by child process,
     // in order to avoid main process running out of memory
     cn.try_coalesce_all_free_memory();