Make subprocess-based algorithm profiling opt-in.

* megengine/__init__.py no longer registers the fork-exec entry script for
  timed functions at import time.
* megengine/core/_config.py adds the `benchmark_with_subprocess` option; its
  setter records the option and registers the fork-exec entry script when the
  option is enabled.
* src/core/impl/system.cpp and src/rdnn/impl/algo_chooser.cpp lower several
  warnings to debug level.
* src/rdnn/impl/algo_chooser.cpp skips sub-oprs of a different type, NAIVE
  algorithms, invalid policies and results reported as "out of memory" while
  profiling, falls back to the heuristic when no valid policy is found, and
  only writes non-empty results to the profile cache.
* src/rdnn/impl/profiler.cpp checks the required memory against the memory
  available on the device before profiling and releases all tensors before
  coalescing free memory.

diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py
index 9ef038c78c8eb5c6f7d08479453b15ceb3e269ac..093f22aaed5d4dfaee477dfba6d2459e64ae083e 100644
--- a/imperative/python/megengine/__init__.py
+++ b/imperative/python/megengine/__init__.py
@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
 from .core._imperative_rt.common import (
     get_supported_sm_versions as _get_supported_sm_versions,
 )
-from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
 from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
@@ -118,13 +117,6 @@ def _check_sm_version():
 
 _check_sm_version()
 
-_set_fork_exec_path_for_timed_func(
-    sys.executable,
-    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
-)
-
-del _set_fork_exec_path_for_timed_func
-
 _exit_handlers = []
 
 
diff --git a/imperative/python/megengine/core/_config.py b/imperative/python/megengine/core/_config.py
index e756b572ba0c1fe29ca8eacbdd5952ee6faa0b51..00a740b8614ee67b1dd4dbc485f3224ff11b56eb 100644
--- a/imperative/python/megengine/core/_config.py
+++ b/imperative/python/megengine/core/_config.py
@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
 __compute_mode = "default"
 _benchmark_kernel = False
 _deterministic_kernel = False
+_benchmark_with_subprocess = False
 
 __all__ = [
     "benchmark_kernel",
+    "benchmark_with_subprocess",
     "deterministic_kernel",
     "async_level",
     "disable_memory_forwarding",
@@ -71,6 +73,37 @@ def deterministic_kernel(mod, option: bool):
     _deterministic_kernel = option
 
 
+@property
+def benchmark_with_subprocess(mod):
+    r"""Whether or not to profile algorithms in a subprocess. The default option is false,
+    which means profiling runs in the current process and any profiling timeout is ignored.
+
+    Examples:
+        .. code-block::
+
+           import megengine as mge
+           mge.config.benchmark_with_subprocess = True
+    """
+    return _benchmark_with_subprocess
+
+
+@benchmark_with_subprocess.setter
+def benchmark_with_subprocess(mod, option: bool):
+    global _benchmark_with_subprocess
+    # remember the option so that the getter reports the configured value
+    _benchmark_with_subprocess = option
+    if option:
+        import sys
+        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func
+
+        _set_fork_exec_path_for_timed_func(
+            sys.executable,
+            os.path.join(
+                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
+            ),
+        )
+
+
 @property
 def async_level(mod) -> int:
     r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
diff --git a/src/core/impl/system.cpp b/src/core/impl/system.cpp
index 99d21b4a07dd873e4c704ff571299ed8e1378a98..b1677e7ad51515a9fa1160351d1bc3d5622d09b3 100644
--- a/src/core/impl/system.cpp
+++ b/src/core/impl/system.cpp
@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
             return iter->second.direct_call(param);
 
         if (!m_fork_exec_impl) {
-            mgb_log_warn(
+            mgb_log_debug(
                     "timeout is set, but no fork_exec_impl not given; "
                     "timeout would be ignored");
             return iter->second.direct_call(param);
diff --git a/src/rdnn/impl/algo_chooser.cpp b/src/rdnn/impl/algo_chooser.cpp
index ba85f948fcb54ed26ebbf3222f869cec0ed2261c..7c90f6250f83db24e4af32c261df86af26bb3601 100644
--- a/src/rdnn/impl/algo_chooser.cpp
+++ b/src/rdnn/impl/algo_chooser.cpp
@@ -595,6 +595,10 @@ typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelp
     auto&& search_items = flatten_search_space(*this, circular_deps_checker);
     FOREACH_OPR_TYPE_DISPATCH(search_items, {
         auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
+        // skip different sub opr, for example:
+        // skip matmul algo when profiling convolution
+        if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
+            continue;
         megdnn_opr->param() =
                 Algorithm::deserialize_read_pod(_item.param);
         typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
@@ -609,7 +613,9 @@ typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelp
     // result, retrive_from_cache = true, allow_log = true
     typename AlgoChooser::ImplExecutionPolicy policy;
     construct_execution_policy(selected_strategy, policy);
-    return policy;
+    if (policy.algo.valid())
+        return policy;
+    return choose_by_heuristic(selected_strategy);
     MIDOUT_E
 }
 
@@ -712,7 +718,7 @@ void AlgoChooser::AlgoChooserHelper::construct_execution_policy(
                     ::MegDNNOpr2Typename::name, layouts_str.c_str(),
                     Algorithm::attribute_str(target_attr.first).c_str(),
                     Algorithm::attribute_str(target_attr.second).c_str());
-            mgb_log_warn(
+            mgb_log_debug(
                     "No algo get from cache for %s. This may caused by "
                     "mismatch with model and cache file or imcomplete "
                     "cache file. ex. profiling with version1, but "
@@ -876,6 +882,10 @@ Maybe AlgoChooser::AlgoChooserHelper:
     if (!rst.valid())
         return None;
 
+    // subprocess will return dbl_max when memory limit is not satisfied
+    if (rst.val().time == std::numeric_limits::max())
+        return None;
+
     std::string algo_desc;
     serialize_write_pod(policy.algo, algo_desc);
     return AlgoChooserProfileCache::ResultEntry{
@@ -893,6 +903,7 @@ void AlgoChooser::AlgoChooserHelper::profile(
     auto&& rst = get_profile_result_from_cache(selected_strategy);
     // rst.first.valid means there exists valid algorithms for current opr, just return
     // otherwise need to profile
+    // in order to avoid reprofile in fastrun
     if (rst.first.valid())
         return;
     AlgoChooserProfileCache::Result prof_rst;
@@ -901,6 +912,10 @@ void AlgoChooser::AlgoChooserHelper::profile(
     std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
     double cur_timeout = 0;
 
+    size_t data_size = 0;
+    for (const auto& ly : m_fastrun_layouts)
+        data_size += ly.span().dist_byte();
+
     auto workspace_limit =
             m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
     RealTimer timer;
@@ -925,6 +940,12 @@ void AlgoChooser::AlgoChooserHelper::profile(
         ImplExecutionPolicy policy;
         policy.algo = algo.desc;
 
+        // skip naive algo; attributes cannot be used to identify it, so compare
+        // by name
+        if (algo.desc.name.compare("NAIVE") == 0) {
+            continue;
+        }
+
         //! check negative attribute : skip negative attribute
         auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
         if (palgo->contain_attribute_any(target_attr.second)) {
@@ -938,10 +959,13 @@ void AlgoChooser::AlgoChooserHelper::profile(
 
         //! check workspace limit
         construct_execution_policy(selected_strategy, policy);
-        mgb_assert(
-                policy.algo.valid(),
-                "construct execution policy must success when profiling");
-        if (get_workspace_size_bytes(policy) > workspace_limit) {
+        // this will fail
+        // when constructing a matmul algorithm for a convolution opr
+        if (!policy.algo.valid())
+            continue;
+        size_t workspace_needed = get_workspace_size_bytes(policy);
+        if (data_size + workspace_needed >
+            m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
             continue;
         }
 
@@ -957,7 +981,7 @@ void AlgoChooser::AlgoChooserHelper::profile(
         })
         // megbrain uncatched exception
         MGB_CATCH(..., {
-            mgb_log_warn("caught exception during %s", msg.c_str());
+            mgb_log_debug("caught exception during %s", msg.c_str());
             continue;
         })
         if (!cur_rst.valid()) {
@@ -982,20 +1006,22 @@ void AlgoChooser::AlgoChooserHelper::profile(
             "workspace limite requirement(%zu)",
             ::MegDNNOpr2Typename::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
-    mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
+    // allowed to have empty profile result for current opr
 
     // append some previous profiled results
     if (rst.second.valid())
         prof_rst.insert(
                 prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
-    FixedTensorLayouts incache_layouts = m_incache_layouts;
-    typename Opr::Param origin_param = m_dnn_opr->param();
-    AlgoChooserProfileCache::Key cache_key{
-            incache_layouts.data(), incache_layouts.size(), &origin_param,
-            sizeof(origin_param)};
-
-    AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
-    cache.put(cache_key, prof_rst);
+    if (!prof_rst.empty()) {
+        FixedTensorLayouts incache_layouts = m_incache_layouts;
+        typename Opr::Param origin_param = m_dnn_opr->param();
+        AlgoChooserProfileCache::Key cache_key{
+                incache_layouts.data(), incache_layouts.size(), &origin_param,
+                sizeof(origin_param)};
+
+        AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
+        cache.put(cache_key, prof_rst);
+    }
     MIDOUT_E
 }
 
diff --git a/src/rdnn/impl/profiler.cpp b/src/rdnn/impl/profiler.cpp
index b0b1c2741fcb44401224bd423b04aae26a79d469..accf586efdc2beb5699bccfb317d2789496c4b5f 100644
--- a/src/rdnn/impl/profiler.cpp
+++ b/src/rdnn/impl/profiler.cpp
@@ -245,21 +245,34 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl(
         }
     });
 
-    {
-        // first allocate a whole chunk to avoid memory fragmentation (here we
-        // rely on memory allocator to reuse memory)
-        auto align = cn.get_mem_addr_alignment();
-        size_t tot_size = align;
-        for (int i = 0; i < arity; ++i) {
-            tot_size += layouts[i].span().high_byte + align;
-        }
-        for (const auto& layout : preprocessed_layout) {
-            tot_size += layout.span().high_byte + align;
-        }
-        tot_size += param.workspace;
-        DeviceTensorStorage storage{cn};
-        storage.ensure_size(tot_size);
+    megdnn::Algorithm* algo =
+            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
+    mgb_assert(algo);
+
+#if !MGB_BUILD_SLIM_SERVING
+#if MGB_CUDA || MGB_ROCM
+    // if tot_size > workspace_limit, then skip current algo, return double_max
+    // this check is needed because when profiling algo with subprocess,
+    // child process would occupy some cuda memory for initialization
+    // this check is more accurate than the previous one
+    size_t workspace_limit =
+            std::max(cn.get_free_mem(), cn.get_max_block_size_available());
+    auto align = cn.get_mem_addr_alignment();
+    size_t tot_size = align;
+    for (int i = 0; i < arity; ++i) {
+        tot_size += layouts[i].span().high_byte + align;
+    }
+    for (const auto& layout : preprocessed_layout) {
+        tot_size += layout.span().high_byte + align;
+    }
+    tot_size += param.workspace;
+    if (tot_size > workspace_limit) {
+        mgb_log_debug(
+                "current memory is not enouugh when profiling algo %s\n", algo->name());
+        return TResult::from_pod(Result{std::numeric_limits::max()});
     }
+#endif
+#endif
 
     // allocate input and output memory
     std::array inp_val;
@@ -334,20 +347,17 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl(
     });
     ev_end->record();
 
-    megdnn::Algorithm* algo =
-            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
-    mgb_assert(algo);
     double next_report_time = 0.5;
     while (!ev_end->finished()) {
         if (timer.get_secs() >= next_report_time) {
 #if MGB_ENABLE_GETENV
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs"
+                    "profiling algo %s already took %.3f/%.3f secs"
                     " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                     algo->name(), timer.get_secs(), param.actual_timeout);
 #else
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
+                    "profiling algo %s already took %.3f/%.3f secs", algo->name(),
                     timer.get_secs(), param.actual_timeout);
 #endif
             next_report_time = timer.get_secs() + 1;
@@ -357,6 +367,19 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl(
         std::this_thread::sleep_for(1000us);
 #endif
     }
+
+    DeviceTensorStorage storage;
+    for (int i = 0; i < arity_in; ++i) {
+        inp_val[i].reset(storage, TensorLayout{});
+    }
+    for (int i = 0; i < arity_out; ++i) {
+        out_val[i].reset(storage, TensorLayout{});
+    }
+    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
+        flt_val[i].reset(storage, TensorLayout{});
+    }
+    mdn_workspace = megdnn::Workspace{};
+    workspace.reset(storage, TensorLayout{});
     // release all free blocks owned by child process,
     // in order to avoid main process running out of memory
     cn.try_coalesce_all_free_memory();