/** * \file src/opr/impl/search_policy/profile.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ #include "megbrain/opr/search_policy/profiler.h" #include "../internal/invoke.h" //! TODO: here has to be know some megdnn::opr when there is produced midout.h //! fix it if there is another graceful way. #include "megdnn/oprs.h" #include "midout.h" MIDOUT_DECL(megbrain_opr_profile) #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) { #define MIDOUT_E \ } \ MIDOUT_END(); namespace mgb { namespace opr { #define APPLY(statement, ...) \ mgb::apply([&](const auto&... args) { return statement; }, \ std::tuple_cat(__VA_ARGS__)) template const double TimedProfiler::timeout_setting = TimedProfiler::init_timeout_setting(); template double TimedProfiler::init_timeout_setting() { #if MGB_ENABLE_FASTRUN sys::TimedFuncInvoker::ins().register_func( AlgoChooserFuncId::ID, &TimedProfiler::prof_impl, &TimedProfiler::prof_init_device); auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT"); if (to_set) return std::stod(to_set); #endif return 0; } #define APPLY(statement, ...) \ mgb::apply([&](const auto&... args) { return statement; }, \ std::tuple_cat(__VA_ARGS__)) template typename TimedProfiler::TResult TimedProfiler::prof_impl( const TParam& raw_param) { MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl"))) auto&& param = raw_param.as_single_pod(); CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc); auto megdnn_opr = intl::create_megdnn_opr(cn); std::array layouts; auto from_enum = [&](DTypeEnum enumv) -> DType { switch (enumv) { #define cb(_dt) \ case DTypeTrait<_dt>::enumv: \ return _dt(1.0f, static_cast(0)) cb(dtype::Quantized8Asymm); #undef cb #define cb(_dt) \ case DTypeTrait<_dt>::enumv: \ return _dt(1.0f) cb(dtype::QuantizedS8); cb(dtype::QuantizedS16); cb(dtype::QuantizedS32); default: return DType::from_enum(enumv); #undef cb } }; for (int i = 0; i < arity; ++i) { layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])}; } megdnn_opr->param() = param.opr_param; { typename Opr::Algorithm* algo = nullptr; for (auto i : APPLY(megdnn_opr->get_all_algorithms(args...), layouts)) { if (!strcmp(i->name(), param.algo_name)) { algo = i; break; } } mgb_assert(algo, "algorithm %s not found", param.algo_name); megdnn_opr->execution_policy() = {algo}; } // Allocate preprocessed weight buffers. TensorLayoutArray preprocessed_layout; if_constexpr()>([&](auto _) { if (param.allow_weight_preprocess) { preprocessed_layout = APPLY( _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts); } }); { // first allocate a whole chunk to avoid memory fragmentation (here we // rely on memory allocator to reuse memory) auto align = cn.get_mem_addr_alignment(); size_t tot_size = align; for (int i = 0; i < arity; ++i) { tot_size += layouts[i].span().high_byte + align; } for (const auto& layout : preprocessed_layout) { tot_size += layout.span().high_byte + align; } tot_size += param.workspace; DeviceTensorStorage storage{cn}; storage.ensure_size(tot_size); } // allocate input and output memory std::array inp_val; std::array out_val; DeviceTensorND workspace; for (int i = 0; i < arity_in; ++i) { inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); } for (int i = 0; i < arity_out; ++i) { out_val[i] .comp_node(cn) .dtype(layouts[arity_in + i].dtype) .resize(layouts[arity_in + i]); } megdnn::Workspace mdn_workspace; // allocate workspace if (param.workspace) { workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); mdn_workspace.size = param.workspace; mdn_workspace.raw_ptr = workspace.raw_ptr(); } // allocate storage for preprocessed filter SmallVector flt_val(preprocessed_layout.size()); for (size_t i = 0; i < preprocessed_layout.size(); i++) { flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype, preprocessed_layout[i].format}; } for (int i = 0; i < arity_in; ++i) { fill_zero_dev_tensor(inp_val[i]); } PreprocessFilter prep_flt; if_constexpr()>([&](auto _) { if (!preprocessed_layout.empty()) { auto&& pf = _(prep_flt); pf.algorithm_id = nullptr; pf.tensors.resize(flt_val.size()); for (size_t i = 0; i < flt_val.size(); i++) { pf.tensors[i] = flt_val[i].as_megdnn(); } APPLY(_(megdnn_opr)->exec_preprocess(args..., &pf, mdn_workspace), std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), array_skip<2>(layouts)); } }); RealTimer timer; auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER), ev_end = cn.create_event(CompNode::Event::NEED_TIMER); ev_start->record(); if_constexpr()>( [&](auto _) { auto&& opr = _(megdnn_opr); PreprocessFilter* pf = preprocessed_layout.empty() ? nullptr : &prep_flt; APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, out_val); }, /* else */ [&](auto _) { APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, out_val); }); ev_end->record(); double next_report_time = 0.5; while (!ev_end->finished()) { if (timer.get_secs() >= next_report_time) { mgb_log_warn( "profiling conv algo %s already took %.3f/%.3f secs" " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", param.algo_name, timer.get_secs(), param.actual_timeout); next_report_time = timer.get_secs() + 1; } using namespace std::literals; std::this_thread::sleep_for(1000us); } mgb_assert(ev_start->finished()); return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)}); MIDOUT_E }; template Maybe::Result> TimedProfiler::profile( const Param& param, double& timeout) { mgb_assert(timeout >= 0); if (!timeout) { timeout = timeout_setting; } else if (timeout_setting) { timeout = std::min(timeout, timeout_setting); } param.actual_timeout = timeout ? timeout : std::numeric_limits::infinity(); auto res = sys::TimedFuncInvoker::ins().invoke( AlgoChooserFuncId::ID, TParam::from_pod(const_cast(param)), timeout); if (res.valid()) return res.val().template as_single_pod(); return None; } template void TimedProfiler::prof_init_device(const TParam& raw_param) { MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device"))) auto&& param = raw_param.as_single_pod(); CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc); // wait for cuda init, so its time does not get accounted in timeout cn.sync(); MIDOUT_E } #define INST(Opr) \ template const double TimedProfiler::timeout_setting; \ template double TimedProfiler::init_timeout_setting(); \ template typename TimedProfiler::TResult \ TimedProfiler::prof_impl(const TParam& raw_param); \ template Maybe::Result> \ TimedProfiler::profile(const Param& param, double& timeout); \ template void TimedProfiler::prof_init_device( \ const TParam& raw_param); MGB_FOREACH_FASTRUN_OPR(INST) #undef INST } // namespace opr } // namespace mgb // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}