From 12a3ef8d019cfc63dfefe9437990859e6ade5414 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 9 Mar 2022 19:07:19 +0800 Subject: [PATCH] refactor(fastrun): decouple fastrun from computing graph GitOrigin-RevId: 27abd222950e36287784e58ad56d6819e9c4cb69 --- src/CMakeLists.txt | 4 +- .../include/megbrain/utils}/invoke.h | 0 .../opr_format_modifier.cpp | 2 +- src/gopt/impl/inference.cpp | 2 +- src/opr/impl/blas.cpp | 1 - src/opr/impl/dnn/convolution.cpp | 2 +- src/opr/impl/internal/megdnn_opr_wrapper.cpp | 48 - src/opr/impl/search_policy/algo_chooser.cpp | 1130 +---------------- src/opr/impl/search_policy/profiler.cpp | 413 ------ .../workspace_need_limit_getter.inl | 4 +- .../opr/internal/megdnn_opr_wrapper.h | 38 +- .../megbrain/opr/search_policy/algo_chooser.h | 147 +-- .../megbrain/opr/search_policy/profiler.h | 165 --- 13 files changed, 43 insertions(+), 1913 deletions(-) rename src/{opr/impl/internal => core/include/megbrain/utils}/invoke.h (100%) delete mode 100644 src/opr/impl/search_policy/profiler.cpp delete mode 100644 src/opr/include/megbrain/opr/search_policy/profiler.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 313118b9e..41e97c70d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ file( opr/impl/nvof/*.cpp plugin/impl/*.cpp serialization/impl/*.cpp + rdnn/impl/*.cpp core/impl/*.inl gopt/impl/*.inl opr/impl/*.inl @@ -53,7 +54,8 @@ set(MGB_INC ${CMAKE_CURRENT_LIST_DIR}/gopt/include ${CMAKE_CURRENT_LIST_DIR}/opr/include ${CMAKE_CURRENT_LIST_DIR}/plugin/include - ${CMAKE_CURRENT_LIST_DIR}/serialization/include) + ${CMAKE_CURRENT_LIST_DIR}/serialization/include + ${CMAKE_CURRENT_LIST_DIR}/rdnn/include) if(MGE_WITH_JIT) list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include) diff --git a/src/opr/impl/internal/invoke.h b/src/core/include/megbrain/utils/invoke.h similarity index 100% rename from src/opr/impl/internal/invoke.h rename to src/core/include/megbrain/utils/invoke.h diff --git a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp index 84d82e450..5f9e27228 100644 --- a/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp +++ b/src/gopt/impl/global_layout_transform/opr_format_modifier.cpp @@ -183,7 +183,7 @@ struct OprWithPolicyMaker MakeOprWithPolicyCaller4, megdnn::param::BatchConvBias> {}; -#include "../../opr/impl/internal/invoke.h" +#include "megbrain/utils/invoke.h" template struct MultiAlgoOprTrait; diff --git a/src/gopt/impl/inference.cpp b/src/gopt/impl/inference.cpp index 2d0557b00..90365176f 100644 --- a/src/gopt/impl/inference.cpp +++ b/src/gopt/impl/inference.cpp @@ -23,8 +23,8 @@ #include "megbrain/opr/imgproc.h" #include "megbrain/opr/misc.h" #include "megbrain/opr/nn_int.h" +#include "megbrain/opr/search_policy/algo_chooser.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h" -#include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/tensor_gen.h" #include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/utility.h" diff --git a/src/opr/impl/blas.cpp b/src/opr/impl/blas.cpp index c8445415f..b641d4429 100644 --- a/src/opr/impl/blas.cpp +++ b/src/opr/impl/blas.cpp @@ -19,7 +19,6 @@ #include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/search_policy/algo_chooser.h" -#include "megbrain/opr/search_policy/profiler.h" #include "./internal/megdnn_opr_wrapper.inl" #include "./search_policy/workspace_need_limit_getter.inl" diff --git a/src/opr/impl/dnn/convolution.cpp b/src/opr/impl/dnn/convolution.cpp index 94907db6b..711d7a78b 100644 --- a/src/opr/impl/dnn/convolution.cpp +++ b/src/opr/impl/dnn/convolution.cpp @@ -18,11 +18,11 @@ #include "megbrain/graph/grad_impl.h" #include "megbrain/system.h" #include "megbrain/utils/hash_ct.h" +#include "megbrain/utils/invoke.h" #include "megbrain/utils/timer.h" #include "megdnn/oprs/utils.h" -#include "../internal/invoke.h" #include "../internal/megdnn_opr_wrapper.inl" #include "../search_policy/workspace_need_limit_getter.inl" diff --git a/src/opr/impl/internal/megdnn_opr_wrapper.cpp b/src/opr/impl/internal/megdnn_opr_wrapper.cpp index a39499a31..ca179599b 100644 --- a/src/opr/impl/internal/megdnn_opr_wrapper.cpp +++ b/src/opr/impl/internal/megdnn_opr_wrapper.cpp @@ -25,26 +25,6 @@ using namespace mixin; /* ================== global functions ================== */ namespace { -template -class MegDNNGlobalOprContainer final : public UserDataContainer::UserData { - MGB_TYPEINFO_OBJ_DECL; - - std::shared_ptr m_megdnn_handle; - std::unique_ptr m_opr; - -public: - MegDNNGlobalOprContainer(CompNode cn) - : m_megdnn_handle{get_megdnn_handle_shared(cn)}, - m_opr{m_megdnn_handle->create_operator()} { - mgb_assert(m_opr->is_thread_safe()); - } - - Opr* get() const { return m_opr.get(); } -}; - -template -MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer); - class TempStorageContainer final : public UserDataContainer::UserData { MGB_TYPEINFO_OBJ_DECL; @@ -55,34 +35,6 @@ public: MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer); } // anonymous namespace -std::shared_ptr intl::get_megdnn_handle_shared(CompNode comp_node) { - auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)); - return {handle.shared_from_this(), handle.handle()}; -} - -megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) { - return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle(); -} - -template -Opr* intl::get_megdnn_global_opr(CompNode comp_node) { - using T = MegDNNGlobalOprContainer; - auto maker = [comp_node]() { return std::make_shared(comp_node); }; - return CompNodeEnv::from_comp_node(comp_node).get_user_data(maker).get(); -} - -namespace mgb { -namespace opr { -namespace intl { -#define INST(o) template o* get_megdnn_global_opr(CompNode) -INST(megdnn::AddUpdate); -INST(megdnn::Relayout); -INST(megdnn::Checksum); -#undef INST -} // namespace intl -} // namespace opr -} // namespace mgb - DeviceTensorStorage& intl::get_temp_storage(ComputingGraph& graph, CompNode comp_node) { auto container = graph.options().user_data.get_user_data_or_create(); diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp index 49b1b71c3..5ee30dd66 100644 --- a/src/opr/impl/search_policy/algo_chooser.cpp +++ b/src/opr/impl/search_policy/algo_chooser.cpp @@ -10,1115 +10,25 @@ * implied. */ -#include "megbrain/opr/search_policy/algo_chooser.h" #include #include + #include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/internal/megdnn_opr_wrapper.h" +#include "megbrain/opr/search_policy/algo_chooser.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h" -#include "megbrain/opr/search_policy/profiler.h" +#include "megbrain/utils/invoke.h" +#include "megdnn/heuristic_cache.h" -#include "../internal/invoke.h" #include "../internal/megdnn_opr_wrapper.inl" #include "./workspace_need_limit_getter.inl" -//! TODO: here has to be know some megdnn::opr when there is produced midout.h -//! fix it if there is another graceful way. -#include "megdnn/heuristic_cache.h" -#include "megdnn/opr_param_defs.h" -#include "megdnn/oprs.h" -#include "megdnn/oprs/base.h" -#include "midout.h" -MIDOUT_DECL(megbrain_opr_algo_chooser) -#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { -#define MIDOUT_E \ - } \ - MIDOUT_END(); - using mgb::opr::intl::WorkspaceLimitGetter; using namespace megdnn; using namespace mgb; -#define APPLY(statement, ...) \ - mgb::apply( \ - [&](const auto&... args) { return statement; }, \ - std::tuple_cat(__VA_ARGS__)) - -// timeout delta to be added with fastest known algorithm for new algos -constexpr double TIMEOUT_TOLERANCE = 2; - -#define CACHE_KEY_VERSION "v5" - -namespace { -template -std::string profile_name(Opr* opr) { - std::string ret = std::string(MegDNNOpr2MGBOpr::MGBOpr::typeinfo()->name) + - CACHE_KEY_VERSION; - ret.append(opr->get_algorithm_set_name()); - return ret; -} - -template -std::string format_fixlayouts( - const typename opr::AlgoChooser::FixedTensorLayouts& layouts, - size_t arity_in, size_t arity_out, const std::string& delimiter = " -> ") { - std::string ret; - if (arity_in) { - ret.append("("); - for (size_t i = 0; i < arity_in; ++i) { - if (i) { - ret.append(", "); - } - ret.append(layouts[i].to_string() + " "); - } - ret.append(")"); - } - if (arity_in && arity_out) { - ret.append(delimiter); - } - if (arity_out) { - ret.append("("); - for (size_t i = 0; i < arity_out; ++i) { - if (i) { - ret.append(", "); - } - ret.append(layouts[i + arity_in].to_string() + " "); - } - ret.append(")"); - } - return ret; -} - -/** - * \brief Check if the sub opr list has circular dependence. - */ -class CircularDepsChecker { - struct SearchItemStorage { - std::string data_hold; - size_t hash = 0; - - SearchItemStorage(const Algorithm::SearchItem& item) { - Algorithm::serialize_write_pod(item.opr_type, data_hold); - for (auto&& layout : item.layouts) { - data_hold += layout.serialize(); - } - data_hold += item.param; - } - - SearchItemStorage& init_hash() { - hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), 20201225); - return *this; - } - - bool operator==(const SearchItemStorage& rhs) const { - return data_hold == rhs.data_hold; - } - - struct Hash { - size_t operator()(const SearchItemStorage& s) const { return s.hash; } - }; - }; - std::unordered_set m_set; - -public: - void put(const megdnn::Algorithm::SearchItem& key) { - SearchItemStorage key_storage(key); - key_storage.init_hash(); - mgb_assert( - m_set.find(key_storage) == m_set.end(), - "Circular dependency during flatten search space"); - auto ret = m_set.insert(std::move(key_storage)); - mgb_assert(ret.second); - } - void remove(const megdnn::Algorithm::SearchItem& key) { - SearchItemStorage key_storage(key); - key_storage.init_hash(); - auto&& iter = m_set.find(key_storage); - mgb_assert(iter != m_set.end()); - m_set.erase(iter); - } -}; - -///////////////// OprTypeTrait ///////////////////////////// -template -struct OprFromOprTypeTrait; - -template -struct OprTypeFromOprTrait; - -#define cb(_opr_type, _opr) \ - template <> \ - struct OprFromOprTypeTrait { \ - using Opr = megdnn::_opr; \ - }; \ - template <> \ - struct OprTypeFromOprTrait { \ - constexpr static megdnn::Algorithm::OprType opr_type = \ - megdnn::Algorithm::OprType::_opr_type; \ - } - -cb(MATRIX_MUL_FORWARD, MatrixMulForward); -cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward); -cb(CONVOLUTION_FORWARD, ConvolutionForward); -cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); -cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); -cb(CONVOLUTION3D_FORWARD, Convolution3DForward); -cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); -cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); -cb(LOCAL_SHARE_FORWARD, LocalShareForward); -cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); -cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); -cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); -cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); -cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); -cb(BATCH_CONV_FORWARD, BatchConvBiasForward); -cb(CONVBIAS_FORWARD, ConvBiasForward); -cb(POOLING_FORWARD, PoolingForward); -cb(POOLING_BACKWARD, PoolingBackward); - -#undef cb - -// clang-format off -#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ - cb(MATRIX_MUL_FORWARD, stmt) \ - cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \ - cb(CONVOLUTION_FORWARD, stmt) \ - cb(CONVOLUTION_BACKWARD_DATA, stmt) \ - cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ - cb(CONVOLUTION3D_FORWARD, stmt) \ - cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ - cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ - cb(LOCAL_SHARE_FORWARD, stmt) \ - cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ - cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ - cb(DEFORMABLE_CONV_FORWARD, stmt) \ - cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ - cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ - cb(BATCH_CONV_FORWARD, stmt) \ - cb(CONVBIAS_FORWARD, stmt) \ - cb(POOLING_FORWARD, stmt) \ - cb(POOLING_BACKWARD, stmt) -// clang-format on - -#define _OPR_TYPE_CASE(_opr_type, _stmt) \ - case Algorithm::OprType::_opr_type: { \ - using _Opr = typename OprFromOprTypeTrait::Opr; \ - _stmt; \ - break; \ - } - -#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ - for (size_t _item_idx = 0; _item_idx < _search_items.size(); _item_idx++) { \ - auto&& _item = _search_items[_item_idx]; \ - switch (_item.opr_type) { \ - FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ - default: \ - mgb_throw(MegBrainError, "unknown opr_type"); \ - } \ - } - -template -TensorLayoutArray to_layout_array( - const typename opr::AlgoChooser::FixedTensorLayouts& layouts) { - TensorLayoutArray ret; - for (auto&& layout : layouts) { - ret.push_back(layout); - } - return ret; -} - -template -typename opr::AlgoChooser::FixedTensorLayouts to_fixed_layouts( - const TensorLayoutArray& layouts) { - typename opr::AlgoChooser::FixedTensorLayouts ret; - mgb_assert(ret.size() == layouts.size()); - size_t idx = 0; - for (auto&& layout : layouts) { - ret[idx++] = layout; - } - return ret; -} - -/** - * flatten search space in postorder traversal - * The subopr search construct a search tree - * - * A - * / \ - * B1B2 C - * / \ - * D1D2D3 E - * We use postorder traverse the search tree. - * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A - */ -template -std::vector flatten_search_space( - const typename opr::AlgoChooser::AlgoChooserHelper& helper, - CircularDepsChecker& checker) { - auto&& search_item = megdnn::Algorithm::SearchItem{ - OprTypeFromOprTrait::opr_type, helper.param(), - to_layout_array(helper.fastrun_layouts())}; - checker.put(search_item); - std::vector ret; - for (auto algo_info : helper.get_all_candidates()) { - megdnn::Algorithm* algo = helper.get_algorithm_from_desc(algo_info.desc); - mgb_assert(algo, "Unknown algo description"); - std::vector&& sub_items = algo->get_subopr_list( - to_layout_array(helper.fastrun_layouts()), helper.megdnn_opr()); - - FOREACH_OPR_TYPE_DISPATCH(sub_items, { - auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(helper.comp_node()); - megdnn_opr->param() = - Algorithm::deserialize_read_pod(_item.param); - typename opr::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( - to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), - _item.param, helper.mgb_opr(), helper.comp_node(), - helper.execution_policy(), helper.allow_weight_preprocess()); - auto space = flatten_search_space<_Opr>(sub_helper, checker); - ret.insert(ret.end(), space.begin(), space.end()); - }); - } - ret.push_back(search_item); - checker.remove(search_item); - return ret; -} - -//! serialize a algo's desc to string. format is -//! handle_type|algo_type|size_of_param|size_of_name|string_of_param|string_of_name -static void serialize_write_pod(const Algorithm::Info::Desc& val, std::string& result) { - megdnn::Algorithm::serialize_write_pod(val.handle_type, result); - megdnn::Algorithm::serialize_write_pod(val.type, result); - uint32_t param_size = val.param.size(); - uint32_t name_size = val.name.size(); - megdnn::Algorithm::serialize_write_pod(param_size, result); - megdnn::Algorithm::serialize_write_pod(name_size, result); - megdnn::Algorithm::serialize_write_pod(val.param, result); - megdnn::Algorithm::serialize_write_pod(val.name, result); -} - -static Algorithm::Info::Desc deserialize_read_pod( - const std::string& data, size_t offset = 0) { - Algorithm::Info::Desc ret; -#define cb(_val, _type) \ - _val = megdnn::Algorithm::deserialize_read_pod<_type>(data.data(), offset); \ - offset += sizeof(_val) - - cb(ret.handle_type, megdnn::Handle::HandleType); - cb(ret.type, uint32_t); - - uint32_t param_size = 0; - uint32_t name_size = 0; - cb(param_size, uint32_t); - cb(name_size, uint32_t); - - if (param_size > 0) { - ret.param = megdnn::Algorithm::deserialize_read_pod(data, offset, param_size); - offset += param_size; - } - if (name_size > 0) { - ret.name = megdnn::Algorithm::deserialize_read_pod(data, offset, name_size); - offset += name_size; - } - return ret; -} - -} // namespace - namespace mgb { namespace opr { -template -class LayoutsModifier { - using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; - -public: - static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {} - -private: - //! index of batch in tensor, 3 for CHWN4 e.g. - static size_t index_of_batch(const typename Opr::Param&) { return 0; } - - //! indices contain batch in inputs and outputs, src(0) dst(2) for conv e.g. - static std::vector sm_indices_contain_batch; -}; -template -std::vector LayoutsModifier::sm_indices_contain_batch = {}; - -#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ - template <> \ - class LayoutsModifier { \ - public: \ - using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; \ - static void on( \ - FixedTensorLayouts& layouts, const opr::Param& param, \ - size_t new_batch_size) { \ - size_t batch_index = index_of_batch(param); \ - for (size_t index : sm_indices_contain_batch) { \ - layouts.at(index)[batch_index] = new_batch_size; \ - } \ - } \ - \ - private: \ - static size_t index_of_batch(const opr::Param&) { return 0; } \ - static std::vector sm_indices_contain_batch; \ - }; \ - std::vector LayoutsModifier::sm_indices_contain_batch = idxs; - -DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::Convolution3DForward, (std::initializer_list{0, 2})) -DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::Convolution3DBackwardData, (std::initializer_list{1, 2})) -DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::Convolution3DBackwardFilter, (std::initializer_list{0, 1})) -DEFAULT_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::BatchedMatrixMul, (std::initializer_list{0, 1, 2})) -#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST - -#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \ - template <> \ - class LayoutsModifier { \ - public: \ - using FixedTensorLayouts = typename AlgoChooser::FixedTensorLayouts; \ - static void on( \ - FixedTensorLayouts& layouts, const opr::Param& param, \ - size_t new_batch_size) { \ - size_t batch_index = index_of_batch(param); \ - for (size_t index : sm_indices_contain_batch) { \ - layouts.at(index)[batch_index] = new_batch_size; \ - } \ - } \ - \ - private: \ - static size_t index_of_batch(const opr::Param& param) { \ - if (param.format == opr::Param::Format::CHWN4) { \ - return 3; \ - } \ - return 0; \ - } \ - static std::vector sm_indices_contain_batch; \ - }; \ - std::vector LayoutsModifier::sm_indices_contain_batch = idxs; - -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::ConvolutionForward, (std::initializer_list{0, 2})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::ConvolutionBackwardData, (std::initializer_list{1, 2})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::ConvolutionBackwardFilter, (std::initializer_list{0, 1})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::LocalShareForward, (std::initializer_list{0, 2})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::LocalShareBackwardData, (std::initializer_list{1, 2})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::LocalShareBackwardFilter, (std::initializer_list{0, 1})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::DeformableConvForward, (std::initializer_list{0, 2, 3, 4})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::DeformableConvBackwardData, - (std::initializer_list{0, 2, 3, 4, 5, 6, 7})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::DeformableConvBackwardFilter, - (std::initializer_list{0, 1, 2, 3})) -CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST( - megdnn::BatchConvBiasForward, (std::initializer_list{0, 1, 2, 3, 4})) -#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST - -template <> -class LayoutsModifier { -public: - using FixedTensorLayouts = - typename AlgoChooser::FixedTensorLayouts; - static void on( - FixedTensorLayouts& layouts, const megdnn::ConvBiasForward::Param& param, - size_t new_batch_size) { - size_t batch_index = index_of_batch(param); - for (size_t index : sm_indices_contain_batch) { - layouts.at(index)[batch_index] = new_batch_size; - } - for (size_t index : sm_indices_contain_batch_broadcast) { - if (!check_bias_share_in_channel(layouts.at(index), param.format)) { - layouts.at(index)[batch_index] = new_batch_size; - } - } - } - -private: - static std::vector sm_indices_contain_batch; - static std::vector sm_indices_contain_batch_broadcast; - static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) { - if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) { - return 3; - } - return 0; - } -}; -std::vector LayoutsModifier::sm_indices_contain_batch = - {0, 3, 4}; -std::vector - LayoutsModifier::sm_indices_contain_batch_broadcast = { - 2}; - -template <> -class LayoutsModifier { -public: - using FixedTensorLayouts = - typename AlgoChooser::FixedTensorLayouts; - static void on( - FixedTensorLayouts& layouts, const megdnn::MatrixMul::Param& param, - size_t new_batch_size) { - //! Because we do not know whether the batch size is in the dimension m - //! or the dimension n, we just ignore both m and n here. - // FIXME Find a way to make mgb obtain batch size information from R or - // automatically - layouts.at(2)[0] = new_batch_size; - layouts.at(2)[1] = new_batch_size; - if (param.transposeA) { - layouts.at(0)[1] = new_batch_size; - } else { - layouts.at(0)[0] = new_batch_size; - } - if (param.transposeB) { - layouts.at(1)[0] = new_batch_size; - } else { - layouts.at(1)[1] = new_batch_size; - } - } -}; - -///////////////////////////// AlgoChooserHelper ////////////////////////// -template -AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( - const FixedTensorLayouts& layouts, Opr* megdnn_opr, - const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, - const CompNode& cn, const megdnn::param::ExecutionPolicy& execution_policy, - bool allow_weight_preprocess) - : m_fastrun_layouts{layouts}, - m_incache_layouts{layouts}, - m_dnn_opr{megdnn_opr}, - m_param{param_str}, - m_base_mgb_opr{mgb_opr}, - m_cn{cn}, - m_execution_policy{execution_policy}, - m_allow_weight_preprocess{allow_weight_preprocess} { - auto fastrun_batch_size = - owner_graph()->options().fast_run_config.shared_batch_size; - - if (fastrun_batch_size) { - LayoutsModifier::on(m_incache_layouts, m_dnn_opr->param(), 0); - LayoutsModifier::on( - m_fastrun_layouts, m_dnn_opr->param(), fastrun_batch_size); - } - - mgb_assert(m_fastrun_layouts.size() == layouts.size()); - - static_assert( - std::tuple_size::value == 2 || - std::tuple_size::value == 3 || - std::tuple_size::value == 4 || - std::tuple_size::value == 5 || - std::tuple_size::value == 8, - "Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes " - "arity = 3 , 5 or 8 (for deformable conv)"); -} - -template -typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelper:: - choose_by_heuristic(const ExecutionStrategy& selected_strategy) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_heuristic"))) - ImplExecutionPolicy policy; - auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - owner_graph(), m_cn, m_execution_policy.workspace_limit); - auto attr = extract_algo_attribute(selected_strategy); - policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic( - args..., workspace_limit, attr.first, attr.second), - m_fastrun_layouts) - .desc; - - Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); - mgb_assert(algo, "Unknown algo description"); - std::vector&& sub_items = - algo->get_subopr_list(to_layout_array(m_fastrun_layouts), m_dnn_opr); - - FOREACH_OPR_TYPE_DISPATCH(sub_items, { - auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); - megdnn_opr->param() = - Algorithm::deserialize_read_pod(_item.param); - typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( - to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, - m_base_mgb_opr, m_cn, m_execution_policy, m_allow_weight_preprocess); - policy.sub_policy.push_back(sub_helper.choose_by_heuristic(selected_strategy)); - }); - - return policy; - MIDOUT_E -} - -template -typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelper:: - choose_by_profile( - const ExecutionStrategy& selected_strategy, bool enable_update) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_profile"))) - if (owner_graph()->options().no_profiling_on_shape_change) { - auto policy = m_dnn_opr->execution_policy(); - if (policy.algo.valid()) { - return policy; - } - if (is_matmul()) { - mgb_log_warn( - "choose algo by heuristic, which may cause performance " - "regression."); - return choose_by_heuristic(selected_strategy); - } - } - - typename AlgoChooser::ImplExecutionPolicy tmp_policy; - bool retrive_from_cache = true; - bool allow_log = false; - construct_execution_policy( - selected_strategy, tmp_policy, retrive_from_cache, allow_log); - if (tmp_policy.algo.valid()) { - // return policy when contruct successed - return tmp_policy; - } - - if (enable_update) { - CircularDepsChecker circular_deps_checker; - auto&& search_items = flatten_search_space(*this, circular_deps_checker); - FOREACH_OPR_TYPE_DISPATCH(search_items, { - auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); - megdnn_opr->param() = - Algorithm::deserialize_read_pod(_item.param); - typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( - to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), - _item.param, m_base_mgb_opr, m_cn, m_execution_policy, - m_allow_weight_preprocess); - sub_helper.profile(selected_strategy); - }); - } - - typename AlgoChooser::ImplExecutionPolicy policy; - construct_execution_policy(selected_strategy, policy); - return policy; - MIDOUT_E -} - -template -std::pair< - typename AlgoChooser::ImplAlgoDesc, Maybe> -AlgoChooser::AlgoChooserHelper::get_profile_result_from_cache( - const ExecutionStrategy& selected_strategy) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_profile_result_from_cache"))) - AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); - - typename Opr::Param origin_param = m_dnn_opr->param(); - AlgoChooserProfileCache::Key cache_key{ - m_incache_layouts.data(), m_incache_layouts.size(), &origin_param, - sizeof(origin_param)}; - auto&& rst = cache.get(cache_key); - if (!rst.valid()) - return {{}, rst}; - - auto&& prof = rst.val(); - if (prof.empty()) - return {{}, rst}; - - size_t workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - owner_graph(), m_cn, m_execution_policy.workspace_limit); - auto target_attr = extract_algo_attribute(selected_strategy); - bool skip_by_negative = false; - bool skip_by_workspace = false; - for (auto&& i : prof) { - auto attr_of_algo = static_cast(i.attribute); - bool contain_attr_all_positive = - (target_attr.first == (attr_of_algo & target_attr.first)); - bool contain_attr_any_negative = - static_cast(attr_of_algo & target_attr.second); - if (contain_attr_all_positive) { - if (!contain_attr_any_negative) { - if (i.workspace <= workspace_limit) { - Algorithm::Info::Desc algo_desc = deserialize_read_pod(i.algo); - return {algo_desc, rst}; - } - skip_by_workspace = true; - } else { - skip_by_negative = true; - } - } - } - if (skip_by_workspace) - return {}; - - std::string layouts_str = - format_fixlayouts(m_fastrun_layouts, arity_in, arity_out); - if (skip_by_negative) { - mgb_log_error( - "opr: %s, layouts: %s, No usable algo. There are available " - "algos match " - "positive strategy(%s), but filtered by negative stategy(%s).", - m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), - Algorithm::attribute_str(target_attr.first).c_str(), - Algorithm::attribute_str(target_attr.second).c_str()); - } else { - mgb_log_error( - "opr: %s, layouts: %s, No usable algo. algos read from cache " - "could not " - "satisfy positive strategy(%s)", - m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), - Algorithm::attribute_str(target_attr.first).c_str()); - } - - mgb_trap(); - MIDOUT_E -} - -template -void AlgoChooser::AlgoChooserHelper::construct_execution_policy( - const ExecutionStrategy& selected_strategy, - typename AlgoChooser::ImplExecutionPolicy& policy, bool retrive_from_cache, - bool allow_log) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_execution_policy"))) - if (!policy.algo.valid()) { - if (retrive_from_cache) { - policy.algo = get_profile_result_from_cache(selected_strategy).first; - if (!policy.algo.valid()) { - if (allow_log) { - auto target_attr = extract_algo_attribute(selected_strategy); - std::string layouts_str = format_fixlayouts( - m_fastrun_layouts, arity_in, arity_out); - std::string msg = ssprintf( - "(opr : %s, layouts %s, with attribute(%s) and " - "without attribute(%s)", - m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), - Algorithm::attribute_str(target_attr.first).c_str(), - Algorithm::attribute_str(target_attr.second).c_str()); - mgb_log_warn( - "No algo get from cache for %s. This may caused by " - "mismatch with model and cache file or imcomplete " - "cache file. ex. profiling with version1, but " - "inferencing on version2 or profiling modelA but " - "inferencing modelB", - msg.c_str()); - } - return; - } - } else { - auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - owner_graph(), m_cn, m_execution_policy.workspace_limit); - - auto attr = extract_algo_attribute(selected_strategy); - policy.algo = - APPLY(m_dnn_opr->get_algorithm_info_heuristic( - args..., workspace_limit, attr.first, attr.second), - m_fastrun_layouts) - .desc; - mgb_assert( - policy.algo.valid(), - "No algo found from heuristic with strategy %u and " - "workspace limit %zu", - static_cast(selected_strategy), workspace_limit); - } - } - - Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo); - mgb_assert(algo, "Unknown algo description"); - std::vector&& sub_items = - algo->get_subopr_list(to_layout_array(m_fastrun_layouts), m_dnn_opr); - - FOREACH_OPR_TYPE_DISPATCH(sub_items, { - auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); - megdnn_opr->param() = - Algorithm::deserialize_read_pod(_item.param); - typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( - to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, - m_base_mgb_opr, m_cn, m_execution_policy, m_allow_weight_preprocess); - policy.sub_policy.push_back({}); - sub_helper.construct_execution_policy( - selected_strategy, policy.sub_policy.back(), retrive_from_cache, - allow_log); - if (!policy.sub_policy.back().algo.valid()) { - // means sub_helper.construct_execution_policy fails. clean up - // policy.algo and return - policy = {}; - return; - } - }); - MIDOUT_E -} - -template -size_t AlgoChooser::AlgoChooserHelper::get_workspace_size_bytes( - const ImplExecutionPolicy& policy, const FixedTensorLayouts& layouts) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes"))) - m_dnn_opr->execution_policy() = policy; - size_t result; - const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; - if (layouts.at(0).ndim) { - layouts_ptr = &layouts; - } - if_constexpr()>( - [&](auto _) { - auto&& opr = _(m_dnn_opr); - auto prep = this->construct_fake_preprocess_filter(*layouts_ptr); - PreprocessFilter* prep_ptr = prep.valid() ? &prep.val() : nullptr; - result = std::max( - APPLY(opr->get_preprocess_workspace_in_bytes(args...), - *layouts_ptr), - APPLY(opr->get_workspace_in_bytes(args..., prep_ptr), - *layouts_ptr)); - }, - /* else */ - [&](auto _) { - result = APPLY( - _(m_dnn_opr)->get_workspace_in_bytes(args...), *layouts_ptr); - }); - return result; - MIDOUT_E -} - -template -std::vector::ImplAlgo> AlgoChooser< - Opr>::AlgoChooserHelper::get_all_candidates() const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates"))) - auto heu = choose_by_heuristic(m_execution_policy.strategy); - auto&& ret = - APPLY(m_dnn_opr->get_all_algorithms_info_safe(args...), m_fastrun_layouts); - bool found = false; - for (size_t i = 0; i < ret.size(); ++i) { - if (ret[i].desc == heu.algo) { - found = true; - std::swap(ret[i], ret[0]); - break; - } - } - - Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(heu.algo); - mgb_assert(palgo, "Unknown algo description"); - mgb_assert( - found, - "algo %s got by heuristic not found in " - "candidate list", - palgo->name()); - return std::move(ret); - MIDOUT_E -} - -template -Maybe AlgoChooser::AlgoChooserHelper:: - profile_single_algo(const ImplExecutionPolicy& policy, double& timeout) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile_single_algo"))) - typename TimedProfiler::Param param; - // force check copy size <= dest len-1 from gcc8 for safe - param.execution_policy = - TimedProfiler::Param::ExecutionPolicyBlob::serialize(policy); - param.workspace = get_workspace_size_bytes(policy); - for (int i = 0; i < arity; ++i) { - auto&& src = m_fastrun_layouts[i]; - bool cond_normal = src.format.is_default() && - (src.dtype.category() == DTypeCategory::FLOAT || - src.dtype.category() == DTypeCategory::INT || - src.dtype.category() == DTypeCategory::QUANTIZED); - - bool cond_low_bit = src.dtype.is_low_bit() && src.format.is_lowbit_aligned() && - (src.dtype.category() == DTypeCategory::QUANTIZED || - src.dtype.category() == DTypeCategory::LOWBIT); - MGB_MARK_USED_VAR(cond_normal); - MGB_MARK_USED_VAR(cond_low_bit); - mgb_assert( - cond_normal || cond_low_bit, "unsupported layout in profiling: %s", - src.to_string().c_str()); - param.dtypes[i] = src.dtype.enumv(); - } - param.comp_node_physical = m_cn.locator(); - param.comp_node_logical = m_cn.locator_logical(); - mgb_assert(param.shapes.size() == m_fastrun_layouts.size()); - for (size_t i = 0; i < param.shapes.size(); ++i) - param.shapes[i] = m_fastrun_layouts[i]; - param.opr_param = m_dnn_opr->param(); - param.allow_weight_preprocess = m_allow_weight_preprocess; - - Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); - mgb_assert(palgo, "can not find algo when profile single algo"); - - auto rst = TimedProfiler::profile(param, timeout); - // MIOpen conv profiles all available algos when a specfic shape is - // provided for the first time, which probably adds to the result time. - // Therefore, a second profile execution is needed. - if (strncmp(palgo->name(), "MIOpen", 6) == 0) { - rst = TimedProfiler::profile(param, timeout); - } - if (!rst.valid()) - return None; - - std::string algo_desc; - serialize_write_pod(policy.algo, algo_desc); - return AlgoChooserProfileCache::ResultEntry{ - algo_desc, static_cast(palgo->attribute()), rst.val().time, - param.workspace}; - MIDOUT_E -} - -template -void AlgoChooser::AlgoChooserHelper::profile( - const ExecutionStrategy& selected_strategy) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile"))) - auto&& rst = get_profile_result_from_cache(selected_strategy); - if (rst.first.valid()) - return; - AlgoChooserProfileCache::Result prof_rst; - - auto target_attr = extract_algo_attribute(selected_strategy); - std::string layouts_str = - format_fixlayouts(m_fastrun_layouts, arity_in, arity_out); - double cur_timeout = 0; - - auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - owner_graph(), m_cn, m_execution_policy.workspace_limit); - RealTimer timer; - std::unordered_set rst_algos; - if (rst.second.valid()) { - std::transform( - rst.second.val().begin(), rst.second.val().end(), - std::inserter(rst_algos, rst_algos.end()), - [](const AlgoChooserProfileCache::ResultEntry& result) { - return result.algo; - }); - } - for (auto algo : get_all_candidates()) { - std::string desc; - serialize_write_pod(algo.desc, desc); - if (rst_algos.find(desc) != rst_algos.end()) { - continue; - } - Maybe cur_rst; - - ImplExecutionPolicy policy; - policy.algo = algo.desc; - - //! check negative attribute : skip negative attribute - auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); - if (palgo->contain_attribute_any(target_attr.second)) { - mgb_log_debug( - "skip algo %s, which matches the profile strategy required " - "'not contain attribute(%s).'", - algo.desc.name.c_str(), - Algorithm::attribute_str(target_attr.second).c_str()); - continue; - } - - //! check workspace limit - construct_execution_policy(selected_strategy, policy); - mgb_assert( - policy.algo.valid(), - "construct execution policy must success when profiling"); - if (get_workspace_size_bytes(policy) > workspace_limit) { - continue; - } - - std::string msg = ssprintf( - "profiling %s algorithm %s %s", m_base_mgb_opr->dyn_typeinfo()->name, - algo.desc.name.c_str(), layouts_str.c_str()); - timer.reset(); - MGB_TRY { cur_rst = profile_single_algo(policy, cur_timeout); } - MGB_CATCH(std::exception & exc, { - mgb_log_warn("caught exception during %s: %s", msg.c_str(), exc.what()); - continue; - }) - MGB_CATCH(..., { - mgb_log_warn("caught exception during %s", msg.c_str()); - continue; - }) - if (!cur_rst.valid()) { - mgb_log_warn( - "timeout when %s; timeout setting: %.3fsec", msg.c_str(), - cur_timeout); - continue; - } - if (!cur_timeout) { - cur_timeout = timer.get_secs() + TIMEOUT_TOLERANCE; - } else { - cur_timeout = std::min(cur_timeout, timer.get_secs() + TIMEOUT_TOLERANCE); - } - auto&& rst = cur_rst.val(); - mgb_log_debug( - "%s: workspace: %zu; time: %.3gsec", msg.c_str(), rst.workspace, - rst.time); - prof_rst.push_back(rst); - } - std::string msg = ssprintf( - "no usable %s algorithm %s without attribute(%s) or could not meet " - "workspace limite requirement(%zu)", - m_base_mgb_opr->dyn_typeinfo()->name, layouts_str.c_str(), - Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit); - mgb_assert(!prof_rst.empty(), "%s", msg.c_str()); - if (rst.second.valid()) - prof_rst.insert( - prof_rst.end(), rst.second.val().begin(), rst.second.val().end()); - - FixedTensorLayouts incache_layouts = m_incache_layouts; - typename Opr::Param origin_param = m_dnn_opr->param(); - AlgoChooserProfileCache::Key cache_key{ - incache_layouts.data(), incache_layouts.size(), &origin_param, - sizeof(origin_param)}; - - AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str()); - cache.put(cache_key, prof_rst); - MIDOUT_E -} - -template -Maybe> AlgoChooser::AlgoChooserHelper:: - construct_fake_preprocess_filter(const FixedTensorLayouts& layouts) const { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter"))) - Maybe> result = None; - const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts; - if (layouts.at(0).ndim) { - layouts_ptr = &layouts; - } - if_constexpr()>([&](auto _) { - if (!m_allow_weight_preprocess) - return; - auto opr = _(m_dnn_opr); - auto layouts = - APPLY(opr->deduce_preprocessed_filter_layout(args...), *layouts_ptr); - //! No preprocess layout means no need weight preprocess - if (layouts.empty()) { - return; - } - //! all layouts arm empty means no need weight preprocess - bool layout_valid = false; - for (auto&& layout : layouts) { - if (!layout.is_empty()) { - layout_valid = true; - } - } - if (!layout_valid) { - return; - } - - result = PreprocessFilter{}; - auto& res = result.val(); - res.algorithm_id = nullptr; - res.tensors.resize(layouts.size()); - for (size_t i = 0; i < layouts.size(); i++) { - res.tensors[i] = megdnn::TensorND(nullptr, layouts[i]); - } - }); - return result; - MIDOUT_E -} - -template -std::pair AlgoChooser::AlgoChooserHelper:: - extract_algo_attribute(const ExecutionStrategy& strategy) const { - std::pair ret = - std::make_pair(AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT); - - //! from strategy - if (strategy & ExecutionStrategy::REPRODUCIBLE) { - ret.first |= AlgoAttribute::REPRODUCIBLE; - } - if (strategy & ExecutionStrategy::OPTMIZED) { - ret.second |= AlgoAttribute::NAIVE; - } - - //! from graph option - if (owner_graph()->options().fast_run_config.shared_batch_size) { - ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE; - } - - if (owner_graph()->options().fast_run_config.binary_equal_between_batch) { - ret.first |= AlgoAttribute::REPRODUCIBLE; - ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH; - } - - return ret; -} - -#define INST(Opr) \ - template AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( \ - const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ - const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, \ - const CompNode& cn, \ - const megdnn::param::ExecutionPolicy& execution_policy, \ - bool allow_weight_preprocess); \ - template typename AlgoChooser::ImplExecutionPolicy \ - AlgoChooser::AlgoChooserHelper::choose_by_heuristic( \ - const ExecutionStrategy& select_strategy) const; \ - template typename AlgoChooser::ImplExecutionPolicy \ - AlgoChooser::AlgoChooserHelper::choose_by_profile( \ - const ExecutionStrategy& select_strategy, bool enable_update) const; \ - template std::pair< \ - typename AlgoChooser::ImplAlgoDesc, \ - Maybe> \ - AlgoChooser::AlgoChooserHelper::get_profile_result_from_cache( \ - const ExecutionStrategy& select_strategy) const; \ - template void \ - AlgoChooser::AlgoChooserHelper::construct_execution_policy( \ - const ExecutionStrategy& select_strategy, \ - typename AlgoChooser::ImplExecutionPolicy& policy, \ - bool retrive_from_cache, bool allow_log) const; \ - template size_t \ - AlgoChooser::AlgoChooserHelper::get_workspace_size_bytes( \ - const typename AlgoChooser::ImplExecutionPolicy& policy, \ - const FixedTensorLayouts& layouts) const; \ - template std::vector::ImplAlgo> \ - AlgoChooser::AlgoChooserHelper::get_all_candidates() const; \ - template Maybe \ - AlgoChooser::AlgoChooserHelper::profile_single_algo( \ - const typename AlgoChooser::ImplExecutionPolicy& policy, \ - double& timeout) const; \ - template std::pair \ - AlgoChooser::AlgoChooserHelper::extract_algo_attribute( \ - const ExecutionStrategy& strategy) const; \ - template void AlgoChooser::AlgoChooserHelper::profile( \ - const ExecutionStrategy& selected_strategy) const; - -MGB_FOREACH_FASTRUN_OPR(INST) -#undef INST - -//////////////////////////////// AlgoChoose ///////////////////////////// -template -typename AlgoChooser::ImplExecutionPolicy AlgoChooser::get_policy( - const AlgoChooserHelper& helper) { - auto opr_strategy = helper.execution_policy().strategy; - auto strategy2str = [](auto strategy) { - std::string ret; - if (strategy & ExecutionStrategy::HEURISTIC) { - ret += "HEURISTIC "; - } - if (strategy & ExecutionStrategy::PROFILE) { - ret += "PROFILE "; - } - if (strategy & ExecutionStrategy::REPRODUCIBLE) { - ret += "REPRODUCIBLE "; - } - if (strategy & ExecutionStrategy::OPTIMIZED) { - ret += "OPTIMIZED "; - } - return ret; - }; - mgb_log_debug("Use Stragegy :%s", strategy2str(opr_strategy).c_str()); - if (opr_strategy & ExecutionStrategy::HEURISTIC) { - if (opr_strategy & ExecutionStrategy::PROFILE) { - //! this strategy will choose from cache first, then choost by - //! heuristic if fail. - ImplExecutionPolicy policy = helper.choose_by_profile(opr_strategy, false); - if (!policy.algo.valid()) { - policy = helper.choose_by_heuristic(opr_strategy); - } - return policy; - } else { - return helper.choose_by_heuristic(opr_strategy); - } - } -#if MGB_ENABLE_FASTRUN - else if (opr_strategy & ExecutionStrategy::PROFILE) { - return helper.choose_by_profile(opr_strategy, true); - } -#endif - else { - mgb_throw(GraphError, "bad ExecutionPolicy strategy"); - } -} template size_t AlgoChooser::setup_algo( @@ -1139,26 +49,38 @@ size_t AlgoChooser::setup_algo( std::string param_str; Algorithm::serialize_write_pod(megdnn_opr->param(), param_str); + + auto cg = mgb_opr->owner_graph(); + rdnn::AlgoChooserDesc desc; + desc.shared_batch_size = cg->options().fast_run_config.shared_batch_size; + desc.binary_equal_between_batch = + cg->options().fast_run_config.binary_equal_between_batch; + desc.no_profiling_on_shape_change = cg->options().no_profiling_on_shape_change; + desc.get_workspace_limit = [&](CompNode cn, size_t old_limit) { + return WorkspaceLimitGetter::get_workspace_limit(cg, cn, old_limit); + }; + AlgoChooserHelper helper( - layouts, megdnn_opr, param_str, mgb_opr, mgb_opr->comp_node(), - mgb_opr->execution_policy(), allow_weight_preprocess); + layouts, megdnn_opr, param_str, mgb_opr->comp_node(), + mgb_opr->execution_policy(), allow_weight_preprocess, desc); ImplExecutionPolicy policy; if (auto algo_choose_hook = mgb_opr->algo_chooser()) { policy = algo_choose_hook(mgb_opr); - auto strategy = ExecutionStrategy::HEURISTIC | ExecutionStrategy::REPRODUCIBLE; + auto strategy = rdnn::ExecutionStrategy::HEURISTIC | + rdnn::ExecutionStrategy::REPRODUCIBLE; bool retrive_from_cache = false; helper.construct_execution_policy(strategy, policy, retrive_from_cache); } if (!policy.algo.valid()) { - policy = get_policy(helper); + policy = Base::get_policy(helper); } size_t workspace = helper.get_workspace_size_bytes(policy, layouts); std::string ret; ret.append(mgb_opr->dyn_typeinfo()->name); ret.append(": tensor layouts"); - ret += format_fixlayouts(layouts, arity_in, arity_out); + ret += Base::format_fixlayouts(layouts); Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); mgb_assert(palgo, "Unknown algo description"); ret.append("): algo=" + std::string(palgo->name())); @@ -1169,18 +91,16 @@ size_t AlgoChooser::setup_algo( megdnn_opr->execution_policy() = policy; - if (mgb_opr->execution_policy().strategy & ExecutionStrategy::HEURISTIC) { + if (mgb_opr->execution_policy().strategy & rdnn::ExecutionStrategy::HEURISTIC) { HeuristicCache::Result cache_result{policy, workspace}; HeuristicCache::instance().put(cache_key, cache_result); } return workspace; } -#define INST(Opr) \ - template AlgoChooser::ImplExecutionPolicy \ - AlgoChooser::get_policy(const AlgoChooserHelper& proxy); \ - template size_t AlgoChooser::setup_algo( \ - const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ +#define INST(Opr) \ + template size_t AlgoChooser::setup_algo( \ + const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ const MGBOpr* mgb_opr, bool allow_weight_preprocess); MGB_FOREACH_FASTRUN_OPR(INST) diff --git a/src/opr/impl/search_policy/profiler.cpp b/src/opr/impl/search_policy/profiler.cpp deleted file mode 100644 index 0b3f2d5c2..000000000 --- a/src/opr/impl/search_policy/profiler.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/** - * \file src/opr/impl/search_policy/profile.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or - * implied. - */ - -#include "megbrain/opr/search_policy/profiler.h" - -#include "../internal/invoke.h" -#include "../internal/megdnn_opr_wrapper.inl" -#include "megdnn/handle.h" -#include "megdnn/oprs/base.h" - -#if MGB_ROCM -#include "hcc_detail/hcc_defs_prologue.h" -#include "megcore_rocm.h" -#endif - -//! TODO: here has to be know some megdnn::opr when there is produced midout.h -//! fix it if there is another graceful way. -#include "megdnn/oprs.h" - -#include "midout.h" - -MIDOUT_DECL(megbrain_opr_profile) -#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) { -#define MIDOUT_E \ - } \ - MIDOUT_END(); - -namespace { -std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { - std::string ret; - //! serialize AlgorithmDesc - megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); - megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); - uint32_t param_size = policy.algo.param.size(); - uint32_t name_size = policy.algo.name.size(); - megdnn::Algorithm::serialize_write_pod(param_size, ret); - megdnn::Algorithm::serialize_write_pod(name_size, ret); - ret += policy.algo.param; - ret += policy.algo.name; - - //! serialize sub_policy - uint32_t size = policy.sub_policy.size(); - megdnn::Algorithm::serialize_write_pod(size, ret); - for (auto&& sub : policy.sub_policy) { - ret += serialize_policy(sub); - } - return ret; -} - -megdnn::ExecutionPolicy deserialize_policy( - const char* buf, uint32_t size, uint32_t& offset) { - megdnn::ExecutionPolicy ret; -#define cb(_val, _type) \ - _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ - offset += sizeof(_val) - - cb(ret.algo.handle_type, megdnn::Handle::HandleType); - cb(ret.algo.type, uint32_t); - - uint32_t param_size = 0; - uint32_t name_size = 0; - cb(param_size, uint32_t); - cb(name_size, uint32_t); - if (param_size > 0) { - ret.algo.param = std::string(buf + offset, param_size); - offset += param_size; - } - if (name_size > 0) { - ret.algo.name = std::string(buf + offset, name_size); - offset += name_size; - } - - uint32_t nr_policy = 0; - cb(nr_policy, uint32_t); -#undef cb - - for (uint32_t i = 0; i < nr_policy; i++) { - ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); - } - return ret; -} -} // namespace - -namespace mgb { -namespace opr { -#define APPLY(statement, ...) \ - mgb::apply( \ - [&](const auto&... args) { return statement; }, \ - std::tuple_cat(__VA_ARGS__)) - -////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// - -template -typename TimedProfiler::Param::ExecutionPolicyBlob TimedProfiler::Param:: - ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) { - ExecutionPolicyBlob ret; - std::string serialize_bin = serialize_policy(policy); - mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); - memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); - ret.size = serialize_bin.size(); - return ret; -} - -template -megdnn::ExecutionPolicy TimedProfiler::Param::ExecutionPolicyBlob::deserialize() - const { - uint32_t offset = 0; - auto&& ret = deserialize_policy(data, size, offset); - mgb_assert(offset == size); - return std::move(ret); -} - -#define INST(Opr) \ - template typename TimedProfiler::Param::ExecutionPolicyBlob \ - TimedProfiler::Param::ExecutionPolicyBlob::serialize( \ - const megdnn::ExecutionPolicy& policy); \ - template megdnn::ExecutionPolicy \ - TimedProfiler::Param::ExecutionPolicyBlob::deserialize() const; - -MGB_FOREACH_FASTRUN_OPR(INST) -#undef INST - -////////////////// TimedProfiler ////////////////////////////// - -template -const double TimedProfiler::timeout_setting = - TimedProfiler::init_timeout_setting(); - -template -double TimedProfiler::init_timeout_setting() { -#if MGB_ENABLE_FASTRUN - sys::TimedFuncInvoker::ins().register_func( - AlgoChooserFuncId::ID, &TimedProfiler::prof_impl, - &TimedProfiler::prof_init_device); - auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT"); - if (to_set) - return std::stod(to_set); -#endif - return 0; -} - -#define APPLY(statement, ...) \ - mgb::apply( \ - [&](const auto&... args) { return statement; }, \ - std::tuple_cat(__VA_ARGS__)) - -template -void TimedProfiler::preprocess( - const TensorLayoutArray&, const megdnn::SmallVector&, - intl::UniqPtrWithCN&, megdnn::Workspace&, std::array&, - std::array&, PreprocessFilter&) { - // Opr is neither convbias nor convolution.This function do nothing. -} - -//! convbias -template <> -void TimedProfiler::preprocess( - const TensorLayoutArray& preprocessed_layout, - const SmallVector& flt_val, - intl::UniqPtrWithCN& megdnn_opr, - megdnn::Workspace& mdn_workspace, std::array& layouts, - std::array& inp_val, - PreprocessFilter& prep_flt) { - if (!preprocessed_layout.empty()) { - auto&& pf = prep_flt; - pf.algorithm_id = nullptr; - pf.tensors.resize(flt_val.size()); - for (size_t i = 0; i < flt_val.size(); i++) { - pf.tensors[i] = flt_val[i].as_megdnn(); - } - APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), - std::forward_as_tuple( - layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()), - array_skip(layouts)); - } -} - -//! convolution -template <> -void TimedProfiler::preprocess( - const TensorLayoutArray& preprocessed_layout, - const megdnn::SmallVector& flt_val, - intl::UniqPtrWithCN& megdnn_opr, - megdnn::Workspace& mdn_workspace, std::array& layouts, - std::array& inp_val, - PreprocessFilter& prep_flt) { - if (!preprocessed_layout.empty()) { - auto&& pf = prep_flt; - pf.algorithm_id = nullptr; - pf.tensors.resize(flt_val.size()); - for (size_t i = 0; i < flt_val.size(); i++) { - pf.tensors[i] = flt_val[i].as_megdnn(); - } - APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), - std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), - array_skip<2>(layouts)); - } -} - -template -typename TimedProfiler::TResult TimedProfiler::prof_impl( - const TParam& raw_param) { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl"))) -#if MGB_ROCM - bool miopen_algo_search_enabled; - megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled); - mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled"); -#endif - auto&& param = raw_param.as_single_pod(); - CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); - auto megdnn_opr = intl::create_megdnn_opr(cn); - std::array layouts; - - auto from_enum = [&](DTypeEnum enumv) -> DType { - switch (enumv) { - -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return _dt(1.0f, static_cast(0)) - cb(dtype::Quantized8Asymm); - cb(dtype::Quantized4Asymm); -#undef cb - -#define cb(_dt) \ - case DTypeTrait<_dt>::enumv: \ - return _dt(1.0f) - - cb(dtype::QuantizedS8); - cb(dtype::QuantizedS16); - cb(dtype::QuantizedS32); - cb(dtype::QuantizedS4); - default: - return DType::from_enum(enumv); -#undef cb - } - }; - for (int i = 0; i < arity; ++i) { - layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])}; - } - - megdnn_opr->param() = param.opr_param; - megdnn_opr->execution_policy() = param.execution_policy.deserialize(); - - // Allocate preprocessed weight buffers. - TensorLayoutArray preprocessed_layout; - if_constexpr()>([&](auto _) { - if (param.allow_weight_preprocess) { - preprocessed_layout = APPLY( - _(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts); - } - }); - - { - // first allocate a whole chunk to avoid memory fragmentation (here we - // rely on memory allocator to reuse memory) - auto align = cn.get_mem_addr_alignment(); - size_t tot_size = align; - for (int i = 0; i < arity; ++i) { - tot_size += layouts[i].span().high_byte + align; - } - for (const auto& layout : preprocessed_layout) { - tot_size += layout.span().high_byte + align; - } - tot_size += param.workspace; - DeviceTensorStorage storage{cn}; - storage.ensure_size(tot_size); - } - - // allocate input and output memory - std::array inp_val; - std::array out_val; - DeviceTensorND workspace; - for (int i = 0; i < arity_in; ++i) { - inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); - } - for (int i = 0; i < arity_out; ++i) { - out_val[i] - .comp_node(cn) - .dtype(layouts[arity_in + i].dtype) - .resize(layouts[arity_in + i]); - } - megdnn::Workspace mdn_workspace; - - // allocate workspace - if (param.workspace) { - workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); - mdn_workspace.size = param.workspace; - mdn_workspace.raw_ptr = workspace.raw_ptr(); - } - - // allocate storage for preprocessed filter - SmallVector flt_val(preprocessed_layout.size()); - for (size_t i = 0; i < preprocessed_layout.size(); i++) { - flt_val[i] = { - cn, preprocessed_layout[i], preprocessed_layout[i].dtype, - preprocessed_layout[i].format}; - } - - for (int i = 0; i < arity_in; ++i) { - fill_zero_dev_tensor(inp_val[i]); - } - - PreprocessFilter prep_flt; - preprocess( - preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, - prep_flt); - - RealTimer timer; - auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER), - ev_end = cn.create_event(CompNode::Event::NEED_TIMER); - ev_start->record(); - if_constexpr()>( - [&](auto _) { - auto&& opr = _(megdnn_opr); - PreprocessFilter* pf = - preprocessed_layout.empty() ? nullptr : &prep_flt; - APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, - out_val); - }, - /* else */ - [&](auto _) { - APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, - out_val); - }); - ev_end->record(); - - megdnn::Algorithm* algo = - megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo); - mgb_assert(algo); - double next_report_time = 0.5; - while (!ev_end->finished()) { - if (timer.get_secs() >= next_report_time) { -#if MGB_ENABLE_GETENV - mgb_log_warn( - "profiling conv algo %s already took %.3f/%.3f secs" - " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", - algo->name(), timer.get_secs(), param.actual_timeout); -#else - mgb_log_warn( - "profiling conv algo %s already took %.3f/%.3f secs", algo->name(), - timer.get_secs(), param.actual_timeout); -#endif - next_report_time = timer.get_secs() + 1; - } - using namespace std::literals; -#if !__DEPLOY_ON_XP_SP2__ - std::this_thread::sleep_for(1000us); -#endif - } - // release all free blocks owned by child process, - // in order to avoid main process running out of memory - cn.try_coalesce_all_free_memory(); - - mgb_assert(ev_start->finished()); - return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)}); - MIDOUT_E -}; - -template -Maybe::Result> TimedProfiler::profile( - const Param& param, double& timeout) { - mgb_assert(timeout >= 0); - if (!timeout) { - timeout = timeout_setting; - } else if (timeout_setting) { - timeout = std::min(timeout, timeout_setting); - } - param.actual_timeout = timeout ? timeout : std::numeric_limits::infinity(); - auto res = sys::TimedFuncInvoker::ins().invoke( - AlgoChooserFuncId::ID, TParam::from_pod(const_cast(param)), - timeout); - if (res.valid()) - return res.val().template as_single_pod(); - return None; -} - -template -void TimedProfiler::prof_init_device(const TParam& raw_param) { - MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device"))) -#if MGB_ROCM - megcore::enableMIOpenAlgoSearch(true); -#endif - auto&& param = raw_param.as_single_pod(); - CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical); - // wait for cuda init, so its time does not get accounted in timeout - cn.sync(); - MIDOUT_E -} - -#define INST(Opr) \ - template const double TimedProfiler::timeout_setting; \ - template double TimedProfiler::init_timeout_setting(); \ - template typename TimedProfiler::TResult \ - TimedProfiler::prof_impl(const TParam& raw_param); \ - template Maybe::Result> \ - TimedProfiler::profile(const Param& param, double& timeout); \ - template void TimedProfiler::prof_init_device(const TParam& raw_param); - -MGB_FOREACH_FASTRUN_OPR(INST) -#undef INST -} // namespace opr -} // namespace mgb - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/search_policy/workspace_need_limit_getter.inl b/src/opr/impl/search_policy/workspace_need_limit_getter.inl index d8defec9a..1eb9f8823 100644 --- a/src/opr/impl/search_policy/workspace_need_limit_getter.inl +++ b/src/opr/impl/search_policy/workspace_need_limit_getter.inl @@ -12,7 +12,7 @@ #pragma once -#include "megbrain/opr/search_policy/profiler.h" +#include "megbrain/opr/search_policy/algo_chooser.h" #include "../internal/megdnn_opr_wrapper.inl" @@ -25,7 +25,7 @@ namespace intl { struct AutoAddWorkspaceNeedLimitGetter { \ static constexpr bool val = true; \ }; -MGB_FOREACH_FASTRUN_OPR(cb) +DNN_FOREACH_FASTRUN_OPR(cb) #undef cb diff --git a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h index 7491054cf..fedc3dc58 100644 --- a/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h +++ b/src/opr/include/megbrain/opr/internal/megdnn_opr_wrapper.h @@ -13,6 +13,7 @@ #include "megbrain/graph.h" #include "megbrain/opr/internal/mixin_base.h" +#include "megbrain/rdnn/management.h" #include "megdnn/handle.h" @@ -20,43 +21,6 @@ namespace mgb { namespace opr { namespace intl { -//! get megdnn handle from comp node -MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node); -MGE_WIN_DECLSPEC_FUC std::shared_ptr get_megdnn_handle_shared( - CompNode comp_node); - -/*! - * \brief get global megdnn operator asscoated with a computing node - * \tparam Opr megdnn operator class, must be one of: - * * AddUpdate - * * Relayout - * * Checksum - */ -template -MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node); - -template -class UniqPtrWithCN : public std::unique_ptr { - CompNode m_cn; - -public: - UniqPtrWithCN() = default; - - template - UniqPtrWithCN(UniqPtrWithCN&& o) - : std::unique_ptr(std::move(o)), m_cn(o.comp_node()) {} - - UniqPtrWithCN(std::unique_ptr ptr, CompNode cn) - : std::unique_ptr{std::move(ptr)}, m_cn{cn} {} - - CompNode comp_node() const { return m_cn; } -}; - -//! create megdnn opr from megdnn handle in a CompNode -template -UniqPtrWithCN create_megdnn_opr(CompNode comp_node) { - return {get_megdnn_handle(comp_node)->create_operator(), comp_node}; -} /*! * \brief get temporary storage for oprs diff --git a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h index 6ebdda201..bc2f91bd6 100644 --- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h +++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h @@ -19,7 +19,7 @@ #include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/dnn/pooling.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h" -#include "megbrain/opr/search_policy/profiler.h" +#include "megbrain/rdnn/algo_chooser.h" #include "megdnn/oprs/base.h" template @@ -31,18 +31,13 @@ struct MegDNNOpr2MGBOpr; using MGBOpr = mgb::opr::_Opr; \ }; -MGB_FOREACH_FASTRUN_OPR(cb) +DNN_FOREACH_FASTRUN_OPR(cb) #undef cb -namespace mgb { - -//! define logical operation of megdnn::param::ExecutionPolicy::Strategy::Enum -//! and megdnn::detail::AlgoAttribute enum -using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy; - -using AlgoAttribute = megdnn::AlgoAttribute; +#define MGB_FOREACH_FASTRUN_OPR(cb) DNN_FOREACH_FASTRUN_OPR(cb) +namespace mgb { namespace opr { /* =================== AlgoChooser =================== */ @@ -56,138 +51,14 @@ namespace opr { * \tparam Opr megdnn operator impl */ template -class AlgoChooser { - static constexpr int arity_in = OprArityTrait::arity_in; - static constexpr int arity_out = OprArityTrait::arity_out; - static constexpr int arity = OprArityTrait::arity; - - using ImplAlgo = typename Opr::AlgorithmInfo; - using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc; - using ImplExecutionPolicy = megdnn::ExecutionPolicy; +class AlgoChooser : public rdnn::AlgoChooser { + using Base = rdnn::AlgoChooser; using MGBOpr = typename MegDNNOpr2MGBOpr::MGBOpr; + using ImplExecutionPolicy = typename Base::ImplExecutionPolicy; public: - using FixedTensorLayouts = std::array; - class AlgoChooserHelper { - //! fastrun layouts - FixedTensorLayouts m_fastrun_layouts; - //! layouts used when get and set cache item - FixedTensorLayouts m_incache_layouts; - Opr* m_dnn_opr; - std::string m_param; - const cg::OperatorNodeBase* m_base_mgb_opr; - CompNode m_cn; - megdnn::param::ExecutionPolicy m_execution_policy; - bool m_allow_weight_preprocess; - - public: - AlgoChooserHelper( - const FixedTensorLayouts& layouts, Opr* megdnn_opr, - const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, - const CompNode& cn, - const megdnn::param::ExecutionPolicy& execution_policy, - bool allow_weight_preprocess); - - Opr* megdnn_opr() const { return m_dnn_opr; } - - const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } - - const TensorLayout& inp_layout(size_t idx) const { - return m_fastrun_layouts[idx]; - } - cg::ComputingGraph* owner_graph() const { - return m_base_mgb_opr->owner_graph(); - } - const megdnn::param::ExecutionPolicy& execution_policy() const { - return m_execution_policy; - } - CompNode comp_node() const { return m_cn; } - const std::string& param() const { return m_param; } - - bool allow_weight_preprocess() const { return m_allow_weight_preprocess; } - - megdnn::Algorithm* get_algorithm_from_desc( - const megdnn::Algorithm::Info::Desc& desc) const { - return m_dnn_opr->get_algorithm_from_desc(desc); - } - - const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; } - - const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; } - - //! construct algo chain by heuristic - ImplExecutionPolicy choose_by_heuristic( - const ExecutionStrategy& selected_strategy) const; - - //! construct algo chain by profiling - ImplExecutionPolicy choose_by_profile( - const ExecutionStrategy& selected_strategy, bool enable_update) const; - - //! get all profile algorithm from cache, return invalid if not exists - std::pair> - get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const; - - /** - * \brief construct execution policy from cache or heuristic. - * - * \param selected_strategy select algo which matched this strategy - * \param[in,out] policy execution policy - * \param retrive_from_cache retrive algo from cache if set True, get - * from heuristic otherwise. - * \param allow_log no warning log print if set True, print warning info - * otherwise. - */ - void construct_execution_policy( - const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy, - bool retrive_from_cache = true, bool allow_log = true) const; - - //! get workspace size required for specific execution policy - size_t get_workspace_size_bytes( - const ImplExecutionPolicy& policy, - const FixedTensorLayouts& layouts = {}) const; - - //! get all candidate algos, and the one choose_by_heuristic() is - //! put first - std::vector get_all_candidates() const; - - /*! - * \brief profile a single algorithm - * - * This is actually a wrapper that constructs param and call - * TimedProfiler::profile for the actual profiling - * - * \param[in,out] timeout set the timeout, and return the actual - * timeout used during profiling - */ - Maybe profile_single_algo( - const ImplExecutionPolicy& policy, double& timeout) const; - - //! profile and save to cache - void profile(const ExecutionStrategy& selected_strategy) const; - - /** - * \brief extract algo attribute from execution strategy and graph - * option. - * - * \param strategy select algo which matched this strategy - * \return pair - */ - std::pair extract_algo_attribute( - const ExecutionStrategy& strategy) const; - - private: - Maybe> construct_fake_preprocess_filter( - const FixedTensorLayouts& layouts = {}) const; - }; - - template - friend class AlgoChooser; - -private: - //! entrance for getting algorithm according to execution strategy - static ImplExecutionPolicy get_policy(const AlgoChooserHelper& helper); - -public: + using AlgoChooserHelper = typename Base::AlgoChooserHelper; + using FixedTensorLayouts = typename Base::FixedTensorLayouts; /*! * \brief setup algorithm and return workspace size */ diff --git a/src/opr/include/megbrain/opr/search_policy/profiler.h b/src/opr/include/megbrain/opr/search_policy/profiler.h deleted file mode 100644 index 59c465515..000000000 --- a/src/opr/include/megbrain/opr/search_policy/profiler.h +++ /dev/null @@ -1,165 +0,0 @@ -/** - * \file src/opr/include/megbrain/opr/search_policy/profile.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or - * implied. - */ - -#pragma once - -#include "megbrain/comp_node.h" -#include "megbrain/opr/internal/megdnn_opr_wrapper.h" -#include "megbrain/system.h" -#include "megbrain/tensor.h" -#include "megbrain/utils/hash_ct.h" -#include "megbrain/utils/timer.h" - -#include "megdnn/basic_types.h" -#include "megdnn/oprs.h" - -namespace mgb { -namespace opr { - -// clang-format off -#define MGB_FOREACH_FASTRUN_OPR(cb) \ - cb(ConvolutionForward) \ - cb(ConvBiasForward) \ - cb(ConvolutionBackwardData) \ - cb(ConvolutionBackwardFilter) \ - cb(Convolution3DForward) \ - cb(Convolution3DBackwardData) \ - cb(Convolution3DBackwardFilter) \ - cb(LocalShareForward) \ - cb(LocalShareBackwardData) \ - cb(LocalShareBackwardFilter) \ - cb(DeformableConvForward) \ - cb(DeformableConvBackwardFilter) \ - cb(DeformableConvBackwardData) \ - cb(BatchConvBiasForward) \ - cb(MatrixMul) \ - cb(BatchedMatrixMul) \ - cb(PoolingForward) \ - cb(PoolingBackward) -// clang-format on - -template -constexpr bool opr_supports_preprocess() { - return std::is_same::value || - std::is_same::value; -} - -template -constexpr bool opr_contain_bias() { - return std::is_same::value; -} - -//! matmul and batchedMatrixMul -template -constexpr bool is_matmul() { - return std::is_same::value || - std::is_same::value; -} - -template -struct PreprocessFilterImpl { - using T = union {}; -}; - -template -struct PreprocessFilterImpl { - using T = typename Opr::PreprocessedFilter; -}; - -template -using PreprocessFilter = - typename PreprocessFilterImpl()>::T; - -template -struct AlgoChooserFuncId {}; - -#define DEF_FUNC_ID(func) \ - template <> \ - struct AlgoChooserFuncId { \ - __attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \ - static_cast( \ - MGB_HASH_STR("megdnn::" #func)); \ - }; - -MGB_FOREACH_FASTRUN_OPR(DEF_FUNC_ID) - -#undef DEF_FUNC_ID - -/* =================== TimedProfiler =================== */ - -/*! - * \brief profile a megdnn opr conv with given param - * - * This class only provides static methods, and the entry point is - * TimedProfiler::profile; it would run profiler in a timed environment by - * sys::TimedFuncInvoker - * - * \tparam Opr megdnn opr impl - */ -template -class TimedProfiler { - static constexpr int arity_in = OprArityTrait::arity_in; - static constexpr int arity_out = OprArityTrait::arity_out; - static constexpr int arity = OprArityTrait::arity; - - using TensorShapeArray = std::array; - -public: - struct Param { - struct ExecutionPolicyBlob { - //! enlarge the max size if needed - constexpr static size_t MAX_SIZE_IN_BYTES = 10240; - char data[MAX_SIZE_IN_BYTES]; - uint32_t size; - - static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy); - megdnn::ExecutionPolicy deserialize() const; - }; - ExecutionPolicyBlob execution_policy; - size_t workspace; - megdnn::DTypeEnum dtypes[arity]; - CompNode::Locator comp_node_physical, comp_node_logical; - TensorShapeArray shapes; - typename Opr::Param opr_param; - bool allow_weight_preprocess; - - //! filled by profile() - mutable double actual_timeout; - }; - - struct Result { - double time; - }; - - static Maybe profile(const Param& param, double& timeout); - -private: - using TParam = sys::TimedFuncInvoker::Param; - using TResult = sys::TimedFuncInvoker::Result; - - static const double timeout_setting; - - static double init_timeout_setting(); - static void preprocess( - const megdnn::TensorLayoutArray& preprocessed_layout, - const SmallVector& flt_val, - intl::UniqPtrWithCN& megdnn_opr, megdnn::Workspace& mdn_workspace, - std::array& layouts, - std::array& inp_val, - PreprocessFilter& prep_flt); - static TResult prof_impl(const TParam& raw_param); - static void prof_init_device(const TParam& raw_param); -}; -} // namespace opr -} // namespace mgb - -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} -- GitLab