#pragma once

#include "src/common/opr_trait.h"
#include "test/common/deduce_layout_proxy.h"
#include "test/common/exec_proxy.h"
#include "test/common/fast_run_cache.h"
#include "test/common/inspect_type.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/timer.h"
#include "test/common/workspace_wrapper.h"

#include <algorithm>
#include <limits>
#include <memory>
#include <vector>

namespace megdnn {
namespace test {

//! map a runtime Algorithm::OprType to the concrete operator class
template <Algorithm::OprType>
struct OprFromOprTypeTrait;

//! map a concrete operator class back to its Algorithm::OprType
template <typename Opr>
struct OprTypeFromOprTrait;

#define cb(_opr_type, _opr)                                                   \
    template <>                                                               \
    struct OprFromOprTypeTrait<Algorithm::OprType::_opr_type> {               \
        using Opr = megdnn::_opr;                                             \
    };                                                                        \
    template <>                                                               \
    struct OprTypeFromOprTrait<megdnn::_opr> {                                \
        constexpr static Algorithm::OprType opr_type =                        \
                Algorithm::OprType::_opr_type;                                \
    }

cb(MATRIX_MUL_FORWARD, MatrixMulForward);
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward);
cb(CONVOLUTION_FORWARD, ConvolutionForward);
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
cb(CONVOLUTION3D_FORWARD, Convolution3DForward);
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData);
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter);
cb(LOCAL_SHARE_FORWARD, LocalShareForward);
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData);
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter);
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward);
#undef cb

// clang-format off
#define FOREACH_OPR_TYPE(cb)              \
    cb(MATRIX_MUL_FORWARD)                \
    cb(BATCHED_MATRIX_MUL_FORWARD)        \
    cb(CONVOLUTION_FORWARD)               \
    cb(CONVOLUTION_BACKWARD_DATA)         \
    cb(CONVOLUTION_BACKWARD_FILTER)       \
    cb(CONVOLUTION3D_FORWARD)             \
    cb(CONVOLUTION3D_BACKWARD_DATA)       \
    cb(CONVOLUTION3D_BACKWARD_FILTER)     \
    cb(LOCAL_SHARE_FORWARD)               \
    cb(LOCAL_SHARE_BACKWARD_DATA)         \
    cb(LOCAL_SHARE_BACKWARD_FILTER)       \
    cb(DEFORMABLE_CONV_FORWARD)           \
    cb(DEFORMABLE_CONV_BACKWARD_DATA)     \
    cb(DEFORMABLE_CONV_BACKWARD_FILTER)   \
    cb(BATCH_CONV_FORWARD)                \
    cb(CONVBIAS_FORWARD)

#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt)    \
    cb(MATRIX_MUL_FORWARD, stmt)                \
    cb(BATCHED_MATRIX_MUL_FORWARD, stmt)        \
    cb(CONVOLUTION_FORWARD, stmt)               \
    cb(CONVOLUTION_BACKWARD_DATA, stmt)         \
    cb(CONVOLUTION_BACKWARD_FILTER, stmt)       \
    cb(CONVOLUTION3D_FORWARD, stmt)             \
    cb(CONVOLUTION3D_BACKWARD_DATA, stmt)       \
    cb(CONVOLUTION3D_BACKWARD_FILTER, stmt)     \
    cb(LOCAL_SHARE_FORWARD, stmt)               \
    cb(LOCAL_SHARE_BACKWARD_DATA, stmt)         \
    cb(LOCAL_SHARE_BACKWARD_FILTER, stmt)       \
    cb(DEFORMABLE_CONV_FORWARD, stmt)           \
    cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt)     \
    cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt)   \
    cb(BATCH_CONV_FORWARD, stmt)                \
    cb(CONVBIAS_FORWARD, stmt)
// clang-format on

#define _OPR_TYPE_CASE(_opr_type, _stmt)                                      \
    case Algorithm::OprType::_opr_type: {                                     \
        using _Opr = typename OprFromOprTypeTrait<                            \
                Algorithm::OprType::_opr_type>::Opr;                          \
        _stmt;                                                                \
        break;                                                                \
    }

#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt)                       \
    for (size_t _item_idx = 0; _item_idx < _search_items.size();              \
         _item_idx++) {                                                       \
        auto&& _item = _search_items[_item_idx];                              \
        switch (_item.opr_type) {                                             \
            FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt)                 \
            default:                                                          \
                megdnn_throw("unknown opr_type");                             \
        }                                                                     \
    }

template <
        typename Opr, size_t arity = OprTrait<Opr>::arity,
        bool has_workspace = OprTrait<Opr>::has_workspace,
        bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout>
struct OprProxyDefaultImpl
        : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>,
          public ExecProxy<Opr, arity, has_workspace> {
    virtual void init(Opr*, const TensorNDArray&) {}
    virtual ~OprProxyDefaultImpl() {}
};
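//! Illustrative sketch (not part of the original header): the trait pair above
//! gives a two-way mapping between the runtime enum Algorithm::OprType and the
//! concrete operator class, and FOREACH_OPR_TYPE_DISPATCH turns a runtime
//! opr_type into a compile-time type `_Opr` inside the dispatched statement.
//! `items` below is a hypothetical std::vector<Algorithm::SearchItem>.
//!
//! \code
//! // compile-time round trip: enum -> class -> enum
//! using ConvOpr =
//!         OprFromOprTypeTrait<Algorithm::OprType::CONVOLUTION_FORWARD>::Opr;
//! static_assert(
//!         OprTypeFromOprTrait<ConvOpr>::opr_type ==
//!                 Algorithm::OprType::CONVOLUTION_FORWARD,
//!         "trait round trip");
//!
//! // runtime -> compile-time dispatch
//! FOREACH_OPR_TYPE_DISPATCH(items, {
//!     printf("arity: %zu\n", static_cast<size_t>(OprTrait<_Opr>::arity));
//! });
//! \endcode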
template <typename Opr>
struct OprProxy : public OprProxyDefaultImpl<Opr> {};

template <typename Opr>
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {};

template <typename Opr>
struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {};

template <typename Opr>
struct OprProxyVectorToSingle {};

template <>
struct OprProxy<ElemwiseForward> {
    static void deduce_layout(ElemwiseForward* opr, TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void init(ElemwiseForward*, const TensorNDArray&) {}

    static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inp = tensors;
        inp.pop_back();
        opr->exec(inp, tensors.back());
    }
};

template <>
struct OprProxy<ElemwiseMultiType> {
    static void deduce_layout(ElemwiseMultiType* opr, TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void init(ElemwiseMultiType*, const TensorNDArray&) {}

    static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inp = tensors;
        inp.pop_back();
        opr->exec(inp, tensors.back());
    }
};

template <>
struct OprProxy<ConcatForward> {
    WorkspaceWrapper W;

    static void deduce_layout(ConcatForward* opr, TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void init(ConcatForward*, const TensorNDArray&) {}

    void exec(ConcatForward* opr, const TensorNDArray& tensors) {
        if (!W.valid()) {
            W = WorkspaceWrapper(opr->handle(), 0);
        }
        megdnn_assert(tensors.size() >= 2);
        TensorLayoutArray layouts(tensors.size());
        std::transform(
                tensors.begin(), tensors.end(), layouts.begin(),
                [](const TensorND& tensor) { return tensor.layout; });
        auto inp_layouts = layouts;
        inp_layouts.pop_back();
        W.update(opr->get_workspace_in_bytes(inp_layouts, layouts.back()));
        auto inp_tensors = tensors;
        inp_tensors.pop_back();
        opr->exec(inp_tensors, tensors.back(), W.workspace());
    }
};

template <>
struct OprProxy<CheckNonFinite> {
    static void deduce_layout(CheckNonFinite* opr, TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void init(CheckNonFinite*, const TensorNDArray&) {}

    static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inps = tensors;
        inps.pop_back();
        TensorLayoutArray inp_layouts(inps.size());
        std::transform(
                inps.begin(), inps.end(), inp_layouts.begin(),
                [](const TensorND& tensor) { return tensor.layout; });
        WorkspaceWrapper W(
                opr->handle(),
                opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout));
        opr->exec(inps, tensors.back(), W.workspace());
    }
};

template <>
struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> {
    WorkspaceWrapper W;

    void init(SplitForward*, const TensorNDArray&) {}

    void exec(SplitForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        if (!W.valid()) {
            W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts(tensors.size());
        std::transform(
                tensors.begin(), tensors.end(), layouts.begin(),
                [](const TensorND& tensor) { return tensor.layout; });
        auto out_layouts = layouts;
        out_layouts.erase(out_layouts.begin());
        W.update(opr->get_workspace_in_bytes(layouts.front(), out_layouts));
        auto out_tensors = tensors;
        out_tensors.erase(out_tensors.begin());
        opr->exec(tensors.front(), out_tensors, W.workspace());
    }
};
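//! Illustrative sketch (not part of the original header): the specializations
//! above adapt variadic-input operators to the test framework's convention of
//! a single TensorNDArray with the output stored last. Assuming `handle` is a
//! valid Handle* and `a`, `b`, `dst` are prepared TensorNDs:
//!
//! \code
//! auto opr = handle->create_operator<ElemwiseForward>();
//! opr->param().mode = ElemwiseForward::Mode::ADD;
//! TensorNDArray tensors{a, b, dst};  // two inputs, output last
//! OprProxy<ElemwiseForward>::exec(opr.get(), tensors);  // dst = a + b
//! \endcode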
//! OprProxy base with fast-run style profiling support
template <class Opr>
struct OprProxyProfilingBase
        : public DeduceLayoutProxy<
                  Opr, OprTrait<Opr>::arity, OprTrait<Opr>::can_deduce_layout> {
    static constexpr int arity = OprTrait<Opr>::arity;
    size_t warmup_times = 10, exec_times = 100;

    //! whether to enable profiling
    bool m_profiling;
    WorkspaceWrapper W;

    //! target algo set up by the profiler; it can also be directly specified
    //! by the caller
    ExecutionPolicy target_execution_policy;

    OprProxyProfilingBase(bool profile = false) { m_profiling = profile; }

    //! allocate tensors for weight preprocess
    static std::shared_ptr<TensorNDArray> alloc_tensors(
            Handle* handle, const TensorLayoutArray& layouts) {
        auto deleter = [handle](TensorNDArray* ptr) {
            for (auto&& i : *ptr) {
                auto pdata = static_cast<dt_byte*>(i.raw_ptr()) +
                             i.layout.span().low_byte;
                megdnn_free(handle, pdata);
            }
            delete ptr;
        };
        std::shared_ptr<TensorNDArray> ret{new TensorNDArray, deleter};
        for (size_t i = 0; i < layouts.size(); ++i) {
            auto span = layouts[i].span();
            ret->emplace_back(
                    static_cast<dt_byte*>(
                            megdnn_malloc(handle, span.dist_byte())) -
                            span.low_byte,
                    layouts[i]);
        }
        return ret;
    }

    /**
     * \brief flatten the search space in postorder traversal
     *
     * The sub-opr search constructs a search tree:
     *
     *             A
     *          /     \
     *       B1B2      C
     *      /     \
     *   D1D2D3    E
     *
     * We traverse the search tree in postorder:
     * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A
     */
    static std::vector<Algorithm::SearchItem> flatten_search_space(
            const TensorLayoutArray layouts, const std::string& param,
            Handle* handle) {
        megdnn_assert(layouts.size() == arity);
        auto opr = handle->create_operator<Opr>();
        opr->param() =
                Algorithm::deserialize_read_pod<typename Opr::Param>(param);
        std::vector<Algorithm::SearchItem> ret;
        for (auto algo_info :
             AlgoProxy<Opr, arity>::get_all_algorithms_info_safe(
                     opr.get(), layouts)) {
            Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc);
            std::vector<Algorithm::SearchItem>&& sub_items =
                    algo->get_subopr_list(layouts, opr.get());
            FOREACH_OPR_TYPE_DISPATCH(sub_items, {
                auto space = OprProxyProfilingBase<_Opr>::flatten_search_space(
                        _item.layouts, _item.param, handle);
                ret.insert(ret.end(), space.begin(), space.end());
            });
        }
        ret.push_back({OprTypeFromOprTrait<Opr>::opr_type, param, layouts});
        return ret;
    }

    //! recursively fill \p policy (and its sub_policy) with the best algos
    //! recorded in \p cache
    static void construct_execution_policy(
            const TensorLayoutArray& layouts, const std::string& param,
            Handle* handle, FastRunCache& cache, ExecutionPolicy& policy) {
        megdnn_assert(layouts.size() == arity);
        auto opr = handle->create_operator<Opr>();
        opr->param() =
                Algorithm::deserialize_read_pod<typename Opr::Param>(param);
        if (!policy.algo.valid()) {
            policy.algo = cache.get(Algorithm::SearchItem{
                    OprTypeFromOprTrait<Opr>::opr_type, param, layouts});
            megdnn_assert(
                    policy.algo.valid(),
                    "No cache found, maybe some error occurred in "
                    "flatten_search_space or get_subopr_list");
        }
        policy.sub_policy.clear();
        Algorithm* algo = opr->get_algorithm_from_desc(policy.algo);
        std::vector<Algorithm::SearchItem>&& sub_items =
                algo->get_subopr_list(layouts, opr.get());
        FOREACH_OPR_TYPE_DISPATCH(sub_items, {
            policy.sub_policy.push_back({});
            OprProxyProfilingBase<_Opr>::construct_execution_policy(
                    _item.layouts, _item.param, handle, cache,
                    policy.sub_policy.back());
        });
    }
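    //! Illustrative note: for an algorithm that itself launches sub-operators
    //! (e.g. a ConvBias algo implemented via MatrixMul), the recursion above
    //! produces a policy tree mirroring get_subopr_list():
    //!
    //!   policy.algo               <- best ConvBias algo, looked up in cache
    //!   policy.sub_policy[0].algo <- best MatrixMul algo, looked up in cache
    //!
    //! flatten_search_space() must have been dispatched (and search() run)
    //! first, so that every (opr_type, param, layouts) key this recursion
    //! queries is already in the cache; otherwise the megdnn_assert above
    //! fires.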
    /**
     * \brief search for the best algorithm and record it in \p cache
     */
    static void search(
            const TensorLayoutArray& layouts, const std::string& param,
            WorkspaceWrapper& workspace_wrapper, Handle* handle,
            size_t warmup_times, size_t exec_times, FastRunCache& cache) {
        megdnn_assert(layouts.size() == arity);
        auto opr = handle->create_operator<Opr>();
        opr->param() =
                Algorithm::deserialize_read_pod<typename Opr::Param>(param);
        SmallVector<size_t> sizes_in_bytes;
        for (const auto& layout : layouts) {
            sizes_in_bytes.push_back(layout.span().dist_byte());
        }

        float min_time = std::numeric_limits<float>::max();
        Algorithm::Info::Desc best_algo;

        std::string log_info = "Profiling start: ";
        for (auto&& layout : layouts) {
            log_info += layout.to_string() + " ";
        }
        megdnn_log("%s", log_info.c_str());
        best_algo = cache.get(Algorithm::SearchItem{
                OprTypeFromOprTrait<Opr>::opr_type, param, layouts});
        if (best_algo.valid()) {
            auto&& algo = opr->get_algorithm_from_desc(best_algo);
            MEGDNN_MARK_USED_VAR(algo);
            megdnn_log("Find best algo %s in cache", algo->name());
            return;
        }
        for (auto algo : AlgoProxy<Opr, arity>::get_all_algorithms_info_safe(
                     opr.get(), layouts)) {
            //! construct execution_policy
            opr->execution_policy().algo = algo.desc;
            construct_execution_policy(
                    layouts, param, handle, cache, opr->execution_policy());

            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr.get(), layouts);
            sizes_in_bytes.push_back(workspace_size);

            WorkspaceBundle wb(nullptr, sizes_in_bytes);
            workspace_wrapper.update(wb.total_size_in_bytes());
            wb.set(workspace_wrapper.workspace().raw_ptr);
            TensorNDArray tensors;
            for (size_t i = 0; i < arity; i++) {
                tensors.push_back({wb.get(i), layouts[i]});
            }

            for (size_t times = 0; times < warmup_times; ++times) {
                AlgoProxy<Opr, arity>::exec(
                        opr.get(), tensors, wb.get_workspace(arity));
            }
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            Timer timer;
            timer.start();
            for (size_t times = 0; times < exec_times; ++times) {
                AlgoProxy<Opr, arity>::exec(
                        opr.get(), tensors, wb.get_workspace(arity));
            }
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            timer.stop();
            megdnn_log(
                    "%.3fms %s", timer.get_time_in_us() / 1e3,
                    algo.desc.name.c_str());
            if (min_time > timer.get_time_in_us()) {
                min_time = timer.get_time_in_us();
                best_algo = algo.desc;
            }
            sizes_in_bytes.pop_back();
        }
        auto&& algo = opr->get_algorithm_from_desc(best_algo);
        MEGDNN_MARK_USED_VAR(algo);
        megdnn_log("Profiling end, got best algo: %s", algo->name());
        cache.put(
                Algorithm::SearchItem{
                        OprTypeFromOprTrait<Opr>::opr_type, param, layouts},
                best_algo);
    }

    virtual void init(Opr*, const TensorNDArray&) {}

    virtual void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == arity);
        if (!W.valid()) {
            W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        if (m_profiling && !target_execution_policy.algo.valid()) {
            FastRunCache cache;
            std::string param_str;
            Algorithm::serialize_write_pod(opr->param(), param_str);
            auto&& search_items =
                    flatten_search_space(layouts, param_str, opr->handle());
            FOREACH_OPR_TYPE_DISPATCH(search_items, {
                OprProxyProfilingBase<_Opr>::search(
                        _item.layouts, _item.param, W, opr->handle(),
                        warmup_times, exec_times, cache);
            });
            construct_execution_policy(
                    layouts, param_str, opr->handle(), cache,
                    opr->execution_policy());
            target_execution_policy = opr->execution_policy();
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        if (!target_execution_policy.algo.valid()) {
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
    }

    virtual ~OprProxyProfilingBase() {}
};
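//! Illustrative sketch (not part of the original header): driving the
//! profiling proxy directly. With profile=true, the first exec() flattens the
//! search space, profiles every available algorithm (sub-oprs first), fixes
//! the operator's execution_policy, and then runs; later calls reuse the
//! chosen policy. Assumes a valid `handle` and `tensors` holding src, filter
//! and dst.
//!
//! \code
//! auto opr = handle->create_operator<ConvolutionForward>();
//! OprProxyProfilingBase<ConvolutionForward> proxy{true /* profile */};
//! proxy.warmup_times = 5;    // defaults: 10 warmups, 100 timed runs
//! proxy.exec_times = 50;
//! proxy.exec(opr.get(), tensors);  // first call: profile + exec
//! proxy.exec(opr.get(), tensors);  // subsequent calls: cached policy
//! \endcode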
#define DEF_PROF(c)                                                           \
    template <>                                                               \
    struct OprProxy<c> : public OprProxyProfilingBase<c> {                    \
        using OprProxyProfilingBase<c>::OprProxyProfilingBase;                \
    }

DEF_PROF(MatrixMulForward);
DEF_PROF(ConvolutionForward);
DEF_PROF(ConvolutionBackwardData);
DEF_PROF(ConvolutionBackwardFilter);
DEF_PROF(LocalShareForward);
DEF_PROF(LocalShareBackwardData);
DEF_PROF(LocalShareBackwardFilter);
DEF_PROF(DeformableConvForward);
DEF_PROF(DeformableConvBackwardFilter);
DEF_PROF(BatchConvBiasForward);
DEF_PROF(ConvBiasForward);
DEF_PROF(DeformableConvBackwardData);
#undef DEF_PROF
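//! Illustrative sketch (not part of the original header): the proxies below
//! follow the same profiling protocol, but each candidate algorithm is timed
//! with its filter already transformed by exec_preprocess(), so measurements
//! reflect the steady-state cost after one-off weight packing. Assuming the
//! same `handle` and conv `tensors` as above:
//!
//! \code
//! auto opr = handle->create_operator<ConvolutionForward>();
//! OprWeightPreprocessProxy<ConvolutionForward> proxy;
//! proxy.m_profiling = true;
//! proxy.exec(opr.get(), tensors);  // profiles with preprocessed filters
//! \endcode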
template <class Opr>
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
    using Base = OprProxyProfilingBase<Opr>;
    static constexpr int arity = OprTrait<Opr>::arity;

    void exec(Opr* opr, const TensorNDArray& tensors) override {
        megdnn_assert(tensors.size() == arity);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        if (Base::m_profiling && !Base::target_execution_policy.algo.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo :
                 AlgoProxy<Opr, arity>::get_all_algorithms_info_safe(
                         opr, layouts)) {
                opr->execution_policy().algo = algo.desc;

                auto preprocess_tensors =
                        weight_preprocess(opr, tensors, algo.desc);
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                typename Opr::PreprocessedFilter preprocessed_filter{
                        nullptr, *preprocess_tensors};

                auto workspace_size =
                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                                opr, layouts, &preprocessed_filter);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(
                            opr, tensors, &preprocessed_filter,
                            Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(
                            opr, tensors, &preprocessed_filter,
                            Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.desc.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_execution_policy.algo = algo.desc;
                }
            }
            opr->execution_policy() = Base::target_execution_policy;
            auto preprocess_tensors = weight_preprocess(
                    opr, tensors, Base::target_execution_policy.algo);
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            typename Opr::PreprocessedFilter preprocessed_filter{
                    nullptr, *preprocess_tensors};
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        auto preprocess_tensors = weight_preprocess(
                opr, tensors, Base::target_execution_policy.algo);
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocess_tensors};
        if (!Base::target_execution_policy.algo.valid()) {
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        AlgoProxy<Opr, arity>::exec(
                opr, tensors, &preprocessed_filter, Base::W.workspace());
        //! preprocess_tensors is destroyed at the end of this function; sync
        //! so the worker has consumed it, preventing a use-after-free
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
    }

    //! run the weight preprocess pass and return the preprocessed filter
    //! tensors
    std::shared_ptr<TensorNDArray> weight_preprocess(
            Opr* opr, const TensorNDArray& tensors,
            const typename Opr::AlgorithmDesc&) {
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        auto weight_preprocess_layouts =
                AlgoProxy<Opr, arity>::deduce_preprocessed_filter_layout(
                        opr, layouts);
        auto preprocessed_filter_tensors_ptr =
                Base::alloc_tensors(opr->handle(), weight_preprocess_layouts);
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocessed_filter_tensors_ptr};
        size_t preprocess_workspace_size =
                AlgoProxy<Opr, arity>::get_preprocess_workspace_in_bytes(
                        opr, layouts);
        WorkspaceWrapper preprocess_workspace(
                opr->handle(), preprocess_workspace_size);
        AlgoProxy<Opr, arity>::exec_preprocess(
                opr, tensors, layouts, &preprocessed_filter,
                preprocess_workspace.workspace());
        return preprocessed_filter_tensors_ptr;
    }
};

//! benchmark variant: the filter is preprocessed once in init(), so exec()
//! measures only the steady-state execution
template <class Opr>
struct OprWeightPreprocessProxyBenchmarkImpl
        : public OprWeightPreprocessProxyImpl<Opr> {
    using Base = OprProxyProfilingBase<Opr>;
    static constexpr int arity = OprTrait<Opr>::arity;

    void init(Opr* opr, const TensorNDArray& tensors) override {
        megdnn_assert(tensors.size() == arity);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        m_preprocessed_tensors = this->weight_preprocess(
                opr, tensors, Base::target_execution_policy.algo);
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *m_preprocessed_tensors};
        if (!Base::target_execution_policy.algo.valid()) {
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
    }

    void exec(Opr* opr, const TensorNDArray& tensors) override {
        megdnn_assert(tensors.size() == arity);
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *m_preprocessed_tensors};
        AlgoProxy<Opr, arity>::exec(
                opr, tensors, &preprocessed_filter, Base::W.workspace());
    }

public:
    std::shared_ptr<TensorNDArray> m_preprocessed_tensors;
};

#define DEF_PROF(c)                                                           \
    template <>                                                               \
    struct OprWeightPreprocessProxy<c>                                        \
            : public OprWeightPreprocessProxyImpl<c> {                        \
        using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl;  \
    };                                                                        \
    template <>                                                               \
    struct OprWeightPreprocessBenchmarkProxy<c>                               \
            : public OprWeightPreprocessProxyBenchmarkImpl<c> {               \
        using OprWeightPreprocessProxyBenchmarkImpl<                          \
                c>::OprWeightPreprocessProxyBenchmarkImpl;                    \
    };

DEF_PROF(ConvolutionForward);
DEF_PROF(ConvBias);
#undef DEF_PROF

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen