From df356635b74d422cd745a01dc4fc3046a02f9950 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 14 Aug 2020 19:58:58 +0800
Subject: [PATCH] fix(mgb/fallback): remove duplicated im2col code and fix the nchw44 usable check

GitOrigin-RevId: 1aa250e9e715639364746144139d712edd610c6e
---
 .../arm_common/conv_bias/postprocess_helper.h |  38 +-
 dnn/src/fallback/conv_bias/im2col/algos.cpp   | 527 +++---------------
 dnn/src/fallback/conv_bias/im2col/factory.h   |  16 +-
 .../fallback/conv_bias/im2col/im2col_kerns.h  | 364 ++++++++++++
 .../conv_bias/im2col/strategy_default.cpp     |   3 +-
 .../im2col/strategy_default_nchw44.cpp        |   3 +-
 .../conv_bias/im2col/strategy_nopack.cpp      |   3 -
 7 files changed, 459 insertions(+), 495 deletions(-)
 create mode 100644 dnn/src/fallback/conv_bias/im2col/im2col_kerns.h

diff --git a/dnn/src/arm_common/conv_bias/postprocess_helper.h b/dnn/src/arm_common/conv_bias/postprocess_helper.h
index bcfa718c..539a105f 100644
--- a/dnn/src/arm_common/conv_bias/postprocess_helper.h
+++ b/dnn/src/arm_common/conv_bias/postprocess_helper.h
@@ -100,6 +100,7 @@ namespace {
             MIDOUT_END();                                                  \
             break;                                                         \
         default:                                                           \
+            megdnn_throw("unknown bias mode");                             \
             break;                                                         \
     }

@@ -282,24 +283,25 @@ struct PostProcess {
             reinterpret_cast<ctype*>(dst_ptr), bias_type, bias_type,       \
             dst_type, N* OC* OH* OW* pack_oc_size);

-#define FOR_BIAS(_bias_mode, OH, OW)                                   \
-    switch (_bias_mode) {                                              \
-        case megdnn::BiasMode::NO_BIAS:                                \
-            break;                                                     \
-        case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS:                 \
-            if (pack_oc_size == 1) {                                   \
-                FOR_BINARY_BROADCAST(CONCAT_OP(AddOp));                \
-            } else {                                                   \
-                megdnn_assert(pack_oc_size == 4,                       \
-                              "Only support nchw44 in ARM");           \
-                FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp));         \
-            }                                                          \
-            break;                                                     \
-        case megdnn::BiasMode::BIAS:                                   \
-            FOR_BINARY(CONCAT_OP(AddOp));                              \
-            break;                                                     \
-        default:                                                       \
-            break;                                                     \
+#define FOR_BIAS(_bias_mode, OH, OW)                                   \
+    switch (_bias_mode) {                                              \
+        case megdnn::BiasMode::NO_BIAS:                                \
+            break;                                                     \
+        case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS:                 \
+            if (pack_oc_size == 1) {                                   \
+                FOR_BINARY_BROADCAST(CONCAT_OP(AddOp));                \
+            } else {                                                   \
+                megdnn_assert(pack_oc_size == 4,                       \
+                              "Only support nchw44 in ARM");           \
+                FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp));         \
+            }                                                          \
+            break;                                                     \
+        case megdnn::BiasMode::BIAS:                                   \
+            FOR_BINARY(CONCAT_OP(AddOp));                              \
+            break;                                                     \
+        default:                                                       \
+            megdnn_throw("unknown bias mode");                         \
+            break;                                                     \
     }

 template
diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp
index 0068ef4c..7d5fafc9 100644
--- a/dnn/src/fallback/conv_bias/im2col/algos.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp
@@ -10,6 +10,7 @@
  */

 #include "src/fallback/conv_bias/im2col/algos.h"
+#include "src/fallback/conv_bias/im2col/im2col_kerns.h"
 #include "src/fallback/conv_bias/im2col/factory.h"
 #include "megdnn/opr_param_defs.h"
 #include "src/common/opr_delegate.h"
@@ -25,278 +26,6 @@ using namespace megdnn;
 using namespace fallback;
 using namespace im2col;

-/*======================== AlgoIm2col=======================*/
-/*!
- * *\brief The index of all parts workspace in im2col workspace bundel
- * *Through witch can convenient get the needed ptr
- */
-struct Im2colBundelIndex {
-    static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
-};
-
-using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode;
-/*!
- * *\brief Im2colKerns collects all the im2col kerns in it
- */
-
-template <Pack_Mode packmode>
-class Im2colKerns;
-
-template <>
-class Im2colKerns<Pack_Mode::DEFAULT> {
-public:
-    //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - size_t pack_oc_size = pack_size(param.filter_meta.format); - size_t im2col = 0, packb = 0, bias_temp = 0; - bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; - megdnn_assert(default_pack, "only support default packa"); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size * - sizeof(param.bias_type); - //! matmul_dst and im2col_dst use the same memory - WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); - packb = wb.get_size(1); - im2col = std::max(im2col_dst_size, matmul_dst_size); - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - return {nullptr, {packb, im2col, bias_temp}}; - } -}; - -template <> -class Im2colKerns { -public: - //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - - size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; - bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; - megdnn_assert(only_packA, "onlysupport onlypackA mode"); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = - oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - //! matmul_dst and im2col_dst use the same memory - WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); - packb = wb.get_size(1); - im2col = im2col_dst_size; - matmul_dst = matmul_dst_size; - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - - return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; - } -}; - -template <> -class Im2colKerns { -public: - //! 
conv kernel - static void kerns( - const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, - const ConvBiasImpl::NCBKernParam& param, - fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, - const fallback::MatrixMulImpl::AlgoBase* matmul_algo, - const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& - matmul_desc, - StrategyParam strategyparam, - fallback::ConvBiasImpl::NCBKernIndex ncb_index, - size_t ohw_tile_size, StrategyBase* im2colstrategy) { - size_t OC = param.filter_meta.ocpg; - size_t output_block_size = std::min( - ohw_tile_size, - strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); - size_t output_block_oc_size = std::min( - strategyparam.oc_tile_size, - OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); - - strategyparam.batch_id = ncb_index.ndrange_id[0]; - strategyparam.group_id = ncb_index.ndrange_id[1]; - strategyparam.oc_cur_index = - ncb_index.ndrange_id[3] * - strategyparam.oc_tile_size; - strategyparam.oc_end_index = strategyparam.oc_cur_index + - output_block_oc_size; - strategyparam.ohw_cur_index = - ncb_index.ndrange_id[2] * ohw_tile_size; - strategyparam.output_block_oc_size = output_block_oc_size; - strategyparam.output_block_size = output_block_size; - - bundle_thread.set( - static_cast( - bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + - bundle_thread.total_size_in_bytes() * ncb_index.thread_id); - - fallback::MatrixMulImpl::KernParam matmul_param; - static_cast(matmul_param) = - matmul_kernsize_param; - - //! 1.Im2col - im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, - matmul_param, matmul_algo); - - //! 2.packb and matmul compute - im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, - matmul_param, matmul_algo, ncb_index, - matmul_desc); - - //! 3.postprocess and copy dst if need - im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); - } - WorkspaceBundle get_thread_bundle( - const fallback::ConvBiasImpl::NCBKernSizeParam& param, - const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, - const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, - size_t oc_tile_size) { - size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], - FW = param.filter_meta.spatial[1]; - size_t ohw = param.osz[0] * param.osz[1]; - - size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; - bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; - megdnn_assert(no_pack, "only support no pack"); - bool is_dst_8bit = - (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - size_t im2col_dst_size = - IC * FH * FW * ohw_tile_size * sizeof(param.src_type); - size_t matmul_dst_size = - oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - im2col = im2col_dst_size; - if (is_dst_8bit) { - matmul_dst = matmul_dst_size; - } else { - matmul_dst = ohw_tile_size >= ohw ? 
0 : matmul_dst_size; - } - matmul_compute = matmul_algo->get_workspace(im2col_kern_param); - if (param.bias_mode == megdnn::BiasMode::BIAS) { - bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); - } - - return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; - } -}; - namespace { static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( const fallback::ConvBiasImpl::NCBKernSizeParam& param, @@ -451,7 +180,6 @@ static WorkspaceBundle get_bundle( MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size, size_t ohw_tile_size) { UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(OC); MEGDNN_MARK_USED_VAR(OH); MEGDNN_MARK_USED_VAR(OW); MEGDNN_MARK_USED_VAR(FH); @@ -506,8 +234,9 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace( m_matmul_algo->matmul_description(); size_t oc_tile_size = 0, ohw_tile_size = 0; choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size, - matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n, - m_ohw_tile_size, matmul_desc.packmode); + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size) .total_size_in_bytes(); } @@ -518,20 +247,13 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace( SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( const NCBKernSizeParam& param) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) { - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(SH); - MEGDNN_MARK_USED_VAR(SW); - MEGDNN_MARK_USED_VAR(IH); - MEGDNN_MARK_USED_VAR(IW); - MEGDNN_MARK_USED_VAR(FH); - MEGDNN_MARK_USED_VAR(FW); - size_t oc_tile_size = 0, ohw_tile_size = 0; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t OC = param.filter_meta.ocpg; size_t ohw = OH * OW; - size_t GROUP = param.filter_meta.group; - bool need_padding = (PH != 0 || PW != 0); + size_t oc_tile_size = 0, ohw_tile_size = 0; - fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = - m_matmul_algo->matmul_description(); + auto matmul_desc = m_matmul_algo->matmul_description(); bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK; @@ -542,12 +264,8 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( matmul_desc.innerblocksize.n, m_ohw_tile_size, matmul_desc.packmode); - WorkspaceBundle bundle = get_bundle(param,m_matmul_algo,oc_tile_size,ohw_tile_size); - size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); - size_t oc_parallel_times = div_ceil(OC, oc_tile_size); size_t packa_parallel_times = 0; size_t pack_oc_size = pack_size(param.filter_meta.format); - if (only_packA) { packa_parallel_times = div_ceil(OC, oc_tile_size); } else if (default_pack) { @@ -558,9 +276,12 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( auto matmul_param = get_matmul_kern_param( param, ohw_tile_size, default_pack ? 
OC : oc_tile_size); + WorkspaceBundle bundle = + get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); WorkspaceBundle bundle_thread = get_thread_bundle(param, m_matmul_algo, matmul_param, matmul_desc, oc_tile_size, ohw_tile_size); + StrategyParam strategyparam; strategyparam.ohw = ohw; strategyparam.is_dst_8bit = @@ -578,138 +299,39 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); SmallVector ret_kern; - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv("ConvBiasImpl::AlgoIm2col::dispatch_kerns"_hash)) { - StrategyBase* im2colstrategy = - Factory::get_im2col_strategy(param, m_matmul_algo); - auto kern_padding = [bundle, im2colstrategy, - pack_oc_size = pack_oc_size]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - im2colstrategy->copy_padding_kern(bundle, param, ncb_index, - pack_oc_size); - }; - - auto kern_packA = [bundle, matmul_algo = m_matmul_algo, - matmul_param, im2colstrategy, - strategyparam = strategyparam, - matmul_desc = matmul_desc]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - - im2colstrategy->packA_kern(bundle, param, matmul_param, - matmul_algo, ncb_index, matmul_desc, - strategyparam); - }; - if (default_pack) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) { - auto kern_compute_default = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - ohw_tile_size = ohw_tile_size, - strategyparam = strategyparam, - matmul_desc = matmul_desc, im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, - matmul_param, matmul_algo, matmul_desc, - strategyparam, ncb_index, ohw_tile_size, - im2colstrategy); - }; - if (!enable_filter_preprocess) { - ret_kern.push_back( - {kern_packA, {GROUP, packa_parallel_times}}); - } - if (need_padding) { - ret_kern.push_back( - {kern_padding, - {param.n, GROUP, IC / pack_oc_size}}); - } - ret_kern.push_back({kern_compute_default, - {N, GROUP, ohw_parallel_times, - oc_parallel_times}}); - return ret_kern; - } - MIDOUT_END(); - return {}; - } else if (only_packA) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) { - auto kern_compute_onlypackA = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - strategyparam = strategyparam, - ohw_tile_size = ohw_tile_size, - matmul_desc = matmul_desc, im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, - matmul_param, matmul_algo, matmul_desc, - strategyparam, ncb_index, ohw_tile_size, - im2colstrategy); - }; - if (!enable_filter_preprocess) { - ret_kern.push_back( - {kern_packA, {GROUP, packa_parallel_times}}); - } - if (need_padding) { - ret_kern.push_back( - {kern_padding, {param.n, GROUP, IC}}); - } - ret_kern.push_back({kern_compute_onlypackA, - {N, GROUP, ohw_parallel_times, - oc_parallel_times}}); - return ret_kern; - } - MIDOUT_END(); - return {}; - } else if (no_pack) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) { - auto kern_compute_nopack = - [bundle, bundle_thread, matmul_param, - matmul_algo = 
m_matmul_algo,
-                         strategyparam = strategyparam,
-                         ohw_tile_size = ohw_tile_size,
-                         matmul_desc = matmul_desc, im2colstrategy](
-                                const NCBKernParam& param,
-                                const NCBKernIndex& ncb_index) mutable {
-                            bundle.set(param.workspace_ptr);
-                            Im2colKerns<Pack_Mode::NO_PACK>::kerns(
-                                    bundle, bundle_thread, param,
-                                    matmul_param, matmul_algo, matmul_desc,
-                                    strategyparam, ncb_index, ohw_tile_size,
-                                    im2colstrategy);
-                        };
-                if (need_padding) {
-                    ret_kern.push_back(
-                            {kern_padding, {param.n, GROUP, IC}});
-                }
-                ret_kern.push_back({kern_compute_nopack,
-                                    {N, GROUP, ohw_parallel_times,
-                                     oc_parallel_times}});
-                return ret_kern;
-            }
-            MIDOUT_END();
-            return {};
-        }
-        MIDOUT_END();
+        StrategyBase* im2colstrategy =
+                Factory::get_im2col_strategy(param, m_matmul_algo);
+        if (default_pack) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_default_pack"_hash)) {
+                return Im2colKerns<Pack_Mode::DEFAULT>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        } else if (only_packA) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_onlypacka"_hash)) {
+                return Im2colKerns<Pack_Mode::ONLY_PACKA>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        } else if (no_pack) {
+            MIDOUT_BEGIN(megdnn_fallback_im2col,
+                         midout_iv("dispatch_kerns_no_pack"_hash)) {
+                return Im2colKerns<Pack_Mode::NO_PACK>().get_kerns(
+                        param, bundle, bundle_thread, strategyparam,
+                        matmul_param, im2colstrategy, m_matmul_algo,
+                        ohw_tile_size, oc_tile_size, pack_oc_size);
+            }
+            MIDOUT_END();
+            return {};
+        }
         return {};
     }
     MIDOUT_END();
@@ -721,23 +343,38 @@ bool ConvBiasImpl::AlgoIm2col::usable(
         AlgoSelectionStrategy /*algo_selection_strategy*/) const {
     MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 2) {
         auto format = param.filter_meta.format;
+        auto matmul_desc = m_matmul_algo->matmul_description();
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7
         if (format != param::ConvBias::Format::NCHW &&
-            format != param::ConvBias::Format::NCHW44_DOT &&
-            format != param::ConvBias::Format::NCHW44) {
+            format != param::ConvBias::Format::NCHW44 &&
+            format != param::ConvBias::Format::NCHW44_DOT) {
             return false;
         }
-
-        if(param.src_type.enumv() != param.filter_type.enumv()) {
+        if (format == param::ConvBias::Format::NCHW44 ||
+            format == param::ConvBias::Format::NCHW44_DOT) {
+            //! current NCHW44 im2col only support DEFAULT mode matmul
+            if (matmul_desc.packmode != Pack_Mode::DEFAULT) {
+                return false;
+                //! nchw44 hybrid mode and channel wise are not supported
+            } else if (param.filter_meta.icpg < 4_z ||
+                       param.filter_meta.icpg == 1 ||
+                       param.filter_meta.ocpg == 1) {
+                return false;
+            }
+        }
+#else
+        if (format != param::ConvBias::Format::NCHW) {
             return false;
         }
-
-        if (param.src_type.enumv() != DTypeEnum::Int8 &&
-            param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
-            param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
+#endif
+        if (param.src_type.enumv() != param.filter_type.enumv() ||
+            (param.src_type.enumv() != DTypeEnum::Int8 &&
+             param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
+             param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
 #if !MEGDNN_DISABLE_FLOAT16
-            param.src_type.enumv() != DTypeEnum::Float16 &&
+             param.src_type.enumv() != DTypeEnum::Float16 &&
 #endif
-            param.src_type.enumv() != DTypeEnum::Float32) {
+             param.src_type.enumv() != DTypeEnum::Float32)) {
             return false;
         }
         //! 
make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode is @@ -750,28 +387,6 @@ bool ConvBiasImpl::AlgoIm2col::usable( return false; } } - fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = - m_matmul_algo->matmul_description(); - //! only matmul's packmode is packa or default support weight preprocess - if (is_enable_filter_preprocess(param) && - (matmul_desc.packmode == - fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { - return false; - } - - if (format == param::ConvBias::Format::NCHW44 || - format == param::ConvBias::Format::NCHW44_DOT) { - //! current NCHW44 im2col only support DEFAULT mode matmul - if (matmul_desc.packmode != Pack_Mode::DEFAULT) { - return false; - //! nchw44 hybird mode and channel wise is not support - } else if (param.filter_meta.icpg < 4_z || - param.filter_meta.icpg == 1 || - param.filter_meta.ocpg == 1) { - return false; - } - } - size_t oc_tile_size = 0, ohw_tile_size = 0; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, matmul_desc.innerblocksize.m, @@ -798,10 +413,8 @@ bool ConvBiasImpl::AlgoIm2col::usable( SmallVector ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout( const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv( - "ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) { + MIDOUT_BEGIN(megdnn_fallback_im2col, + midout_iv("deduce_preprocessed_filter_layout"_hash)) { fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = m_matmul_algo->matmul_description(); @@ -863,8 +476,6 @@ ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns( packa_parallel_times = div_ceil(OC, matmul_desc.innerblocksize.m); } else { - //! if nopack return null so that OprWeightPreprocessProxy can run - //! with nopack mode return {}; } auto matmul_param = get_matmul_kern_param( diff --git a/dnn/src/fallback/conv_bias/im2col/factory.h b/dnn/src/fallback/conv_bias/im2col/factory.h index b48d4f0d..a5024b50 100644 --- a/dnn/src/fallback/conv_bias/im2col/factory.h +++ b/dnn/src/fallback/conv_bias/im2col/factory.h @@ -26,10 +26,9 @@ enum class StrategyType : uint32_t { FLOAT = 0, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC FLOAT_FP16 = 1, -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 FLOAT16_FLOAT16 = 2, -#endif #endif INT8x8x32 = 3, INT8x8x16 = 4, @@ -153,12 +152,10 @@ public: cb1(dt_float32, dt_float32, StrategyType::FLOAT); #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16); -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16); #endif -#endif - cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, StrategyType::INT8x8x32); @@ -256,8 +253,7 @@ public: !param.filter_meta.should_flip) { MIDOUT_BEGIN( megdnn_fallback_im2col_factory_make_strategy, - midout_iv( - "DefaultStrategyType::8x12x1_fuse_packb_s2_nchw44"_hash)) { + midout_iv("8x12x1_fuse_packb_s2_nchw44"_hash)) { return std::make_unique< StrategyFuseXx12x1Nchw44K3x3S2< float, float, @@ -284,14 +280,13 @@ public: cb1(NCHW, DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT, "DefaultStrategyType::FLOAT_FP16"_hash); break; -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 case StrategyType::FLOAT16_FLOAT16: cb1(NCHW, DEFAULT, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, "DefaultStrategyType::FLOAT16_FLOAT16"_hash); break; -#endif #endif case StrategyType::INT8x8x32: if (format == param::ConvBias::Format::NCHW) { @@ -472,15 +467,12 @@ public: cb1(NCHW, NO_PACK, dt_float32, dt_float32, PostprocessMode::FLOAT, 
"NoPackStrategyType::FLOAT"_hash); break; -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#else #if !MEGDNN_DISABLE_FLOAT16 case StrategyType::FLOAT16_FLOAT16: cb1(NCHW, NO_PACK, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, "NoPackStrategyType::FLOAT16_FLOAT16"_hash); break; -#endif #endif case StrategyType::INT8x8x16: cb3(NCHW, NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8, diff --git a/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h b/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h new file mode 100644 index 00000000..1e080520 --- /dev/null +++ b/dnn/src/fallback/conv_bias/im2col/im2col_kerns.h @@ -0,0 +1,364 @@ +/** + * \file dnn/src/fallback/conv_bias/im2col/im2col_kerns.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/fallback/conv_bias/opr_impl.h" +#include "src/naive/convolution/helper.h" +#include "src/fallback/conv_bias/im2col/factory.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_im2col) + +namespace megdnn { +namespace fallback { +namespace im2col { + +/*! + * *\brief The index of all parts workspace in im2col workspace bundel + * *Through witch can convenient get the needed ptr + */ +struct Im2colBundelIndex { + static constexpr size_t BUNDLE_THREAD_INDEX = 2_z; +}; + +using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; +/*! + * *\brief Im2colKerns collects all the im2col kerns in it + */ +namespace{ +//! conv kernel +static void kerns( + const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread, + const ConvBiasImpl::NCBKernParam& param, + fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, + const fallback::MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + StrategyParam strategyparam, + fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, + StrategyBase* im2colstrategy) { + size_t OC = param.filter_meta.ocpg; + size_t output_block_size = std::min( + ohw_tile_size, + strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); + size_t output_block_oc_size = + std::min(strategyparam.oc_tile_size, + OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); + + bundle_thread.set( + static_cast( + bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); + + fallback::MatrixMulImpl::KernParam matmul_param; + static_cast(matmul_param) = + matmul_kernsize_param; + + strategyparam.batch_id = ncb_index.ndrange_id[0]; + strategyparam.group_id = ncb_index.ndrange_id[1]; + strategyparam.oc_cur_index = + ncb_index.ndrange_id[3] * strategyparam.oc_tile_size; + strategyparam.oc_end_index = + strategyparam.oc_cur_index + output_block_oc_size; + strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; + strategyparam.output_block_oc_size = output_block_oc_size; + strategyparam.output_block_size = output_block_size; + + //! 1.Im2col + im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, + matmul_param, matmul_algo); + + //! 2.packb and matmul compute + im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, + matmul_param, matmul_algo, ncb_index, + matmul_desc); + + //! 
3.postprocess and copy dst if need + im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); +} +} // namespace + +template +class Im2colKerns; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + + auto kern_packA = + [bundle, matmul_algo, matmul_param, im2colstrategy, + strategyparam = strategyparam, matmul_desc = matmul_desc]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->packA_kern(bundle, param, matmul_param, + matmul_algo, ncb_index, + matmul_desc, strategyparam); + }; + auto kern_compute_default = + [bundle, bundle_thread, matmul_param, matmul_algo, + ohw_tile_size, strategyparam, matmul_desc = matmul_desc, + im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t packa_parallel_times = + div_ceil(OC, matmul_desc.innerblocksize.m); + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); + } + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_default, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + size_t pack_oc_size = pack_size(param.filter_meta.format); + size_t im2col = 0, packb = 0, bias_temp = 0; + bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; + megdnn_assert(default_pack, "only support default packa"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size * + sizeof(param.bias_type); + //! 
matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = std::max(im2col_dst_size, matmul_dst_size); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + return {nullptr, {packb, im2col, bias_temp}}; + } +}; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + + auto kern_packA = + [bundle, matmul_algo, matmul_param, im2colstrategy, + strategyparam = strategyparam, matmul_desc = matmul_desc]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->packA_kern(bundle, param, matmul_param, + matmul_algo, ncb_index, + matmul_desc, strategyparam); + }; + auto kern_compute_onlypackA = + [bundle, bundle_thread, matmul_param, matmul_algo, + strategyparam, ohw_tile_size, matmul_desc, im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (!is_enable_filter_preprocess(param)) { + ret_kern.push_back({kern_packA, {GROUP, oc_parallel_times}}); + } + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_onlypackA, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + + size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; + bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; + megdnn_assert(only_packA, "onlysupport onlypackA mode"); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + //! 
matmul_dst and im2col_dst use the same memory + WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); + packb = wb.get_size(1); + im2col = im2col_dst_size; + matmul_dst = matmul_dst_size; + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + + return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; + } +}; + +template <> +class Im2colKerns { +public: + SmallVector get_kerns( + const ConvBiasImpl::NCBKernSizeParam& param, + WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread, + const StrategyParam& strategyparam, + fallback::MatrixMulImpl::KernSizeParam& matmul_param, + StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo, + size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) { + auto matmul_desc = matmul_algo->matmul_description(); + auto kern_padding = + [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); + }; + auto kern_compute_nopack = + [bundle, bundle_thread, matmul_param, matmul_algo, + strategyparam, ohw_tile_size, matmul_desc, im2colstrategy]( + const ConvBiasImpl::NCBKernParam& param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + kerns(bundle, bundle_thread, param, matmul_param, + matmul_algo, matmul_desc, strategyparam, ncb_index, + ohw_tile_size, im2colstrategy); + }; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + size_t BATCH = param.n; + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t PH = param.filter_meta.padding[0]; + size_t PW = param.filter_meta.padding[1]; + size_t GROUP = param.filter_meta.group; + size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size); + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + SmallVector ret_kern; + if (PH != 0 || PW != 0) { + ret_kern.push_back( + {kern_padding, {BATCH, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back( + {kern_compute_nopack, + {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}}); + return ret_kern; + } + WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + size_t oc_tile_size) { + size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], + FW = param.filter_meta.spatial[1]; + size_t ohw = param.osz[0] * param.osz[1]; + + size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; + bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; + megdnn_assert(no_pack, "only support no pack"); + bool is_dst_8bit = + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + size_t im2col_dst_size = + IC * FH * FW * ohw_tile_size * sizeof(param.src_type); + size_t matmul_dst_size = + oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + im2col = im2col_dst_size; + if (is_dst_8bit) { + matmul_dst = matmul_dst_size; + } else { + matmul_dst = ohw_tile_size >= ohw ? 
0 : matmul_dst_size; + } + matmul_compute = matmul_algo->get_workspace(im2col_kern_param); + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); + } + + return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; + } +}; + +} // namespace im2col +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp index 4b5fb720..911c7d55 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp @@ -192,12 +192,11 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, megdnn::PostprocessMode::FLOAT) -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #if MEGDNN_AARCH64 || MEGDNN_ARMV7 //! x86 do not have uint8 matmul so only armv7 armv8 support uint8 diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp index 213a0193..ff4eab52 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp @@ -108,13 +108,12 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, megdnn::PostprocessMode::FLOAT) -#else +#endif #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #if MEGDNN_AARCH64 || MEGDNN_ARMV7 //! x86 do not have uint8 matmul so only armv7 armv8 support uint8 INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp index c3a05d20..cb574b74 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp @@ -165,13 +165,10 @@ INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16, megdnn::PostprocessMode::ADD_BIAS) INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32, megdnn::PostprocessMode::ADD_BIAS) -#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#else #if !MEGDNN_DISABLE_FLOAT16 INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, megdnn::PostprocessMode::NO_PROCESS) #endif -#endif #undef INSTANTIAL_CLASS } // namespace megdnn -- GitLab
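Note on the pattern (not part of the patch above): the refactor replaces three near-identical kernel bodies with one shared function plus thin per-pack-mode template specializations that only differ in the kernel list and workspace layout they build, selected at runtime by the matmul's pack mode. The sketch below illustrates that dispatch pattern in isolation; it is a minimal, self-contained C++ analogue, and every name in it (PackMode, Kerns, shared_kern, dispatch) is a hypothetical stand-in, not a MegDNN API.

// Minimal analogue of the Im2colKerns consolidation; hypothetical names only.
#include <cstdio>
#include <functional>
#include <vector>

enum class PackMode { DEFAULT, ONLY_PACKA, NO_PACK };

using Kern = std::function<void()>;

// The shared kernel body: before the refactor each specialization carried
// its own copy of this sequence; after it, every pack mode calls this one.
static void shared_kern(PackMode mode) {
    std::printf("im2col -> matmul -> postprocess (mode %d)\n",
                static_cast<int>(mode));
}

template <PackMode mode>
struct Kerns;  // primary template: one specialization per pack mode

template <>
struct Kerns<PackMode::DEFAULT> {
    std::vector<Kern> get_kerns() const {
        // DEFAULT additionally schedules a packA kernel up front.
        return {[] { std::printf("packA\n"); },
                [] { shared_kern(PackMode::DEFAULT); }};
    }
};

template <>
struct Kerns<PackMode::NO_PACK> {
    std::vector<Kern> get_kerns() const {
        // NO_PACK skips packing entirely and only runs the shared body.
        return {[] { shared_kern(PackMode::NO_PACK); }};
    }
};

// Runtime dispatch mirroring dispatch_kerns(): pick the specialization that
// matches the matmul algorithm's pack mode.
static std::vector<Kern> dispatch(PackMode mode) {
    switch (mode) {
        case PackMode::DEFAULT:
            return Kerns<PackMode::DEFAULT>().get_kerns();
        case PackMode::NO_PACK:
            return Kerns<PackMode::NO_PACK>().get_kerns();
        default:
            return {};
    }
}

int main() {
    for (const Kern& k : dispatch(PackMode::DEFAULT)) k();
}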