From edd7e16701636c70eccdbe0096485fe162ecf2c2 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Thu, 2 Jul 2020 10:11:07 +0800
Subject: [PATCH] feat(dnn/fallback): add im2col filter preprocess function

GitOrigin-RevId: 61c54ad258a42301711d3efdae0caef47d7b0584
---
 dnn/src/fallback/conv_bias/im2col/algos.cpp        | 551 ++++++++++++------
 dnn/src/fallback/conv_bias/im2col/algos.h          |  33 +-
 .../fallback/conv_bias/im2col/strategy_base.h      |  10 +-
 .../conv_bias/im2col/strategy_default.cpp          |  25 +-
 .../conv_bias/im2col/strategy_nopack.cpp           |   2 +-
 .../conv_bias/im2col/strategy_onlypacka.cpp        |  35 +-
 dnn/src/fallback/conv_bias/opr_impl.cpp            |   3 +-
 dnn/src/fallback/conv_bias/opr_impl.h              |   5 +
 dnn/src/fallback/convolution/opr_impl.cpp          |   4 +-
 .../arm_common/conv_bias_multi_thread.cpp          | 475 ++++++++++++++-
 dnn/test/common/conv_bias.cpp                      |  24 +
 dnn/test/common/conv_bias.h                        |   5 +-
 dnn/test/x86/conv_bias.cpp                         | 287 ++++++++-
 13 files changed, 1207 insertions(+), 252 deletions(-)

diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp
index d79bd62fc..666c099ef 100644
--- a/dnn/src/fallback/conv_bias/im2col/algos.cpp
+++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp
@@ -31,35 +31,10 @@ using namespace im2col;
  * *Through which one can conveniently get the needed ptr
  */
 struct Im2colBundelIndex {
-    static constexpr size_t BUNDLE_PADDING_INDEX = 0_z;
-    static constexpr size_t BUNDLE_PACKA_INDEX = 1_z;
     static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
 };

 using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode;
-
-//! Process one input channel copy padding
-static void copy_padding_kern(WorkspaceBundle& bundle,
-                              const ConvBiasImpl::NCBKernParam& param,
-                              const ConvBiasImpl::NCBKernIndex& ncb_index,
-                              StrategyBase* im2colstrategy, size_t pack_oc_size) {
-    im2colstrategy->copy_padding_kern(bundle, param, ncb_index, pack_oc_size);
-}
-
-//! packA_kern
-static void packA_kern(
-        WorkspaceBundle& bundle,
-        const fallback::ConvBiasImpl::NCBKernParam& param,
-        fallback::MatrixMulImpl::KernSizeParam matmulparam,
-        fallback::MatrixMulImpl::AlgoBase* matmul_algo,
-        const fallback::ConvBiasImpl::NCBKernIndex& ncb_index,
-        StrategyBase* im2colstrategy,
-        const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
-        size_t pack_oc_size) {
-    im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo,
-                               ncb_index, matmul_desc, pack_oc_size);
-}
-
 /*!
* *\brief Im2colKerns collects all the im2col kerns in it */ @@ -124,8 +99,8 @@ public: WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, - fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, - MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; @@ -205,8 +180,8 @@ public: } WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, - fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, - MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; @@ -288,8 +263,8 @@ public: } WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, - fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, - MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, + const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param, + const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; @@ -322,15 +297,16 @@ public: } }; -fallback::MatrixMulImpl::KernSizeParam -ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, - size_t ohw_tile_size, - size_t oc_tile_size) const { +namespace { +static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + size_t ohw_tile_size, size_t oc_tile_size) { auto format = param::MatrixMul::Format::DEFAULT; size_t pack_oc_size = pack_size(param.filter_meta.format); if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { format = param::MatrixMul::Format::MK4; - } else if(param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT){ + } else if (param.filter_meta.format == + param::ConvBias::Format::NCHW44_DOT) { format = param::MatrixMul::Format::MK4_DOT; } size_t M = oc_tile_size; @@ -358,10 +334,23 @@ ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, format}; } -void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( - const NCBKernSizeParam& param, size_t& oc_tile_size, - size_t& ohw_tile_size, size_t block_m, size_t block_n, - fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const { +static void choice_ohw_oc_block( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + size_t& oc_tile_size, size_t& ohw_tile_size, size_t block_m, + size_t block_n, const size_t m_ohw_tile_size, + fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) { + //! calculate m_oc_tile_size in choice_ohw_oc_block() fucntion, + //! when ohw_tile_size < this value ohw_tile_size = ohw + static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32; + //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, + //! oc_tile_size = DEFAULT_OC_TILE_SIZE + static constexpr size_t DEFAULT_OC_TILE_SIZE = 512; + //! when oc_tile_size > this value m_oc_tile_size = + //! DEFAULT_OC_MAX_TILE_SIZE + static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024; + //! 
when oc_tile_size < this value oc_tile_size = + //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation + static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128; size_t nr_threads = param.nr_threads; size_t OC = param.filter_meta.ocpg; size_t ohw = param.osz[0] * param.osz[1]; @@ -393,8 +382,74 @@ void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( } } -WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( - const NCBKernSizeParam& param) const { +static size_t packA_group_size( + const MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::MatrixMulImpl::KernSizeParam& matmul_param, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + size_t packa_parallel_times) { + if (matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { + return matmul_algo->get_bundle(matmul_param).get_size(0); + } else if (matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { + return packa_parallel_times * + matmul_algo->get_bundle(matmul_param).get_size(0); + } + megdnn_assert(matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); + //! nopack mode return 0; + return 0; +} + +static WorkspaceBundle get_thread_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + const MatrixMulImpl::AlgoBase* matmul_algo, + const fallback::MatrixMulImpl::KernSizeParam& matmul_param, + const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, + size_t oc_tile_size, size_t ohw_tile_size) { + if (matmul_desc.packmode == Pack_Mode::DEFAULT) { + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { + Im2colKerns defaultkern; + return defaultkern.get_thread_bundle(param, matmul_param, + matmul_algo, ohw_tile_size, + oc_tile_size); + } + MIDOUT_END(); + } else if (matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::get_bundle_onlypacka"_hash)) { + Im2colKerns onlypackakern; + return onlypackakern.get_thread_bundle(param, matmul_param, + matmul_algo, ohw_tile_size, + oc_tile_size); + } + MIDOUT_END(); + } else { + megdnn_assert(matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK); + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::get_thread_bundle_nopack"_hash)) { + Im2colKerns nopackkern; + return nopackkern.get_thread_bundle(param, matmul_param, + matmul_algo, ohw_tile_size, + oc_tile_size); + } + MIDOUT_END(); + } + return {nullptr, {}}; +} + +static WorkspaceBundle get_bundle( + const fallback::ConvBiasImpl::NCBKernSizeParam& param, + MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size, + size_t ohw_tile_size) { UNPACK_CONV_F32_NCB_KERN_SIZES(param); MEGDNN_MARK_USED_VAR(OC); MEGDNN_MARK_USED_VAR(OH); @@ -410,23 +465,20 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( size_t padding = 0, packa_size = 0, packa_group_size = 0; size_t nr_threads = param.nr_threads; size_t GROUP = param.filter_meta.group; - fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = - m_matmul_algo->matmul_description(); - bool need_pack = mdesc.packmode == Pack_Mode::DEFAULT; - bool only_packA = mdesc.packmode == Pack_Mode::ONLY_PACKA; - size_t oc_tile_size = 0, ohw_tile_size = 0; - choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, - mdesc.innerblocksize.m, mdesc.innerblocksize.n, - mdesc.packmode); - if (need_pack || only_packA) { - auto im2col_kern_param = 
get_matmul_kern_param( - param, ohw_tile_size, only_packA ? oc_tile_size : OC); - size_t oc_parallel_times = div_ceil(OC, oc_tile_size); - WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param); - packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0) - : wb.get_size(0); - } else { //! not support pack,not need pack + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = + matmul_algo->matmul_description(); + bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; + + //! packmode is default should use oc + //! packmode is onlypackA should use oc_tile_size + auto im2col_kern_param = get_matmul_kern_param( + param, ohw_tile_size, default_pack ? OC : oc_tile_size); + if (is_enable_filter_preprocess(param)) { packa_group_size = 0; + } else { + size_t oc_parallel_times = div_ceil(OC, oc_tile_size); + packa_group_size = packA_group_size(matmul_algo, im2col_kern_param, + matmul_desc, oc_parallel_times); } if (no_need_pading) { @@ -437,50 +489,27 @@ WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( } packa_size = GROUP * packa_group_size; //! for packA size = GROUP * a_size - WorkspaceBundle ws = {nullptr, {}}; - auto im2col_kern_param = - get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); - - if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_dft"_hash)) { - Im2colKerns defaultkern; - ws = defaultkern.get_thread_bundle(param, im2col_kern_param, - m_matmul_algo, ohw_tile_size, - oc_tile_size); - } - MIDOUT_END(); - } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_packa"_hash)) { - Im2colKerns onlypackakern; - ws = onlypackakern.get_thread_bundle(param, im2col_kern_param, - m_matmul_algo, ohw_tile_size, - oc_tile_size); - } - MIDOUT_END(); - } else { - MIDOUT_BEGIN( - megdnn_fallback_im2col, - midout_iv("ConvBiasImpl::AlgoIm2col::get_bundle_other"_hash)) { - Im2colKerns nopackkern; - ws = nopackkern.get_thread_bundle(param, im2col_kern_param, - m_matmul_algo, ohw_tile_size, - oc_tile_size); - } - MIDOUT_END(); - } + WorkspaceBundle ws = + get_thread_bundle(param, matmul_algo, im2col_kern_param, + matmul_desc, oc_tile_size, ohw_tile_size); return {nullptr, {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; } +} // namespace + size_t ConvBiasImpl::AlgoIm2col::get_workspace( const NCBKernSizeParam& p) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) { - return get_bundle(p).total_size_in_bytes(); + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = + m_matmul_algo->matmul_description(); + size_t oc_tile_size = 0, ohw_tile_size = 0; + choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size, + matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n, + m_ohw_tile_size, matmul_desc.packmode); + return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size) + .total_size_in_bytes(); } MIDOUT_END(); return 0; @@ -499,22 +528,21 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( size_t oc_tile_size = 0, ohw_tile_size = 0; size_t ohw = OH * OW; size_t GROUP = param.filter_meta.group; - WorkspaceBundle bundle = get_bundle(param); - WorkspaceBundle bundle_thread = {nullptr, {}}; bool need_padding = (PH != 0 || PW != 0); - fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = m_matmul_algo->matmul_description(); - Pack_Mode packmode = 
mdesc.packmode; - bool default_pack = packmode == Pack_Mode::DEFAULT; - bool no_pack = packmode == Pack_Mode::NO_PACK; - bool only_packA = packmode == Pack_Mode::ONLY_PACKA; - + bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; + bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK; + bool only_packA = matmul_desc.packmode == Pack_Mode::ONLY_PACKA; + bool enable_filter_preprocess = is_enable_filter_preprocess(param); choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, - mdesc.innerblocksize.m, mdesc.innerblocksize.n, - mdesc.packmode); + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); + WorkspaceBundle bundle = get_bundle(param,m_matmul_algo,oc_tile_size,ohw_tile_size); size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); size_t oc_parallel_times = div_ceil(OC, oc_tile_size); size_t packa_parallel_times = 0; @@ -523,28 +551,16 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( if (only_packA) { packa_parallel_times = div_ceil(OC, oc_tile_size); } else if (default_pack) { - packa_parallel_times = div_ceil(OC, mdesc.innerblocksize.m); + packa_parallel_times = + div_ceil(OC, matmul_desc.innerblocksize.m); } auto matmul_param = get_matmul_kern_param( - param, ohw_tile_size, only_packA ? oc_tile_size : OC); - if (mdesc.packmode == Pack_Mode::DEFAULT) { - Im2colKerns defaultkern; - bundle_thread = defaultkern.get_thread_bundle( - param, matmul_param, m_matmul_algo, ohw_tile_size, - oc_tile_size); - } else if (mdesc.packmode == Pack_Mode::ONLY_PACKA) { - Im2colKerns onlypackakern; - bundle_thread = onlypackakern.get_thread_bundle( - param, matmul_param, m_matmul_algo, ohw_tile_size, - oc_tile_size); - } else { - Im2colKerns nopackkern; - bundle_thread = nopackkern.get_thread_bundle( - param, matmul_param, m_matmul_algo, ohw_tile_size, - oc_tile_size); - } + param, ohw_tile_size, default_pack ? 
OC : oc_tile_size); + WorkspaceBundle bundle_thread = + get_thread_bundle(param, m_matmul_algo, matmul_param, + matmul_desc, oc_tile_size, ohw_tile_size); StrategyParam strategyparam; strategyparam.ohw = ohw; strategyparam.is_dst_8bit = @@ -557,6 +573,9 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; strategyparam.oc_tile_size = oc_tile_size; strategyparam.pack_oc_size = pack_oc_size; + strategyparam.enable_filter_preprocess = enable_filter_preprocess; + strategyparam.packA_group_size = packA_group_size( + m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); SmallVector ret_kern; MIDOUT_BEGIN( @@ -569,88 +588,126 @@ SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( const NCBKernParam& param, const NCBKernIndex& ncb_index) mutable { bundle.set(param.workspace_ptr); - copy_padding_kern(bundle, param, ncb_index, im2colstrategy, - pack_oc_size); + im2colstrategy->copy_padding_kern(bundle, param, ncb_index, + pack_oc_size); }; auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, im2colstrategy, - pack_oc_size = pack_oc_size, mdesc = mdesc]( + strategyparam = strategyparam, + matmul_desc = matmul_desc]( const NCBKernParam& param, const NCBKernIndex& ncb_index) mutable { bundle.set(param.workspace_ptr); - packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, - im2colstrategy, mdesc, pack_oc_size); + + im2colstrategy->packA_kern(bundle, param, matmul_param, + matmul_algo, ncb_index, matmul_desc, + strategyparam); }; if (default_pack) { - auto kern_compute_default = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - ohw_tile_size = ohw_tile_size, - strategyparam = strategyparam, matmul_desc = mdesc, - im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, matmul_param, - matmul_algo, matmul_desc, strategyparam, - ncb_index, ohw_tile_size, im2colstrategy); - }; - ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); - - if (need_padding) { - ret_kern.push_back({kern_padding, - {param.n, GROUP, IC / pack_oc_size}}); + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) { + auto kern_compute_default = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + ohw_tile_size = ohw_tile_size, + strategyparam = strategyparam, + matmul_desc = matmul_desc, im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + Im2colKerns::kerns( + bundle, bundle_thread, param, + matmul_param, matmul_algo, matmul_desc, + strategyparam, ncb_index, ohw_tile_size, + im2colstrategy); + }; + if (!enable_filter_preprocess) { + ret_kern.push_back( + {kern_packA, {GROUP, packa_parallel_times}}); + } + if (need_padding) { + ret_kern.push_back( + {kern_padding, + {param.n, GROUP, IC / pack_oc_size}}); + } + ret_kern.push_back({kern_compute_default, + {N, GROUP, ohw_parallel_times, + oc_parallel_times}}); + return ret_kern; } - ret_kern.push_back( - {kern_compute_default, - {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + MIDOUT_END(); + return {}; } else if (only_packA) { - auto kern_compute_onlypackA = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - strategyparam = strategyparam, - ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, - im2colstrategy]( - const NCBKernParam& param, - const 
NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, matmul_param, - matmul_algo, matmul_desc, strategyparam, - ncb_index, ohw_tile_size, im2colstrategy); - }; - ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); - if (need_padding) { - ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) { + auto kern_compute_onlypackA = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + strategyparam = strategyparam, + ohw_tile_size = ohw_tile_size, + matmul_desc = matmul_desc, im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + Im2colKerns::kerns( + bundle, bundle_thread, param, + matmul_param, matmul_algo, matmul_desc, + strategyparam, ncb_index, ohw_tile_size, + im2colstrategy); + }; + if (!enable_filter_preprocess) { + ret_kern.push_back( + {kern_packA, {GROUP, packa_parallel_times}}); + } + if (need_padding) { + ret_kern.push_back( + {kern_padding, {param.n, GROUP, IC}}); + } + ret_kern.push_back({kern_compute_onlypackA, + {N, GROUP, ohw_parallel_times, + oc_parallel_times}}); + return ret_kern; } - ret_kern.push_back( - {kern_compute_onlypackA, - {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + MIDOUT_END(); + return {}; } else if (no_pack) { - auto kern_compute_nopack = - [bundle, bundle_thread, matmul_param, - matmul_algo = m_matmul_algo, - strategyparam = strategyparam, - ohw_tile_size = ohw_tile_size, matmul_desc = mdesc, - im2colstrategy]( - const NCBKernParam& param, - const NCBKernIndex& ncb_index) mutable { - bundle.set(param.workspace_ptr); - Im2colKerns::kerns( - bundle, bundle_thread, param, matmul_param, - matmul_algo, matmul_desc, strategyparam, - ncb_index, ohw_tile_size, im2colstrategy); - }; - if (need_padding) { - ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) { + auto kern_compute_nopack = + [bundle, bundle_thread, matmul_param, + matmul_algo = m_matmul_algo, + strategyparam = strategyparam, + ohw_tile_size = ohw_tile_size, + matmul_desc = matmul_desc, im2colstrategy]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + Im2colKerns::kerns( + bundle, bundle_thread, param, + matmul_param, matmul_algo, matmul_desc, + strategyparam, ncb_index, ohw_tile_size, + im2colstrategy); + }; + if (need_padding) { + ret_kern.push_back( + {kern_padding, {param.n, GROUP, IC}}); + } + ret_kern.push_back({kern_compute_nopack, + {N, GROUP, ohw_parallel_times, + oc_parallel_times}}); + return ret_kern; } - ret_kern.push_back( - {kern_compute_nopack, - {N, GROUP, ohw_parallel_times, oc_parallel_times}}); + MIDOUT_END(); + return {}; } - return ret_kern; + return {}; } MIDOUT_END(); return {}; @@ -694,12 +751,19 @@ bool ConvBiasImpl::AlgoIm2col::usable( return false; } } - fallback::MatrixMulImpl::AlgoBase::MatmulDescription mdesc = + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = m_matmul_algo->matmul_description(); + //! 
only matmul's packmode is packa or default support weight preprocess + if (is_enable_filter_preprocess(param) && + (matmul_desc.packmode == + fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) { + return false; + } + if (format == param::ConvBias::Format::NCHW44 || format == param::ConvBias::Format::NCHW44_DOT) { //! current NCHW44 im2col only support DEFAULT mode matmul - if (mdesc.packmode != Pack_Mode::DEFAULT) { + if (matmul_desc.packmode != Pack_Mode::DEFAULT) { return false; //! nchw44 hybird mode and channel wise is not support } else if (param.filter_meta.icpg < 4_z || @@ -711,8 +775,9 @@ bool ConvBiasImpl::AlgoIm2col::usable( size_t oc_tile_size = 0, ohw_tile_size = 0; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, - mdesc.innerblocksize.m, mdesc.innerblocksize.n, - m_matmul_algo->packmode()); + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); fallback::MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); bool matmulusable = m_matmul_algo->usable(matmul_param); @@ -731,4 +796,104 @@ bool ConvBiasImpl::AlgoIm2col::usable( return false; } +SmallVector +ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout( + const NCBKernSizeParam& param) const { + MIDOUT_BEGIN( + megdnn_fallback_im2col, + midout_iv( + "ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) { + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = + m_matmul_algo->matmul_description(); + + //! only support default_pack and only_packa mode + if (matmul_desc.packmode == Pack_Mode::NO_PACK) { + return {}; + } + + size_t GROUP = param.filter_meta.group; + bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT; + + size_t OC = param.filter_meta.ocpg; + SmallVector preprocessed_layouts; + size_t oc_tile_size = 0, ohw_tile_size = 0; + choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); + auto matmul_param = get_matmul_kern_param( + param, ohw_tile_size, default_pack ? OC : oc_tile_size); + + size_t packa_parallel_times = div_ceil( + OC, default_pack ? 
matmul_desc.innerblocksize.m : oc_tile_size); + + size_t packa_group_size = packA_group_size( + m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); + preprocessed_layouts.push_back( + {{GROUP, packa_group_size}, dtype::Int8()}); + return preprocessed_layouts; + } + MIDOUT_END(); + return {}; +} + +SmallVector +ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns( + const NCBKernSizeParam& param) const { + MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 3) { + size_t OC = param.filter_meta.ocpg; + size_t oc_tile_size = 0, ohw_tile_size = 0; + size_t GROUP = param.filter_meta.group; + fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc = + m_matmul_algo->matmul_description(); + choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, + matmul_desc.innerblocksize.m, + matmul_desc.innerblocksize.n, m_ohw_tile_size, + matmul_desc.packmode); + WorkspaceBundle bundle = + get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size); + + Pack_Mode packmode = matmul_desc.packmode; + bool default_pack = packmode == Pack_Mode::DEFAULT; + bool only_packA = packmode == Pack_Mode::ONLY_PACKA; + size_t packa_parallel_times = 0; + + if (only_packA) { + packa_parallel_times = div_ceil(OC, oc_tile_size); + } else if (default_pack) { + packa_parallel_times = + div_ceil(OC, matmul_desc.innerblocksize.m); + } else { + //! if nopack return null so that OprWeightPreprocessProxy can run + //! with nopack mode + return {}; + } + auto matmul_param = get_matmul_kern_param( + param, ohw_tile_size, default_pack ? OC : oc_tile_size); + + StrategyParam strategyparam; + strategyparam.enable_filter_preprocess = + is_enable_filter_preprocess(param); + strategyparam.packA_group_size = packA_group_size( + m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times); + SmallVector ret_kern; + StrategyBase* im2colstrategy = + Factory::get_im2col_strategy(param, m_matmul_algo); + + auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, + im2colstrategy, strategyparam = strategyparam, + matmul_desc = matmul_desc]( + const NCBKernParam& param, + const NCBKernIndex& ncb_index) mutable { + bundle.set(param.workspace_ptr); + im2colstrategy->packA_kern(bundle, param, matmul_param, matmul_algo, + ncb_index, matmul_desc, strategyparam); + }; + ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); + return ret_kern; + } + MIDOUT_END(); + return {}; +} + // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/im2col/algos.h b/dnn/src/fallback/conv_bias/im2col/algos.h index 40f6f557f..b699f571f 100644 --- a/dnn/src/fallback/conv_bias/im2col/algos.h +++ b/dnn/src/fallback/conv_bias/im2col/algos.h @@ -22,27 +22,6 @@ namespace megdnn { namespace fallback { class ConvBiasImpl::AlgoIm2col final : public AlgoBase { - //! calculate m_oc_tile_size in choice_ohw_oc_block() fucntion, - //! when m_oc_tile_size < this value m_oc_tile_size = ohw - static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32; - //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads, - //! m_oc_tile_size = DEFAULT_OC_TILE_SIZE - static constexpr size_t DEFAULT_OC_TILE_SIZE = 512; - //! when m_oc_tile_size > this value m_oc_tile_size = - //! DEFAULT_OC_MAX_TILE_SIZE - static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024; - //! when m_oc_tile_size < this value m_oc_tile_size = - //! 
DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation - static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128; - fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param( - const NCBKernSizeParam& param, size_t ohw_tile_size, - size_t oc_tile_size) const; - WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; - void choice_ohw_oc_block( - const NCBKernSizeParam& param, size_t& oc_tile_size, - size_t& ohw_tile_size, size_t block_m, size_t block_n, - fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const; - public: AlgoIm2col(MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size) : m_matmul_algo(matmul_algo), @@ -59,10 +38,16 @@ public: bool usable(const NCBKernSizeParam& param, AlgoSelectionStrategy algo_selection_strategy) const override; size_t get_workspace(const NCBKernSizeParam& param) const override; - SmallVector dispatch_kerns( + SmallVector dispatch_kerns(const NCBKernSizeParam& param) const override; + SmallVector deduce_preprocessed_filter_layout( + const NCBKernSizeParam& param) const override; + size_t get_preprocess_workspace( + const NCBKernSizeParam& /*param*/) const override { + return 0; + } + SmallVector dispatch_preprocess_kerns( const NCBKernSizeParam& param) const override; - bool is_preferred( - const NCBKernSizeParam& param) const override { + bool is_preferred(const NCBKernSizeParam& param) const override { if (param.src_type.category() == DTypeCategory::QUANTIZED) { static CpuOprDelegationStorage<1> storage; auto conv_bias_opr = storage.get(); diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_base.h b/dnn/src/fallback/conv_bias/im2col/strategy_base.h index d52f18e41..7c7279b2f 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_base.h +++ b/dnn/src/fallback/conv_bias/im2col/strategy_base.h @@ -40,9 +40,11 @@ struct StrategyParam { size_t block_n; size_t block_k; size_t pack_oc_size; + size_t packA_group_size; bool skip_copy_dst; bool is_dst_8bit; bool is_ohw_size_bigger; + bool enable_filter_preprocess; }; class StrategyBase { @@ -62,7 +64,7 @@ public: const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desec, - size_t pack_size) = 0; + const StrategyParam& sparam) = 0; virtual void exec_im2col( const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, @@ -296,7 +298,7 @@ public: const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc, - size_t pack_size) override; + const StrategyParam& sparam) override; virtual void exec_im2col( const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, const StrategyParam& sparam, @@ -375,7 +377,7 @@ public: const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, - size_t pack_size) override; + const StrategyParam& sparam) override; void exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param, const StrategyParam& sparam, const WorkspaceBundle& bundle, @@ -431,7 +433,7 @@ public: const fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& MDsec, - size_t pack_size) override; + const StrategyParam& sparam) override; void exec_im2col( const WorkspaceBundle& bundle, const WorkspaceBundle& bundle_thread, diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp 
b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp index c0b1856a2..25d5e6ec9 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_default.cpp @@ -25,19 +25,23 @@ void Strategy(matmul_param) = matmulparam; - size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0); size_t packed_per_oc_block_size = round_up(matmul_param.K, matmul_desc.innerblocksize.k) * matmul_desc.innerblocksize.m * matmul_desc.packa_type_size; size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size; - int8_t* a_panel = static_cast(bundle.get(BUNDLE_PACKA_INDEX)) + - group_id * packA_group_size + a_panel_offset; + int8_t* tmp_ptr = + sparam.enable_filter_preprocess + ? static_cast( + param.preprocessed_filter->tensors[0].raw_ptr) + : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); + int8_t* a_panel = + tmp_ptr + group_id * sparam.packA_group_size + a_panel_offset; matmul_param.A_ptr = const_cast(param.filter(group_id)); matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1], @@ -149,15 +153,20 @@ void Strategyget_bundle(matmul_param).get_size(0); + size_t packA_group_size = sparam.packA_group_size; size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size + ncb_index.ndrange_id[3] * packA_per_oc_block_size; void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); - src_ctype* a_panel = reinterpret_cast( - reinterpret_cast(bundle.get(BUNDLE_PACKA_INDEX)) + - a_panel_offset); + int8_t* tmp_ptr = + sparam.enable_filter_preprocess + ? static_cast( + param.preprocessed_filter->tensors[0].raw_ptr) + : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); + + src_ctype* a_panel = + reinterpret_cast(tmp_ptr + a_panel_offset); src_ctype* b_panel = reinterpret_cast(reinterpret_cast( bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX))); diff --git a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp index 57db835c0..1ab41b718 100644 --- a/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp +++ b/dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp @@ -26,7 +26,7 @@ void Strategy(matmul_param) = matmulparam; @@ -36,12 +36,17 @@ void Strategyget_bundle(matmul_param).get_size(0); - int8_t* a_panel = static_cast(bundle.get(BUNDLE_PACKA_INDEX)) + - group_id * packA_group_size + a_panel_offset; + + int8_t* tmp_ptr = + sparam.enable_filter_preprocess + ? static_cast( + param.preprocessed_filter->tensors[0].raw_ptr) + : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); + + int8_t* a_panel = tmp_ptr + + group_id * sparam.packA_group_size + a_panel_offset; matmul_param.A_ptr = const_cast(param.filter(group_id)) + oc_cur_index * matmul_param.K; @@ -60,20 +65,22 @@ void Strategyget_bundle(matmul_param).get_size(0); - a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset; + a_panel_offset = + sparam.group_id * sparam.packA_group_size + a_panel_offset; void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); - src_ctype* a_panel = reinterpret_cast( - reinterpret_cast(bundle.get(BUNDLE_PACKA_INDEX)) + - a_panel_offset); + int8_t* tmp_ptr = + sparam.enable_filter_preprocess + ? 
static_cast( + param.preprocessed_filter->tensors[0].raw_ptr) + : static_cast(bundle.get(BUNDLE_PACKA_INDEX)); + + src_ctype* a_panel = reinterpret_cast(tmp_ptr + a_panel_offset); src_ctype* b_panel = nullptr; src_ctype* im2col_dst = static_cast( diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index d519fc402..ceb292028 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -154,7 +154,8 @@ void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, bias{nullptr, bias_layout}; auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, preprocessed_filter); - ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); + //! should not pass workspace_size limit otherwise can not find match algo + ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam) <= workspace.size) { exec_preprocess_with_ncb_kern(fparam, algo); diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h index 84fc7198b..b228b5ab0 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.h +++ b/dnn/src/fallback/conv_bias/opr_impl.h @@ -299,6 +299,11 @@ private: const PreprocessedFilter* preprocessed_filter); }; +inline bool is_enable_filter_preprocess( + const ConvBiasImpl::NCBKernSizeParam& param) { + return param.preprocessed_filter && + param.preprocessed_filter->tensors.size() >= 1; +} } // namespace fallback } // namespace megdnn diff --git a/dnn/src/fallback/convolution/opr_impl.cpp b/dnn/src/fallback/convolution/opr_impl.cpp index 41f044502..b5248c351 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ b/dnn/src/fallback/convolution/opr_impl.cpp @@ -109,7 +109,9 @@ void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, workspace); - ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); + + //! 
should not pass workspace_size limit otherwise can not find match algo + ConvolutionImpl::Algorithm* algo = get_algorithm(fparam); if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam) <= workspace.size) { exec_preprocess_with_ncb_kern(fparam, algo); diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp index c410c2414..8eb1c033c 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp @@ -1837,6 +1837,21 @@ void checker_conv_bias(std::vector args, Handle* handle, {arg.src, arg.filter, arg.bias, {}, {}}); } } + +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2_PREPROCESS) { +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({1, 2, 3, 4, 5, 6, 7}, 2, false, false, false), \ + handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ + dtype::Float32(), dtype::Float32(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_F32K8X12X1") + cb("IM2COLMATMUL:AARCH64_F32K4X16X1") +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_F32") +#endif +#undef cb +} // clang-format off TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { #define cb(name) \ @@ -1851,6 +1866,22 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE2) { cb("IM2COLMATMUL:ARMV7_F32") #endif #undef cb + +} + +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1_PREPROCESS) { +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false), \ + handle(), nullptr, 0.001, dtype::Float32(), dtype::Float32(), \ + dtype::Float32(), dtype::Float32(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_F32K8X12X1") + cb("IM2COLMATMUL:AARCH64_F32K4X16X1") +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_F32") +#endif +#undef cb } TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COL_FP32_STRIDE1) { @@ -1899,6 +1930,37 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess(get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ + false, true, true), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ + dtype::QuantizedS8(60.25f), name); \ + check_conv_bias_preprocess( \ + get_conv_bias_args({1}, 2, false, false, false, true, true), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ + dtype::QuantizedS8(60.25f), name); + + float epsilon = 0.001; +#if MEGDNN_AARCH64 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); +#else + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); +#endif +#elif MEGDNN_ARMV7 + epsilon = 1; + cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); +#endif +#undef cb +} + + #if __ARM_FEATURE_DOTPROD TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { @@ -1924,6 +1986,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT) { #endif #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess(get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, \ + false, false, false, true), \ + handle(), &rng, epsilon, 
dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ + dtype::QuantizedS8(60.25f), name); \ + checker_conv_bias( \ + get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ + dtype::QuantizedS8(60.25f), name); + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); +#endif +#undef cb +} TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_MK4_DOT_S2_FUSE) { UniformIntRNG rng{-50, 50}; @@ -1968,6 +2053,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_S8x8x32_MK4_DOT_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ + true, false, true, false, false, true), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name); \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \ + false, false, true), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), {}, name); + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); +#endif +#undef cb +} + + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { UniformIntRNG rng{-50, 50}; @@ -1992,6 +2102,30 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32_MK4_DOT_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, \ + true, false, true, false, false, true), \ + handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ + dtype::Int32(), {}, name); \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({1}, 2, false, true, true, false, true, \ + false, false, true), \ + handle(), &rng, epsilon, dtype::Int8(), dtype::Int8(), \ + dtype::Int32(), {}, name); + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:AARCH32_INT8_MK4_8X4X4_DOTPROD:96"); +#endif +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_CONV1x1_QUANTIZEDSYM_MK4_DOT) { UniformIntRNG rng{-50, 50}; @@ -2055,6 +2189,41 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM) { #endif #undef cb } + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_IM2COLMATMUL_QUANTIZEDASYM_FILTERPREPROCESS) { + NormalRNG rng(128.f); + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false, \ + true, true), \ + handle(), &rng, epsilon, \ + dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ + dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ + dtype::QuantizedS32(1.2 * 1.3), \ + dtype::Quantized8Asymm(50.3f, (uint8_t)120), name); \ + check_conv_bias_preprocess( \ + get_conv_bias_args({1}, 2, false, false, false, true, true), \ + handle(), &rng, epsilon, \ + dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ + 
dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ + dtype::QuantizedS32(1.2 * 1.3), \ + dtype::Quantized8Asymm(50.3f, (uint8_t)120), name); + float epsilon = 0.001; +#if MEGDNN_AARCH64 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); +#else + cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); +#endif +#elif MEGDNN_ARMV7 + epsilon = 1; + cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); +#endif +#undef cb +} + #endif #if MEGDNN_AARCH64 || MEGDNN_ARMV7 @@ -2088,6 +2257,39 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32) { #endif #undef cb } + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUINT8x8x32_FILTERPREPROCESS) { + UniformIntRNG rng{-50, 50}; + float epsilon = 0.001; +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ + handle(), &rng, epsilon, \ + dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ + dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ + dtype::QuantizedS32(1.2 * 1.3), {}, name); \ + check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ + handle(), &rng, epsilon, \ + dtype::Quantized8Asymm(1.2f, (uint8_t)125), \ + dtype::Quantized8Asymm(1.3f, (uint8_t)129), \ + dtype::QuantizedS32(1.2 * 1.3), {}, name); + +#if MEGDNN_AARCH64 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X4_DOTPROD"); +#else + cb("IM2COLMATMUL:AARCH64_QUINT8_K8X8X8"); +#endif +#elif MEGDNN_ARMV7 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH32_QUINT8_K4X8X4"); +#endif + cb("IM2COLMATMUL:ARMV7_QUINT8_K4X8X8"); +#endif +#undef cb +} + + TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { UniformIntRNG rng{-50, 50}; float epsilon = 0.001; @@ -2127,6 +2329,51 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16) { #undef cb #undef cb_nchw44 } + +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_FILTERPREPROCESS) { + UniformIntRNG rng{-50, 50}; + float epsilon = 0.001; +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ + handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ + dtype::Int16{}, dtype::Int16{}, name); \ + check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ + handle(), &rng, epsilon, dtype::Int8{}, \ + dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ + name); + +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X16_K8X8X8"); + cb("IM2COLMATMUL:AARCH64_INT8X8X16_K4X4X16"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X8X8"); + cb("IM2COLMATMUL:ARMV7_INT8X8X16_K4X2X16"); +#endif +#undef cb +} + +TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_IM2COLMATMUL_INT8x8x16_NOPACK_FILTERPREPROCESS) { + UniformIntRNG rng{-50, 50}; + float epsilon = 0.001; +#define cb(name) \ + check_conv_bias_preprocess( \ + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true), \ + handle(), &rng, epsilon, dtype::Int8{}, dtype::Int8{}, \ + dtype::Int16{}, dtype::Int16{}, name); \ + check_conv_bias_preprocess(get_conv_bias_args({1}, 2, false, true, true), \ + handle(), &rng, epsilon, dtype::Int8{}, \ + dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, \ + name); + +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARM_COMMON_INT8X8X16"); +#endif +#undef cb +} + #endif #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -2147,6 +2394,31 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16) { dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, \ name); +#if MEGDNN_AARCH64 + 
cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:AARCH32_F16_K4X16X1"); +#endif +#undef cb +} + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP16_FILTERPREPROCESS) { + using namespace conv_bias; + + param::ConvBias cur_param; + + std::vector args = + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, false, false); + std::vector args1 = + get_conv_bias_args({1}, 2, false, false, false); + args.insert(args.begin(), args1.begin(), args1.end()); + + NormalRNG rng(1); +#define cb(name) \ + check_conv_bias_preprocess(args, handle(), &rng, 0.03, dtype::Float16{}, \ + dtype::Float16{}, dtype::Float16{}, \ + dtype::Float16{}, name); + #if MEGDNN_AARCH64 cb("IM2COLMATMUL:AARCH64_F16_K8X24X1"); #elif MEGDNN_ARMV7 @@ -2185,6 +2457,36 @@ void checker_conv_bias_mul_int8x8x32(std::vector args, } } +void checker_conv_bias_int8x8x32_preprocess(std::vector args, + Handle* handle, const char* algo_name) { + using namespace conv_bias; + + Checker> checker( + handle); + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo_name)); + checker.set_dtype(0, dtype::Int8()); + checker.set_dtype(1, dtype::Int8()); + checker.set_dtype(2, dtype::Int32()); + checker.set_dtype(4, dtype::Int32()); + for (auto&& arg : args) { + checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); + } + + UniformIntRNG rng{-50, 50}; + for (auto&& arg : args) { + checker.set_dtype(0, dtype::QuantizedS8(2.5f)) + .set_dtype(1, dtype::QuantizedS8(2.5f)) + .set_dtype(2, dtype::QuantizedS32(6.25f)) + .set_dtype(4, {}) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}, {}, {}}); + } +} + #if MEGDNN_AARCH64 || MEGDNN_ARMV7 #if !__ARM_FEATURE_DOTPROD TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { @@ -2201,6 +2503,20 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S2_PREPROCESS) { + using namespace conv_bias; + std::vector args = + get_nchw44_conv_bias_args({2, 5, 7}, 2, false, true, true); + +#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); +#else + cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); +#endif +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { using namespace conv_bias; std::vector args = @@ -2216,6 +2532,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32NCHW44_S1_PREPROCESS) { + using namespace conv_bias; + std::vector args = + get_nchw44_conv_bias_args({3, 4, 6}, 1, false, true, true); + +#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); +#else + cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); +#endif + +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2) { UniformIntRNG rng{-50, 50}; @@ -2234,6 +2565,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S2_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({3, 4, 6}, 2), handle(), &rng, epsilon, \ + dtype::QuantizedS8(2.5f), 
dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); + float epsilon = 0.001; +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); +#else + cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); +#endif +#undef cb +} + + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1) { UniformIntRNG rng{-50, 50}; @@ -2252,6 +2602,24 @@ TEST_F(ARM_COMMON_MULTI_THREADS, #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_S1_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({2, 5, 7}, 1), handle(), &rng, epsilon, \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); + float epsilon = 0.001; +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); +#else + cb("IM2COLMATMUL:ARMV7_INT8X8X32_MK4_4X2X16:96"); +#endif +#undef cb +} + #if MEGDNN_AARCH64 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE) { @@ -2266,6 +2634,21 @@ TEST_F(ARM_COMMON_MULTI_THREADS, cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); #undef cb } + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44_FUSE_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({3}, 1), handle(), &rng, epsilon, \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f), name); + float epsilon = 0.001; + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16:96"); +#undef cb +} + #endif #endif #endif @@ -2287,6 +2670,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); #undef cb } + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_IM2COLMATMUL_QUANTIZEDSYM_NCHW44DOT_FUSE_PREPROCESS) { + UniformIntRNG rng{-50, 50}; + +#define cb(name) \ + check_conv_bias_preprocess( \ + get_nchw44_conv_bias_args({3}, 1, false, false, false, false, \ + true, false, false, false), \ + handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \ + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), \ + dtype::QuantizedS8(60.25f), name); + float epsilon = 0.001; + cb("IM2COLMATMUL:AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD:96"); +#undef cb +} + #endif #endif @@ -2320,6 +2720,36 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { #undef cb } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) { + using namespace conv_bias; + std::vector args = + get_conv_bias_args({2, 3, 4, 5, 6, 7}, 1, false, true, true); + std::vector args1 = + get_conv_bias_args({1}, 2, false, true, true); + args.insert(args.begin(), args1.begin(), args1.end()); + +#define cb(name) checker_conv_bias_int8x8x32_preprocess(args, handle(), name); + +#if MEGDNN_AARCH64 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X12X4_DOTPROD"); +#else + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K8X8X8"); + cb("IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16"); +#endif +#elif MEGDNN_ARMV7 +#if __ARM_FEATURE_DOTPROD + cb("IM2COLMATMUL:AARCH32_INT8_K6X8X4"); +#endif + cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8"); +#endif + +#if MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_INT8X8X32_K4X2X16"); +#endif +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args( @@ -2331,25 +2761,62 @@ 
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32) { #endif } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S1_MK4_PACK_F32_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_nchw44_conv_bias_args( + {2, 4, 7}, 1, false, false, false, false, false, true,true); +#define cb(name) \ + check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ + dtype::Float32(), dtype::Float32(), \ + dtype::Float32(), dtype::Float32(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); +#endif +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args( {3, 5, 6}, 2, false, false, false, false, false, true, true); +#define cb(name) check_conv_bias(args, handle(), name); #if MEGDNN_AARCH64 - check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); + cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); #elif MEGDNN_ARMV7 - check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); + cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); #endif +#undef cb } + +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_nchw44_conv_bias_args( + {3}, 2, false, false, false, false, false, true, true, false); +#define cb(name) \ + check_conv_bias_preprocess(args, handle(), nullptr, 0.001, \ + dtype::Float32(), dtype::Float32(), \ + dtype::Float32(), dtype::Float32(), name); +#if MEGDNN_AARCH64 + cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); +#elif MEGDNN_ARMV7 + cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); +#endif +#undef cb +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_IM2COL_S2_MK4_PACK_F32_FUSE) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args( {3}, 2, false, false, false, false, false, true, true, false); +#define cb(name) check_conv_bias(args, handle(), name); #if MEGDNN_AARCH64 - check_conv_bias(args, handle(), "IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); + cb("IM2COLMATMUL:AARCH64_F32_MK4_K8X12X1"); #elif MEGDNN_ARMV7 - check_conv_bias(args, handle(), "IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); + cb("IM2COLMATMUL:ARMV7_F32_MK4_PACK_4X12"); #endif +#undef cb } /***************************** Conv1x1 Algo Test ***********************/ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) { diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp index 4f157dac3..31e71aa56 100644 --- a/dnn/test/common/conv_bias.cpp +++ b/dnn/test/common/conv_bias.cpp @@ -1118,6 +1118,30 @@ void checker_conv_bias_int8x8x16(std::vector args, } } +void check_conv_bias_preprocess(std::vector args, + Handle* handle, RNG* rng, float epsilon, + DType type0, DType type1, DType type2, + DType type3, const char* algo_name) { + using namespace conv_bias; + + Checker> checker( + handle); + checker.set_dtype(0, type0); + checker.set_dtype(1, type1); + checker.set_dtype(2, type2); + checker.set_dtype(4, type3); + checker.set_epsilon(epsilon); + if (NULL != rng) { + checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng); + } + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker(algo_name)); + for (auto&& arg : args) { + checker.set_param(arg.param).execs( + {arg.src, arg.filter, arg.bias, {}, {}}); + } +} + void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, param::ConvBias param, Handle* handle, diff --git a/dnn/test/common/conv_bias.h 
diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h
index d928fda37..76dc6be8e 100644
--- a/dnn/test/common/conv_bias.h
+++ b/dnn/test/common/conv_bias.h
@@ -58,7 +58,10 @@ std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size);
 std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
                                           bool compute_float32 = false,
                                           bool group_mode = false);
-
+void check_conv_bias_preprocess(std::vector<conv_bias::TestArg> args,
+                                Handle* handle, RNG* rng, float epsilon,
+                                DType type0, DType type1, DType type2,
+                                DType type3, const char* algo_name);
 template <typename Opr>
 using ConvBiasAlgoChecker = AlgoChecker<Opr>;
diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp
index cbdb9d2bf..a11804536 100644
--- a/dnn/test/x86/conv_bias.cpp
+++ b/dnn/test/x86/conv_bias.cpp
@@ -752,7 +752,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_DIRECT_STRIDE2) {
     }
 }
 
-TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32) {
     using namespace conv_bias;
     std::vector<TestArg> args;
 
@@ -842,6 +842,98 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X) {
 #undef cb2
 }
 
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8X8X32_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+    };
+
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8})
+                for (size_t p : {0, 2})
+                    for (size_t size : {20, 21, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    //! test OC block
+    run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY);
+
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>>
+            checker(handle());
+    UniformIntRNG rng{-50, 50};
+#define cb(algo_name)                                                          \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo_name));       \
+    checker.set_dtype(0, dtype::Int8());                                       \
+    checker.set_dtype(1, dtype::Int8());                                       \
+    checker.set_dtype(2, dtype::Int32());                                      \
+    checker.set_dtype(4, dtype::Int32());                                      \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
+    }                                                                          \
+    for (auto&& arg : args) {                                                  \
+        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                         \
+                .set_dtype(1, dtype::QuantizedS8(2.5f))                        \
+                .set_dtype(2, dtype::QuantizedS32(6.25f))                      \
+                .set_dtype(4, {})                                              \
+                .set_rng(0, &rng)                                              \
+                .set_rng(1, &rng)                                              \
+                .set_rng(2, &rng)                                              \
+                .set_param(arg.param)                                          \
+                .execs({arg.src, arg.filter, {}, {}, {}});                     \
+    }
+#define cb2(algo_name)                                                         \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo_name));       \
+    checker.set_dtype(0, dtype::Int8());                                       \
+    checker.set_dtype(1, dtype::Int8());                                       \
+    checker.set_dtype(2, dtype::Int16());                                      \
+    checker.set_dtype(4, dtype::Int16());                                      \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, {}, {}, {}}); \
+    }
+
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (megdnn::x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
+    }
+#endif
+    if (megdnn::x86::is_supported(x86::SIMDType::AVX2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2");
+        cb2("IM2COLMATMUL:X86_INT8X8X16_AVX2");
+    }
+    if (::megdnn::x86::is_supported(::megdnn::x86::SIMDType::SSE4_2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_SSE_4X8X2");
+        cb2("IM2COLMATMUL:X86_INT8X8X16_SSE");
+    }
+
+#undef cb
+#undef cb2
+}
+
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) {
     using namespace conv_bias;
     std::vector<TestArg> args;
@@ -950,6 +1042,61 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32) {
 #undef cb
 }
 
+TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w},
+                TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                            (w + 2 * p - kernel) / param.stride_w + 1});
+    };
+
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16, 300})
+                for (size_t p : {0, 2})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>>
+            checker(handle());
+#define cb(algo_name)                                                          \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo_name));       \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs(                                    \
+                {arg.src, arg.filter, arg.bias, {}, {}});                      \
+    }
+    cb("IM2COLMATMUL:X86_F32_BLAS");
+
+#undef cb
+}
+
 #endif
@@ -1020,6 +1167,73 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
 #undef cb
 }
 
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w},
+                TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                            (w + 2 * p - kernel) / param.stride_w + 1});
+        param.sparse = param::ConvBias::Sparse::GROUP;
+        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, kernel, kernel},
+                          TensorShape{});
+        args.emplace_back(param, TensorShape{1, 2 * ic, h, w},
+                          TensorShape{2, oc, ic, kernel, kernel},
+                          TensorShape{1, oc * 2, 1, 1});
+
+        args.emplace_back(
+                param, TensorShape{1, 2 * ic, h, w},
+                TensorShape{2, oc, ic, kernel, kernel},
+                TensorShape{1, 2 * oc, (h + 2 * param.pad_h - kernel) / 1 + 1,
+                            (w + 2 * param.pad_w - kernel) / 1 + 1});
+    };
+
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16})
+                for (size_t p : {0, 1})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>>
+            checker(handle());
+#define cb(algo_name)                                                          \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo_name));       \
+    for (auto&& arg : args) {                                                  \
+        checker.set_param(arg.param).execs(                                    \
+                {arg.src, arg.filter, arg.bias, {}, {}});                      \
+    }
+
+    cb("IM2COLMATMUL:X86_F32_MKL_PACKA:192");
+
+#undef cb
+}
+
 /**************************** Conv1x1 PackA *************************/
 namespace {
 void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
@@ -1169,6 +1383,77 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
 #undef cb
 }
 
+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
+                   size_t p, NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+
+        //! no bias
+        args.emplace_back(param, TensorShape{1, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel}, TensorShape{});
+        //! bias channel
+        args.emplace_back(param, TensorShape{2, ic, h, w},
+                          TensorShape{oc, ic, kernel, kernel},
+                          TensorShape{1, oc, 1, 1});
+    };
+
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8})
+                for (size_t p : {0, 2})
+                    for (size_t size : {20, 21, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU,
+                              NonlineMode::H_SWISH}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>>
+            checker(handle());
+#define cb(algo_name)                                                          \
+    checker.set_before_exec_callback(                                          \
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo_name));       \
+    UniformIntRNG rng{-50, 50};                                                \
+    for (auto&& arg : args) {                                                  \
+        checker.set_dtype(0, dtype::QuantizedS8(2.5f))                         \
+                .set_dtype(1, dtype::QuantizedS8(2.5f))                        \
+                .set_dtype(2, dtype::QuantizedS32(6.25f))                      \
+                .set_dtype(4, dtype::QuantizedS8(60.25f))                      \
+                .set_rng(0, &rng)                                              \
+                .set_rng(1, &rng)                                              \
+                .set_rng(2, &rng)                                              \
+                .set_param(arg.param)                                          \
+                .execs({arg.src, arg.filter, {}, {}, {}});                     \
+    }
+
+#if MEGDNN_X86_WITH_MKL_DNN
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_MKLDNN");
+    }
+#endif
+#if MEGDNN_X86_WITH_VNNI
+    if (x86::is_supported(x86::SIMDType::VNNI)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_VNNI");
+    }
+#endif
+    if (x86::is_supported(x86::SIMDType::AVX2)) {
+        cb("IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16");
+    }
+
+#undef cb
+}
+
 TEST_F(X86, CONV_BIAS_MATMUL) {
     using namespace conv_bias;
     std::vector<TestArg> args;
-- 
GitLab