/**
 * \file dnn/src/fallback/conv_bias/im2col/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "src/fallback/conv_bias/im2col/algos.h"
#include "src/fallback/conv_bias/im2col/factory.h"
#include "megdnn/opr_param_defs.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/naive/convolution/helper.h"

#include "midout.h"

MIDOUT_DECL(megdnn_fallback_im2col)

using namespace megdnn;
using namespace fallback;
using namespace im2col;

/*======================== AlgoIm2col=======================*/
/*!
 * \brief Indices of the individual parts inside the im2col workspace bundle.
 *
 * The values give the slot order of the top-level bundle (padding, packA,
 * per-thread area), so the pointer of a part can be fetched with
 * bundle.get(<index>).
 */
struct Im2colBundelIndex {
    static constexpr size_t BUNDLE_PADDING_INDEX = 0_z;
    static constexpr size_t BUNDLE_PACKA_INDEX = 1_z;
    static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
};

using Pack_Mode = fallback::MatrixMulImpl::AlgoBase::PackMode;

/*!
 * \brief Process one input channel copy padding.
 *
 * Thin free-function wrapper: forwards every argument unchanged to
 * StrategyBase::copy_padding_kern of \p im2colstrategy.
 *
 * \param bundle         top-level workspace bundle
 * \param param          kernel parameters of the running convolution
 * \param ncb_index      index of the job inside the dispatched nd-range
 * \param im2colstrategy strategy object that performs the copy
 * \param pack_oc_size   channel pack size passed through to the strategy
 */
static void copy_padding_kern(WorkspaceBundle bundle,
                              const ConvBiasImpl::NCBKernParam& param,
                              const ConvBiasImpl::NCBKernIndex& ncb_index,
                              StrategyBase* im2colstrategy,
                              size_t pack_oc_size) {
    im2colstrategy->copy_padding_kern(bundle, param, ncb_index, pack_oc_size);
}
packA_kern static void packA_kern(WorkspaceBundle bundle, const fallback::ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmulparam, fallback::MatrixMulImpl::AlgoBase* matmul_algo, const fallback::ConvBiasImpl::NCBKernIndex& ncb_index, StrategyBase* im2colstrategy, size_t pack_oc_size) { im2colstrategy->packA_kern(bundle, param, matmulparam, matmul_algo, ncb_index, pack_oc_size); } /*! * *\brief Im2colKerns collects all the im2col kerns in it */ template class Im2colKerns; template <> class Im2colKerns { public: //! conv kernel static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( ohw_tile_size, strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( strategyparam.oc_tile_size, OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); strategyparam.batch_id = ncb_index.ndrange_id[0]; strategyparam.group_id = ncb_index.ndrange_id[1]; strategyparam.oc_cur_index = ncb_index.ndrange_id[3] * strategyparam.oc_tile_size; strategyparam.oc_end_index = strategyparam.oc_cur_index + output_block_oc_size; strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_size = output_block_size; bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; //! 
1.Im2col im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, matmul_param, matmul_algo); //! 2.packb and matmul compute im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, matmul_param, matmul_algo, ncb_index); //! 3.postprocess and copy dst if need im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); } WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; size_t pack_oc_size = pack_size(param.filter_meta.format); size_t im2col = 0, packb = 0, bias_temp = 0; bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT; megdnn_assert(default_pack, "only support default packa"); size_t im2col_dst_size = IC * FH * FW * ohw_tile_size * sizeof(param.src_type); size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size * sizeof(param.bias_type); //! matmul_dst and im2col_dst use the same memory WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); packb = wb.get_size(1); im2col = std::max(im2col_dst_size, matmul_dst_size); if (param.bias_mode == megdnn::BiasMode::BIAS) { bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); } return {nullptr, {packb, im2col, bias_temp}}; } }; template <> class Im2colKerns { public: //! 
conv kernel static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( ohw_tile_size, strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( strategyparam.oc_tile_size, OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; strategyparam.batch_id = ncb_index.ndrange_id[0]; strategyparam.group_id = ncb_index.ndrange_id[1]; strategyparam.oc_cur_index = ncb_index.ndrange_id[3] * strategyparam.oc_tile_size; strategyparam.oc_end_index = strategyparam.oc_cur_index + output_block_oc_size; strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_size = output_block_size; //! 1.Im2col im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, matmul_param, matmul_algo); //! 2.packb and matmul compute im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, matmul_param, matmul_algo, ncb_index); //! 
3.postprocess and copy dst if need im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); } WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0; bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; megdnn_assert(only_packA, "onlysupport onlypackA mode"); size_t im2col_dst_size = IC * FH * FW * ohw_tile_size * sizeof(param.src_type); size_t matmul_dst_size = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); //! matmul_dst and im2col_dst use the same memory WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param); packb = wb.get_size(1); im2col = im2col_dst_size; matmul_dst = matmul_dst_size; if (param.bias_mode == megdnn::BiasMode::BIAS) { bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); } return {nullptr, {packb, im2col, matmul_dst, bias_temp}}; } }; template <> class Im2colKerns { public: //! 
conv kernel static void kerns( WorkspaceBundle bundle, WorkspaceBundle bundle_thread, const ConvBiasImpl::NCBKernParam& param, fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param, fallback::MatrixMulImpl::AlgoBase* matmul_algo, StrategyParam strategyparam, fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size, StrategyBase* im2colstrategy) { size_t OC = param.filter_meta.ocpg; size_t output_block_size = std::min( ohw_tile_size, strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size); size_t output_block_oc_size = std::min( strategyparam.oc_tile_size, OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size); strategyparam.batch_id = ncb_index.ndrange_id[0]; strategyparam.group_id = ncb_index.ndrange_id[1]; strategyparam.oc_cur_index = ncb_index.ndrange_id[3] * strategyparam.oc_tile_size; strategyparam.oc_end_index = strategyparam.oc_cur_index + output_block_oc_size; strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size; strategyparam.output_block_oc_size = output_block_oc_size; strategyparam.output_block_size = output_block_size; bundle.set(param.workspace_ptr); bundle_thread.set( static_cast( bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) + bundle_thread.total_size_in_bytes() * ncb_index.thread_id); fallback::MatrixMulImpl::KernParam matmul_param; static_cast(matmul_param) = matmul_kernsize_param; //! 1.Im2col im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param, matmul_param, matmul_algo); //! 2.packb and matmul compute im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread, matmul_param, matmul_algo, ncb_index); //! 
3.postprocess and copy dst if need im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread); } WorkspaceBundle get_thread_bundle( const fallback::ConvBiasImpl::NCBKernSizeParam& param, fallback::MatrixMulImpl::KernSizeParam im2col_kern_param, MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size, size_t oc_tile_size) { size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0], FW = param.filter_meta.spatial[1]; size_t ohw = param.osz[0] * param.osz[1]; size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0; bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK; megdnn_assert(no_pack, "only support no pack"); bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && param.dst_type.enumv() == DTypeEnum::QuantizedS8) || (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); size_t im2col_dst_size = IC * FH * FW * ohw_tile_size * sizeof(param.src_type); size_t matmul_dst_size = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); im2col = im2col_dst_size; if (is_dst_8bit) { matmul_dst = matmul_dst_size; } else { matmul_dst = ohw_tile_size >= ohw ? 
0 : matmul_dst_size; } matmul_compute = matmul_algo->get_workspace(im2col_kern_param); if (param.bias_mode == megdnn::BiasMode::BIAS) { bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type); } return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}}; } }; fallback::MatrixMulImpl::KernSizeParam ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param, size_t ohw_tile_size, size_t oc_tile_size) const { auto format = param::MatrixMul::Format::DEFAULT; size_t pack_oc_size = pack_size(param.filter_meta.format); if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { format = param::MatrixMul::Format::MK4; } else if(param.filter_meta.format == param::ConvBias::Format::NCHW44_DOT){ format = param::MatrixMul::Format::MK4_DOT; } size_t M = oc_tile_size; size_t N = ohw_tile_size; size_t K = param.filter_meta.icpg * param.filter_meta.spatial[0] * param.filter_meta.spatial[1]; size_t LDA = pack_oc_size * K, LDB = pack_oc_size * N, LDC = N * pack_oc_size; bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && param.dst_type.enumv() == DTypeEnum::QuantizedS8) || (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); return {param.filter_type, param.src_type, is_dst_8bit ? 
param.bias_type : param.dst_type, M, N, K, LDA, LDB, LDC, false, false, param::MatrixMul::ComputeMode::DEFAULT, format}; } void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block( const NCBKernSizeParam& param, size_t& oc_tile_size, size_t& ohw_tile_size, size_t block_m, size_t block_n, fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) const { size_t nr_threads = param.nr_threads; size_t OC = param.filter_meta.ocpg; size_t ohw = param.osz[0] * param.osz[1]; oc_tile_size = DEFAULT_OC_TILE_SIZE; ohw_tile_size = m_ohw_tile_size; oc_tile_size = std::min(oc_tile_size, OC); ohw_tile_size = std::min(ohw_tile_size, ohw); if (nr_threads > 1) { if (ohw / ohw_tile_size < nr_threads) { ohw_tile_size = round_up(div_ceil(ohw, nr_threads), block_n); if (ohw_tile_size < DEFAULT_OHW_MIN_TILE_SIZE) { ohw_tile_size = ohw; oc_tile_size = round_up(div_ceil(OC, nr_threads), block_m); if (oc_tile_size > DEFAULT_OC_MAX_TILE_SIZE) { oc_tile_size = DEFAULT_OC_MAX_TILE_SIZE; } else if (oc_tile_size < DEFAULT_OC_MIN_TILE_SIZE) { oc_tile_size = DEFAULT_OC_MIN_TILE_SIZE; } } } } else { //! 
in no_pack mode don't do block operation when using single thread if (pack_mode == fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK) { ohw_tile_size = ohw; oc_tile_size = OC; } } } WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle( const NCBKernSizeParam& param) const { UNPACK_CONV_F32_NCB_KERN_SIZES(param); MEGDNN_MARK_USED_VAR(OC); MEGDNN_MARK_USED_VAR(OH); MEGDNN_MARK_USED_VAR(OW); MEGDNN_MARK_USED_VAR(FH); MEGDNN_MARK_USED_VAR(FW); MEGDNN_MARK_USED_VAR(SW); MEGDNN_MARK_USED_VAR(SH); auto IW2 = IH + 2 * PH; auto IH2 = IW + 2 * PW; bool no_need_pading = (PH == 0 && PW == 0); size_t padding = 0, packa_size = 0, packa_group_size = 0; size_t nr_threads = param.nr_threads; size_t GROUP = param.filter_meta.group; bool need_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT; bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA; size_t oc_tile_size = 0, ohw_tile_size = 0; if (need_pack || only_packA) { auto inner_block = m_matmul_algo->get_inner_block_size(); choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, inner_block.m, inner_block.n, m_matmul_algo->packmode()); auto im2col_kern_param = get_matmul_kern_param( param, ohw_tile_size, only_packA ? oc_tile_size : OC); size_t oc_parallel_times = div_ceil(OC, oc_tile_size); WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param); packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0) : wb.get_size(0); } else { //! not support pack,not need pack size_t nopack_default_blockm = 8; size_t nopack_default_blockn = 16; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, nopack_default_blockm, nopack_default_blockn, m_matmul_algo->packmode()); packa_group_size = 0; } if (no_need_pading) { padding = 0; //! not need padding } else { padding = (GROUP * N * IC * IH2 * IW2) * sizeof(param.src_type); //! for padding } packa_size = GROUP * packa_group_size; //! 
for packA size = GROUP * a_size WorkspaceBundle ws = {nullptr, {}}; auto im2col_kern_param = get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { Im2colKerns defaultkern; ws = defaultkern.get_thread_bundle(param, im2col_kern_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { Im2colKerns onlypackakern; ws = onlypackakern.get_thread_bundle(param, im2col_kern_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } else { Im2colKerns nopackkern; ws = nopackkern.get_thread_bundle(param, im2col_kern_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } return {nullptr, {padding, packa_size, ws.total_size_in_bytes() * nr_threads}}; } size_t ConvBiasImpl::AlgoIm2col::get_workspace( ConvBiasImpl*, const NCBKernSizeParam& p) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) { return get_bundle(p).total_size_in_bytes(); } MIDOUT_END(); return 0; } SmallVector ConvBiasImpl::AlgoIm2col::dispatch_kerns( ConvBiasImpl*, const NCBKernSizeParam& param) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) { UNPACK_CONV_F32_NCB_KERN_SIZES(param); MEGDNN_MARK_USED_VAR(SH); MEGDNN_MARK_USED_VAR(SW); MEGDNN_MARK_USED_VAR(IH); MEGDNN_MARK_USED_VAR(IW); MEGDNN_MARK_USED_VAR(FH); MEGDNN_MARK_USED_VAR(FW); size_t oc_tile_size = 0, ohw_tile_size = 0; size_t ohw = OH * OW; size_t GROUP = param.filter_meta.group; WorkspaceBundle bundle = get_bundle(param); WorkspaceBundle bundle_thread = {nullptr, {}}; bool need_padding = (PH != 0 || PW != 0); Pack_Mode packmode = m_matmul_algo->packmode(); bool default_pack = packmode == Pack_Mode::DEFAULT; bool no_pack = packmode == Pack_Mode::NO_PACK; bool only_packA = packmode == Pack_Mode::ONLY_PACKA; if (default_pack || only_packA) { auto inner_block = m_matmul_algo->get_inner_block_size(); choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, inner_block.m, inner_block.n, m_matmul_algo->packmode()); } else { //! 
nopack_mode size_t nopack_default_blockm = 8; size_t nopack_default_blockn = 16; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, nopack_default_blockm, nopack_default_blockn, m_matmul_algo->packmode()); } size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size); size_t oc_parallel_times = div_ceil(OC, oc_tile_size); size_t packa_parallel_times = 0; size_t pack_oc_size = pack_size(param.filter_meta.format); if (only_packA) { packa_parallel_times = div_ceil(OC, oc_tile_size); } else if (default_pack) { packa_parallel_times = div_ceil( OC, m_matmul_algo->get_inner_block_size().m); } auto matmul_param = get_matmul_kern_param( param, ohw_tile_size, only_packA ? oc_tile_size : OC); if (m_matmul_algo->packmode() == Pack_Mode::DEFAULT) { Im2colKerns defaultkern; bundle_thread = defaultkern.get_thread_bundle( param, matmul_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } else if (m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA) { Im2colKerns onlypackakern; bundle_thread = onlypackakern.get_thread_bundle( param, matmul_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } else { Im2colKerns nopackkern; bundle_thread = nopackkern.get_thread_bundle( param, matmul_param, m_matmul_algo, ohw_tile_size, oc_tile_size); } StrategyParam strategyparam; strategyparam.ohw = ohw; strategyparam.is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && param.dst_type.enumv() == DTypeEnum::QuantizedS8) || (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); strategyparam.is_ohw_size_bigger = (ohw_tile_size >= ohw); strategyparam.skip_copy_dst = strategyparam.is_ohw_size_bigger && !strategyparam.is_dst_8bit; strategyparam.oc_tile_size = oc_tile_size; strategyparam.pack_oc_size = pack_oc_size; SmallVector ret_kern; MIDOUT_BEGIN( megdnn_fallback_im2col, midout_iv("ConvBiasImpl::AlgoIm2col::dispatch_kerns"_hash)) { StrategyBase* im2colstrategy = Factory::get_im2col_strategy(param, m_matmul_algo); auto 
kern_padding = [bundle, im2colstrategy, pack_oc_size = pack_oc_size]( const NCBKernParam& param, const NCBKernIndex& ncb_index) { copy_padding_kern(bundle, param, ncb_index, im2colstrategy, pack_oc_size); }; auto kern_packA = [bundle, matmul_algo = m_matmul_algo, matmul_param, im2colstrategy, pack_oc_size = pack_oc_size]( const NCBKernParam& param, const NCBKernIndex& ncb_index) { packA_kern(bundle, param, matmul_param, matmul_algo, ncb_index, im2colstrategy, pack_oc_size); }; if (default_pack) { auto kern_compute_default = [bundle, bundle_thread, matmul_param, matmul_algo = m_matmul_algo, ohw_tile_size = ohw_tile_size, strategyparam = strategyparam, im2colstrategy](const NCBKernParam& param, const NCBKernIndex& ncb_index) { Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, strategyparam, ncb_index, ohw_tile_size, im2colstrategy); }; ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); if (need_padding) { ret_kern.push_back({kern_padding, {param.n, GROUP, IC / pack_oc_size}}); } ret_kern.push_back( {kern_compute_default, {N, GROUP, ohw_parallel_times, oc_parallel_times}}); } else if (only_packA) { auto kern_compute_onlypackA = [bundle, bundle_thread, matmul_param, matmul_algo = m_matmul_algo, strategyparam = strategyparam, ohw_tile_size = ohw_tile_size, im2colstrategy](const NCBKernParam& param, const NCBKernIndex& ncb_index) { Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, strategyparam, ncb_index, ohw_tile_size, im2colstrategy); }; ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}}); if (need_padding) { ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); } ret_kern.push_back( {kern_compute_onlypackA, {N, GROUP, ohw_parallel_times, oc_parallel_times}}); } else if (no_pack) { auto kern_compute_nopack = [bundle, bundle_thread, matmul_param, matmul_algo = m_matmul_algo, strategyparam = strategyparam, ohw_tile_size = ohw_tile_size, im2colstrategy](const NCBKernParam& param, const 
NCBKernIndex& ncb_index) { Im2colKerns::kerns( bundle, bundle_thread, param, matmul_param, matmul_algo, strategyparam, ncb_index, ohw_tile_size, im2colstrategy); }; if (need_padding) { ret_kern.push_back({kern_padding, {param.n, GROUP, IC}}); } ret_kern.push_back( {kern_compute_nopack, {N, GROUP, ohw_parallel_times, oc_parallel_times}}); } return ret_kern; } MIDOUT_END(); return {}; } MIDOUT_END(); return {}; } bool ConvBiasImpl::AlgoIm2col::usable( ConvBiasImpl* opr, const NCBKernSizeParam& param, AlgoSelectionStrategy /*algo_selection_strategy*/) const { MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 2) { if (opr->param().format != param::ConvBias::Format::NCHW && opr->param().format != param::ConvBias::Format::NCHW44_DOT && opr->param().format != param::ConvBias::Format::NCHW44) { return false; } if (param.src_type.enumv() != param.filter_type.enumv() && param.src_type.enumv() != DTypeEnum::Int8 && param.src_type.enumv() != DTypeEnum::QuantizedS8 && param.src_type.enumv() != DTypeEnum::Quantized8Asymm && #if !MEGDNN_DISABLE_FLOAT16 param.src_type.enumv() != DTypeEnum::Float16 && #endif param.src_type.enumv() != DTypeEnum::Float32) { return false; } //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode is //! identity otherwise return false mean that 8x8x32 and 8x8x16 not //! support PostProcess if (param.dst_type.enumv() == DTypeEnum::Int16 || param.dst_type.enumv() == DTypeEnum::Int32 || param.dst_type.enumv() == DTypeEnum::QuantizedS32) { if (param.bias_mode != megdnn::BiasMode::NO_BIAS || param.nonlineMode != megdnn::NonlineMode::IDENTITY) { return false; } } if (opr->param().format == param::ConvBias::Format::NCHW44 || opr->param().format == param::ConvBias::Format::NCHW44_DOT) { //! current NCHW44 im2col only support DEFAULT mode matmul if (m_matmul_algo->packmode() != Pack_Mode::DEFAULT) { return false; //! 
nchw44 hybird mode and channel wise is not support } else if (param.filter_meta.icpg < 4_z || param.filter_meta.icpg == 1 || param.filter_meta.ocpg == 1) { return false; } } size_t oc_tile_size = 0, ohw_tile_size = 0; Pack_Mode packmode = m_matmul_algo->packmode(); bool default_pack = packmode == Pack_Mode::DEFAULT; bool only_packA = packmode == Pack_Mode::ONLY_PACKA; if (default_pack || only_packA) { auto inner_block = m_matmul_algo->get_inner_block_size(); choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, inner_block.m, inner_block.n, m_matmul_algo->packmode()); } else { //! not support pack,not need pack size_t nopack_default_blockm = 8; size_t nopack_default_blockn = 16; choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size, nopack_default_blockm, nopack_default_blockn, m_matmul_algo->packmode()); } fallback::MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(param, ohw_tile_size, oc_tile_size); bool matmulusable = m_matmul_algo->usable(matmul_param); return matmulusable && (!(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && param.filter_meta.spatial[0] == 1 && param.filter_meta.stride[0] == param.filter_meta.stride[1] && param.filter_meta.stride[0] == 1)) && (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] && param.filter_meta.dilation[0] == 1) && param.compute_mode == param::ConvBias::ComputeMode::DEFAULT; } MIDOUT_END(); return false; } // vim: syntax=cpp.doxygen