From fff2cdc7bb19f1a1f96e5d17a90e250bdaf4e6ea Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 24 Jun 2020 19:15:17 +0800 Subject: [PATCH] feat(dnn/fallback): add winograd weight preprocess GitOrigin-RevId: 4741298e44a94ec439df1a4d372ac9fff2075e3f --- dnn/src/arm_common/conv_bias/f16/algos.cpp | 169 +------ dnn/src/arm_common/conv_bias/f16/algos.h | 69 +-- dnn/src/arm_common/conv_bias/fp32/algos.cpp | 293 ++--------- dnn/src/arm_common/conv_bias/fp32/algos.h | 105 +--- dnn/src/arm_common/conv_bias/int8/algos.cpp | 124 +---- dnn/src/arm_common/conv_bias/int8/algos.h | 47 +- .../conv_bias/int8/direct_nchw44_algo.cpp | 1 - .../winograd_filter_preprocess/opr_impl.cpp | 4 +- dnn/src/fallback/conv_bias/algos.cpp | 54 +-- dnn/src/fallback/conv_bias/common.h | 24 + .../fallback/conv_bias/winograd/winograd.h | 259 ++++++++-- dnn/src/x86/conv_bias/f32/algos.h | 28 +- dnn/src/x86/conv_bias/f32/winograd_algo.cpp | 79 +-- dnn/test/arm_common/conv_bias.cpp | 17 + .../arm_common/conv_bias_multi_thread.cpp | 459 ++++++++++++++++++ dnn/test/x86/conv_bias.cpp | 33 +- 16 files changed, 896 insertions(+), 869 deletions(-) diff --git a/dnn/src/arm_common/conv_bias/f16/algos.cpp b/dnn/src/arm_common/conv_bias/f16/algos.cpp index 47af1e38..a5986d70 100644 --- a/dnn/src/arm_common/conv_bias/f16/algos.cpp +++ b/dnn/src/arm_common/conv_bias/f16/algos.cpp @@ -34,11 +34,9 @@ bool ConvBiasImpl::AlgoFP16WinogradF23::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 0) { using Strategy = winograd::winograd_2x3_4x4_f16; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -63,38 +61,10 @@ bool ConvBiasImpl::AlgoFP16WinogradF23::usable( return false; } -size_t ConvBiasImpl::AlgoFP16WinogradF23::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 1) { - winograd::winograd_2x3_4x4_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP16WinogradF23::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 0, 2) { - winograd::winograd_2x3_4x4_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP16WinogradF23, + winograd::winograd_2x3_4x4_f16, + megdnn_arm_common_winograd_fp16, + param::MatrixMul::Format::DEFAULT); /* ======================= AlgoFP16WinogradF45 ======================== */ @@ -106,11 +76,9 @@ bool ConvBiasImpl::AlgoFP16WinogradF45::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 0) { using Strategy = winograd::winograd_4x5_1x1_f16; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -133,37 +101,11 @@ bool ConvBiasImpl::AlgoFP16WinogradF45::usable( return false; } -size_t ConvBiasImpl::AlgoFP16WinogradF45::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - winograd::winograd_4x5_1x1_f16 strategy(param.src_type, param.filter_type, - param.dst_type); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 1) { - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP16WinogradF45, + winograd::winograd_4x5_1x1_f16, + megdnn_arm_common_winograd_fp16, + param::MatrixMul::Format::DEFAULT); -SmallVector -ConvBiasImpl::AlgoFP16WinogradF45::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 1, 2) { - winograd::winograd_4x5_1x1_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} /* ======================= AlgoFP16WinogradF63 ======================== */ bool ConvBiasImpl::AlgoFP16WinogradF63::usable( @@ -174,11 +116,9 @@ bool ConvBiasImpl::AlgoFP16WinogradF63::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 0) { using Strategy = winograd::winograd_6x3_1x1_f16; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -201,37 +141,10 @@ bool ConvBiasImpl::AlgoFP16WinogradF63::usable( return false; } -size_t ConvBiasImpl::AlgoFP16WinogradF63::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - winograd::winograd_6x3_1x1_f16 strategy(param.src_type, param.filter_type, - param.dst_type); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 1) { - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP16WinogradF63::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 2, 2) { - winograd::winograd_6x3_1x1_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP16WinogradF63, + winograd::winograd_6x3_1x1_f16, + megdnn_arm_common_winograd_fp16, + param::MatrixMul::Format::DEFAULT); /* ======================= AlgoFP16WinogradF23_8x8 ======================== */ @@ -249,8 +162,7 @@ bool ConvBiasImpl::AlgoFP16WinogradF23_8x8::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == PackMode::NO_PACK && @@ -275,39 +187,10 @@ bool ConvBiasImpl::AlgoFP16WinogradF23_8x8::usable( return false; } -size_t ConvBiasImpl::AlgoFP16WinogradF23_8x8::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp16, 3, 1) { - winograd::winograd_2x3_8x8_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP16WinogradF23_8x8::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 2) { - winograd::winograd_2x3_8x8_f16 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP16WinogradF23_8x8, + winograd::winograd_2x3_8x8_f16, + megdnn_arm_common_winograd_fp16, + param::MatrixMul::Format::MK8); /*========================from Convolution=============================*/ diff --git a/dnn/src/arm_common/conv_bias/f16/algos.h b/dnn/src/arm_common/conv_bias/f16/algos.h index 2dea43c9..a429a6db 100644 --- a/dnn/src/arm_common/conv_bias/f16/algos.h +++ b/dnn/src/arm_common/conv_bias/f16/algos.h @@ -22,7 +22,6 @@ public: AlgoFP16WinogradF23(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -30,22 +29,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP16WinogradF45 final : public AlgoBase { @@ -53,7 +37,6 @@ public: AlgoFP16WinogradF45(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -61,30 +44,14 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); - uint32_t m_tile_size; }; class ConvBiasImpl::AlgoFP16WinogradF63 final : public AlgoBase { public: AlgoFP16WinogradF63(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -93,29 +60,13 @@ public: return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP16WinogradF23_8x8 final : public AlgoBase { public: AlgoFP16WinogradF23_8x8(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -123,19 +74,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoF16Direct final : public AlgoBase { diff --git a/dnn/src/arm_common/conv_bias/fp32/algos.cpp b/dnn/src/arm_common/conv_bias/fp32/algos.cpp index 63087636..96efd692 100644 --- a/dnn/src/arm_common/conv_bias/fp32/algos.cpp +++ b/dnn/src/arm_common/conv_bias/fp32/algos.cpp @@ -43,8 +43,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == PackMode::NO_PACK && @@ -69,39 +68,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 1) { - winograd::winograd_2x3_4x4_f strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF23_4x4::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 2) { - winograd::winograd_2x3_4x4_f strategy(param.src_type, param.filter_type, - param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4, + winograd::winograd_2x3_4x4_f, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::MK4); /* ======================= AlgoFP32WinogradF63 ======================== */ @@ -113,11 +83,9 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 0) { using Strategy = winograd::winograd_6x3_1x1_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -140,37 +108,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF63::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF63::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 1) { - winograd::winograd_6x3_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF63::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 2) { - winograd::winograd_6x3_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63, + winograd::winograd_6x3_1x1_f, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::DEFAULT); /* ======================= AlgoFP32WinogradF54 ======================== */ @@ -182,11 +123,9 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 0) { using Strategy = winograd::winograd_5x4_1x1_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -209,37 +148,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF54::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF54::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 1) { - winograd::winograd_5x4_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF54::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 2) { - winograd::winograd_5x4_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF54, + winograd::winograd_5x4_1x1_f, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::DEFAULT); /* ======================= AlgoFP32WinogradF45 ======================== */ @@ -251,11 +163,9 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable( MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 0) { using Strategy = winograd::winograd_4x5_1x1_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, m_tile_size, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -278,37 +188,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF45::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF45::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 1) { - winograd::winograd_4x5_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF45::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 2) { - winograd::winograd_4x5_1x1_f strategy(param.src_type, param.filter_type, - param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF45, + winograd::winograd_4x5_1x1_f, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::DEFAULT); /* ======================= AlgoFP32WinogradF63_4x4 ======================== */ @@ -326,8 +209,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == PackMode::NO_PACK && @@ -354,39 +236,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 1) { - winograd::winograd_6x3_4x4_f strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF63_4x4::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 2) { - winograd::winograd_6x3_4x4_f strategy(param.src_type, param.filter_type, - param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4, + winograd::winograd_6x3_4x4_f, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::MK4); /* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */ @@ -404,8 +257,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == @@ -431,41 +283,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, - midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) { - winograd::winograd_F23_mk4_f_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, - midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) { - winograd::winograd_F23_mk4_f_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4_NCHW44, + winograd::winograd_F23_mk4_f_nchw44, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::MK4); /* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */ @@ -483,8 +304,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == @@ -512,41 +332,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, - midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { - winograd::winograd_F63_mk4_f_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, - midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { - winograd::winograd_F63_mk4_f_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4_NCHW44, + winograd::winograd_F63_mk4_f_nchw44, + megdnn_arm_common_winograd_fp32, + param::MatrixMul::Format::MK4); /* ===================== direct algo ===================== */ MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl); diff --git a/dnn/src/arm_common/conv_bias/fp32/algos.h b/dnn/src/arm_common/conv_bias/fp32/algos.h index 7c1bd692..cda5bf3f 100644 --- a/dnn/src/arm_common/conv_bias/fp32/algos.h +++ b/dnn/src/arm_common/conv_bias/fp32/algos.h @@ -17,13 +17,11 @@ namespace megdnn { namespace arm_common { - class ConvBiasImpl::AlgoFP32WinogradF23_4x4 final : public AlgoBase { public: AlgoFP32WinogradF23_4x4(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -31,18 +29,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF63 final : public AlgoBase { @@ -50,7 +37,6 @@ public: AlgoFP32WinogradF63(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -58,19 +44,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF63_4x4 final : public AlgoBase { @@ -78,7 +52,6 @@ public: AlgoFP32WinogradF63_4x4(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -86,19 +59,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF54 final : public AlgoBase { @@ -106,7 +67,6 @@ public: AlgoFP32WinogradF54(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -114,19 +74,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF45 final : public AlgoBase { @@ -134,7 +82,6 @@ public: AlgoFP32WinogradF45(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -142,19 +89,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; //===================== NCHW44 Winograd Support =====================// @@ -163,7 +98,6 @@ public: AlgoFP32WinogradF23_4x4_NCHW44( fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -172,18 +106,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44 final : public AlgoBase { @@ -191,7 +114,6 @@ public: AlgoFP32WinogradF63_4x4_NCHW44( fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -200,18 +122,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; // ================================================================= // @@ -329,4 +240,6 @@ public: } // namespace arm_common } // namespace megdnn +#undef MEGDNN_WINOGRAD_ALGO_FUN_DECLARE + // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/algos.cpp b/dnn/src/arm_common/conv_bias/int8/algos.cpp index 7271a422..83248262 100644 --- a/dnn/src/arm_common/conv_bias/int8/algos.cpp +++ b/dnn/src/arm_common/conv_bias/int8/algos.cpp @@ -221,8 +221,7 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8::usable( Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && m_matmul_algo->packmode() == PackMode::NO_PACK && @@ -245,34 +244,11 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8::usable( param.dst_type.enumv() == DTypeEnum::QuantizedS8; } -size_t ConvBiasImpl::AlgoS8WinogradF23_8x8::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - winograd::winograd_2x3_8x8_s8 strategy(param.src_type, param.filter_type, - param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoS8WinogradF23_8x8, + winograd::winograd_2x3_8x8_s8, + megdnn_arm_common_conv_bias_int8, + param::MatrixMul::Format::MK8); -SmallVector -ConvBiasImpl::AlgoS8WinogradF23_8x8::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8, 0, 2) { - winograd::winograd_2x3_8x8_s8 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} //=========================== input int8 compute float32 ========= bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable( fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, @@ -290,8 +266,7 @@ bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable( is_matmul_usable = m_matmul_algo->usable( megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param)); return is_matmul_usable && m_matmul_algo->packmode() == PackMode::NO_PACK && @@ -320,43 +295,10 @@ bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable( return false; } -size_t ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_arm_common_conv_bias_int8, - midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::get_workspace"_hash)) { - winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN( - megdnn_arm_common_conv_bias_int8, - midout_iv( - "arm_common_AlgoS8CF32WinogradF23_4x4::dispatch_kerns"_hash)) { - winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoS8CF32WinogradF23_4x4_NCHW44, + winograd::winograd_2x3_4x4_s8_f32_nchw44, + megdnn_arm_common_conv_bias_int8, + param::MatrixMul::Format::MK4); /* ======================= AlgoS8WinogradF23_8x8_NCHW44 ======================== */ bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable( @@ -372,10 +314,8 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable( using Strategy = winograd::winograd_2x3_8x8_s8_nchw44; Strategy strategy(param.src_type, param.filter_type, param.dst_type); auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + megdnn::winograd::ConvBias( + strategy, m_tile_size, param) .get_matmul_kern_param(param); bool is_matmul_usable = m_matmul_algo->usable(matmul_param); return is_matmul_usable && @@ -401,41 +341,9 @@ bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable( return false; } -size_t ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_arm_common_conv_bias_int8, - midout_iv( - "arm_common_AlgoS8WinogradF23_8x8_NCHW44::get_workspace"_hash)) { - winograd::winograd_2x3_8x8_s8_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoS8WinogradF23_8x8_NCHW44, + winograd::winograd_2x3_8x8_s8_nchw44, + megdnn_arm_common_conv_bias_int8, + param::MatrixMul::Format::MK8); -SmallVector -ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MIDOUT_BEGIN( - megdnn_arm_common_conv_bias_int8, - midout_iv( - "arm_common_AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns"_hash)) { - winograd::winograd_2x3_8x8_s8_nchw44 strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} // vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/conv_bias/int8/algos.h b/dnn/src/arm_common/conv_bias/int8/algos.h index 31849f3c..a2dc2b44 100644 --- a/dnn/src/arm_common/conv_bias/int8/algos.h +++ b/dnn/src/arm_common/conv_bias/int8/algos.h @@ -201,7 +201,6 @@ public: AlgoS8WinogradF23_8x8(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -209,20 +208,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; //=======================input int8 compute fp32 output int8============ @@ -231,7 +217,6 @@ public: AlgoS8CF32WinogradF23_4x4_NCHW44( fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -240,20 +225,7 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; //=======================input int8 compute int16 output int8============ @@ -262,7 +234,6 @@ public: AlgoS8WinogradF23_8x8_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -271,20 +242,8 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; - static std::vector - get_avaiable_matmul_algos(const NCBKernSizeParam& param); -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; } // namespace arm_common diff --git a/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp b/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp index 3209d428..0626b23e 100644 --- a/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp +++ b/dnn/src/arm_common/conv_bias/int8/direct_nchw44_algo.cpp @@ -14,7 +14,6 @@ #include "src/arm_common/conv_bias/int8/algos.h" #include "src/arm_common/conv_bias/int8/direct.h" #include "src/arm_common/conv_bias/int8/direct_nchw44_kern.h" -#include "src/arm_common/conv_bias/int8/strategy.h" #include "src/arm_common/elemwise_op.h" #include "src/common/opr_delegate.h" diff --git a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp index 08f6d3d9..47d570c6 100644 --- a/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp +++ b/dnn/src/arm_common/winograd_filter_preprocess/opr_impl.cpp @@ -57,8 +57,8 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, auto run = [=]() { \ _strategy strategy(src.layout.dtype, src.layout.dtype, \ src.layout.dtype); \ - megdnn::winograd::ConvBias<_strategy, _format>( \ - strategy, 1, 1, 1, 1, 1) \ + megdnn::winograd::ConvBias<_strategy, _format>(strategy, \ + 1_z) \ .filter_process(src_ptr, dst_ptr, workspace_ptr, \ OC, IC); \ }; \ diff --git a/dnn/src/fallback/conv_bias/algos.cpp b/dnn/src/fallback/conv_bias/algos.cpp index 70cb6a1c..4ec43d0e 100644 --- a/dnn/src/fallback/conv_bias/algos.cpp +++ b/dnn/src/fallback/conv_bias/algos.cpp @@ -242,11 +242,9 @@ bool ConvBiasImpl::AlgoWinogradF32::usable( MIDOUT_BEGIN(megdnn_fallback_winograd, 1, 0) { using Strategy = fallback::winograd::winograd_2x3_1x1_f; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, UNIT_TILE_SIZE, param.nr_threads, - param.osz[0], param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, UNIT_TILE_SIZE, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || (opr->param().format == @@ -277,8 +275,7 @@ size_t ConvBiasImpl::AlgoWinogradF32::get_workspace( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_f>( - strategy, UNIT_TILE_SIZE, p.nr_threads, p.osz[0], - p.osz[1], p.filter_meta.ocpg) + strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); @@ -294,9 +291,8 @@ ConvBiasImpl::AlgoWinogradF32::dispatch_kerns( param.src_type, param.filter_type, param.dst_type); auto winograd_impl = megdnn::winograd::ConvBias< - fallback::winograd::winograd_2x3_1x1_f>( - strategy, UNIT_TILE_SIZE, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); + fallback::winograd::winograd_2x3_1x1_f>(strategy, + UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); @@ -318,8 +314,7 @@ bool ConvBiasImpl::AlgoWinogradF32_4x4::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, UNIT_TILE_SIZE, param.nr_threads, - param.osz[0], param.osz[1], param.filter_meta.ocpg) + strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || @@ -351,9 +346,8 @@ size_t ConvBiasImpl::AlgoWinogradF32_4x4::get_workspace( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_4x4_f, - param::MatrixMul::Format::MK4>( - strategy, UNIT_TILE_SIZE, p.nr_threads, p.osz[0], - p.osz[1], p.filter_meta.ocpg) + param::MatrixMul::Format::MK4>(strategy, UNIT_TILE_SIZE, + p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); @@ -370,9 +364,7 @@ ConvBiasImpl::AlgoWinogradF32_4x4::dispatch_kerns( auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_4x4_f, - param::MatrixMul::Format::MK4>( - strategy, UNIT_TILE_SIZE, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); + param::MatrixMul::Format::MK4>(strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); @@ -389,11 +381,9 @@ bool ConvBiasImpl::AlgoWinogradQS8::usable( MIDOUT_BEGIN(megdnn_fallback_winograd, 3, 0) { using Strategy = fallback::winograd::winograd_2x3_1x1_qs8; Strategy strategy(param.src_type, param.filter_type, param.dst_type); - auto&& matmul_param = - megdnn::winograd::ConvBias( - strategy, UNIT_TILE_SIZE, param.nr_threads, - param.osz[0], param.osz[1], param.filter_meta.ocpg) - .get_matmul_kern_param(param); + auto&& matmul_param = megdnn::winograd::ConvBias( + strategy, UNIT_TILE_SIZE, param) + .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || @@ -425,8 +415,7 @@ size_t ConvBiasImpl::AlgoWinogradQS8::get_workspace( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_qs8>( - strategy, UNIT_TILE_SIZE, p.nr_threads, p.osz[0], - p.osz[1], p.filter_meta.ocpg) + strategy, UNIT_TILE_SIZE, p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); @@ -443,8 +432,7 @@ ConvBiasImpl::AlgoWinogradQS8::dispatch_kerns( auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_1x1_qs8>( - strategy, UNIT_TILE_SIZE, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); + strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); @@ -466,8 +454,7 @@ bool ConvBiasImpl::AlgoWinogradQS8_8x8::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, UNIT_TILE_SIZE, param.nr_threads, - param.osz[0], param.osz[1], param.filter_meta.ocpg) + strategy, UNIT_TILE_SIZE, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW || @@ -499,9 +486,8 @@ size_t ConvBiasImpl::AlgoWinogradQS8_8x8::get_workspace( p.src_type, p.filter_type, p.dst_type); return megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_8x8_qs8, - param::MatrixMul::Format::MK8>( - strategy, UNIT_TILE_SIZE, p.nr_threads, p.osz[0], - p.osz[1], p.filter_meta.ocpg) + param::MatrixMul::Format::MK8>(strategy, UNIT_TILE_SIZE, + p) .get_workspace_size(p, m_matmul_algo); } MIDOUT_END(); @@ -518,9 +504,7 @@ ConvBiasImpl::AlgoWinogradQS8_8x8::dispatch_kerns( auto winograd_impl = megdnn::winograd::ConvBias< fallback::winograd::winograd_2x3_8x8_qs8, - param::MatrixMul::Format::MK8>( - strategy, UNIT_TILE_SIZE, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); + param::MatrixMul::Format::MK8>(strategy, UNIT_TILE_SIZE, param); return winograd_impl.get_kerns(param, m_matmul_algo); } MIDOUT_END(); diff --git a/dnn/src/fallback/conv_bias/common.h b/dnn/src/fallback/conv_bias/common.h index 29fdae8c..620e0530 100644 --- a/dnn/src/fallback/conv_bias/common.h +++ b/dnn/src/fallback/conv_bias/common.h @@ -138,6 +138,30 @@ using BiasMode = ConvBiasForward::BiasMode; break; \ } +#define MEGDNN_WINOGRAD_ALGO_FUN_DECLARE() \ + bool is_reproducible() const override { return true; } \ + bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, \ + AlgoSelectionStrategy algo_selection_strategy) const override; \ + size_t get_workspace(fallback::ConvBiasImpl*, \ + const NCBKernSizeParam& param) const override; \ + virtual SmallVector dispatch_kerns(fallback::ConvBiasImpl* opr, \ + const NCBKernSizeParam& param) \ + const override; \ + SmallVector deduce_preprocessed_filter_layout( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) \ + const override; \ + size_t get_preprocess_workspace(fallback::ConvBiasImpl*, \ + const NCBKernSizeParam& param) \ + const override; \ + virtual SmallVector dispatch_preprocess_kerns( \ + fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param) \ + const override; \ + \ +private: \ + fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; \ + mutable std::string m_name; \ + uint32_t m_tile_size; + enum class PostprocessMode : uint8_t { FLOAT = 0, ///< support all biasmode and no_nonlinemode NO_PROCESS, /// space_vec(nr_threads, transform_mid_buf_size); + return WorkspaceBundle{nullptr, space_vec}; + } + public: //! Get the m_unit_oc_size, according to the nr_threads and //! output_featuremap_size. When single thread the m_unit_oc_size is set //! 2048 heuristicly, When multi-threads, the m_unit_oc_size is set - //! according to nr_threads and out_featuremap_size - ConvBias(const Strategy& strategy, size_t unit_tile_size, size_t nr_threads, - size_t OH, size_t OW, size_t OC) + //! according to nr_threads and out_featuremap_size + ConvBias(const Strategy& strategy, size_t unit_tile_size, + const NCBKernSizeParam& param) : m_strategy{strategy}, m_unit_tile_size{unit_tile_size} { + size_t nr_threads = param.nr_threads; + size_t OC = param.filter_meta.ocpg; + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; if (nr_threads > 1) { size_t units_h = div_ceil(OH, Strategy::OUTPUT_BLOCK_SIZE); size_t units_w = div_ceil(OW, Strategy::OUTPUT_BLOCK_SIZE); @@ -178,12 +195,55 @@ public: m_unit_oc_size = UNIT_OC_SIZE_DEFAULT; } } + ConvBias(const Strategy& strategy, size_t unit_tile_size) + : m_strategy{strategy}, m_unit_tile_size{unit_tile_size} { + m_unit_oc_size = UNIT_OC_SIZE_DEFAULT; + } size_t get_workspace_size( const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase* matmul_algo) const { return get_wbundle(param, matmul_algo).total_size_in_bytes(); } + + size_t get_preprocess_workspace_size( + const NCBKernSizeParam& param, + fallback::MatrixMulImpl::AlgoBase*) const { + return get_preprocess_wbundle(param).total_size_in_bytes(); + } + + SmallVector deduce_preprocessed_filter_layout( + const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase*) { + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + size_t GROUP = param.filter_meta.group; + SmallVector preprocessed_layouts; + DType dtype = m_strategy.filter_dtype; + if (dtype.category() == DTypeCategory::QUANTIZED) { + if (format == param::MatrixMul::Format::MK4) { + dtype = dtype::Float32(); + } else if (format == param::MatrixMul::Format::MK8) { + dtype = dtype::Int16(); + } + } + if (format == param::MatrixMul::Format::DEFAULT) { + preprocessed_layouts.push_back( + {{GROUP, Strategy::ALPHA, Strategy::ALPHA, OC, IC}, dtype}); + } else if (format == param::MatrixMul::Format::MK4) { + preprocessed_layouts.push_back( + {{GROUP, Strategy::ALPHA, Strategy::ALPHA, OC / 4, IC / 4, + 4, 4}, + dtype}); + } else { + megdnn_assert(format == param::MatrixMul::Format::MK8); + preprocessed_layouts.push_back( + {{GROUP, Strategy::ALPHA, Strategy::ALPHA, OC / 8, IC / 8, + 8, 8}, + dtype}); + } + return preprocessed_layouts; + } + //! Used by winograd_filter_preprocess opr void filter_process(const stype* filter_ptr, input_filter_compute_type* filter_transform_buf, @@ -199,7 +259,6 @@ public: const WorkspaceBundle& bundle_compute, const NCBKernParam& kern_param, const NCBKernIndex& ncb_index) { - size_t compute_workspace_size_per_thread = bundle_compute.total_size_in_bytes(); size_t thread_id = ncb_index.thread_id; @@ -235,6 +294,47 @@ public: IC, oc_start, oc_end); } + static void filter_preprocess(Strategy strategy, + const WorkspaceBundle& bundle, + const TensorND& preprocessed_tensor, + const NCBKernParam& kern_param, + const NCBKernIndex& ncb_index) { + size_t thread_id = ncb_index.thread_id; + size_t oc_id = ncb_index.ndrange_id[1]; + size_t group_id = ncb_index.ndrange_id[0]; + size_t OC = kern_param.filter_meta.ocpg; + size_t IC = kern_param.filter_meta.icpg; + size_t filter_group_size = Strategy::ALPHA * Strategy::ALPHA * OC * IC * + sizeof(input_filter_compute_type); + //! Filter trans dst ptr + input_filter_compute_type* filter_transform_buf = + reinterpret_cast( + reinterpret_cast( + preprocessed_tensor.raw_ptr) + + group_id * filter_group_size); + //! Filter trans src ptr + input_filter_compute_type* transform_mid_buf = + reinterpret_cast( + reinterpret_cast(bundle.get(thread_id))); + + const stype* filter_ptr = kern_param.filter(group_id); + size_t oc_start, oc_end; + + if (kern_param.filter_meta.format == param::ConvBias::Format::NCHW88) { + oc_start = 8 * oc_id; + oc_end = oc_start + 8; + } else if (kern_param.filter_meta.format == + param::ConvBias::Format::NCHW44) { + oc_start = 4 * oc_id; + oc_end = oc_start + 4; + } else { + oc_start = oc_id; + oc_end = oc_id + 1; + } + strategy.filter(filter_ptr, filter_transform_buf, transform_mid_buf, OC, + IC, oc_start, oc_end); + } + static void winograd_compute( Strategy strategy, const WorkspaceBundle& bundle_top, const WorkspaceBundle& bundle_compute, @@ -287,15 +387,28 @@ public: compute_workspace_size_per_thread * thread_id); //! NCHW88_WINOGRAD and NCHW_WINOGRAD is the same offset - const input_filter_compute_type* filter_transform_buf = - static_cast( - ncb_param.filter(group_id)); - if (ncb_param.filter_meta.format == param::ConvBias::Format::NCHW || - ncb_param.filter_meta.format == param::ConvBias::Format::NCHW88 || - ncb_param.filter_meta.format == param::ConvBias::Format::NCHW44) { + const input_filter_compute_type* filter_transform_buf = nullptr; + if (nullptr != ncb_param.preprocessed_filter) { + auto preprocess_raw_ptr = + ncb_param.preprocessed_filter->tensors[0].raw_ptr; filter_transform_buf = reinterpret_cast( - reinterpret_cast(bundle_top.get(1)) + + reinterpret_cast(preprocess_raw_ptr) + group_id * filter_group_size); + } else { + filter_transform_buf = + static_cast( + ncb_param.filter( + group_id)); + if (ncb_param.filter_meta.format == param::ConvBias::Format::NCHW || + ncb_param.filter_meta.format == + param::ConvBias::Format::NCHW88 || + ncb_param.filter_meta.format == + param::ConvBias::Format::NCHW44) { + filter_transform_buf = + reinterpret_cast( + reinterpret_cast(bundle_top.get(1)) + + group_id * filter_group_size); + } } //! prepare matmul param matmul_param.workspace_ptr = reinterpret_cast( @@ -371,6 +484,47 @@ public: oc_start_idx, oc_end_idx, unit_start_idx, nr_tiles_in_unit); }; + SmallVector get_preprocess_kerns( + const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase*) { + megdnn_assert( + param.filter_meta.format == param::ConvBias::Format::NCHW || + param.filter_meta.format == param::ConvBias::Format::NCHW88 || + param.filter_meta.format == param::ConvBias::Format::NCHW44); + megdnn_assert(param.preprocessed_filter && + param.preprocessed_filter->tensors.size() > 0); + size_t OC = param.filter_meta.ocpg; + size_t GROUP = param.filter_meta.group; + const TensorND& preprocessed_dst = + param.preprocessed_filter->tensors[0]; + WorkspaceBundle bundle = get_preprocess_wbundle(param); + + Strategy strategy = m_strategy; + SmallVector kerns; + auto filter_process_kern = + [strategy, bundle, &preprocessed_dst]( + const NCBKernParam& ncb_param, + const NCBKernIndex& ncb_index) mutable { + MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, + midout_iv("filter_preprocess"_hash)) { + bundle.set(ncb_param.workspace_ptr); + filter_preprocess(strategy, bundle, preprocessed_dst, + ncb_param, ncb_index); + } + MIDOUT_END(); + }; + size_t oc_parallelism = OC; + if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { + megdnn_assert(OC % 8 == 0); + oc_parallelism = OC / 8; + } else if (param.filter_meta.format == + param::ConvBias::Format::NCHW44) { + megdnn_assert(OC % 4 == 0); + oc_parallelism = OC / 4; + } + kerns.push_back({filter_process_kern, {GROUP, oc_parallelism}}); + return kerns; + } + SmallVector get_kerns( const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase* matmul_algo) { @@ -386,7 +540,6 @@ public: static_cast(matmul_param) = get_matmul_kern_param(param, m_unit_oc_size); - Strategy strategy = m_strategy; size_t unit_tile_size = m_unit_tile_size; size_t unit_oc_size = m_unit_oc_size; size_t units_h = div_ceil(OH, Strategy::OUTPUT_BLOCK_SIZE); @@ -411,20 +564,22 @@ public: param::ConvBias::Format::NCHW44_WINOGRAD)); SmallVector kerns; - if (param.filter_meta.format == param::ConvBias::Format::NCHW || - param.filter_meta.format == param::ConvBias::Format::NCHW88 || - param.filter_meta.format == param::ConvBias::Format::NCHW44) { - //! probably a gcc bug, labmda require capturing 'this' to call - //! static member function + if (param.preprocessed_filter == nullptr && + (param.filter_meta.format == param::ConvBias::Format::NCHW || + param.filter_meta.format == param::ConvBias::Format::NCHW88 || + param.filter_meta.format == param::ConvBias::Format::NCHW44)) { auto filter_process_kern = - [this, strategy, bundle_top, bundle_compute]( + [strategy = m_strategy, bundle_top, bundle_compute]( const NCBKernParam& ncb_param, const NCBKernIndex& ncb_index) mutable { - MEGDNN_MARK_USED_VAR(this); - bundle_top.set(ncb_param.workspace_ptr); - bundle_compute.set(bundle_top.get(0)); - filter_process(strategy, bundle_top, bundle_compute, - ncb_param, std::move(ncb_index)); + MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, + midout_iv("filter_process"_hash)) { + bundle_top.set(ncb_param.workspace_ptr); + bundle_compute.set(bundle_top.get(0)); + filter_process(strategy, bundle_top, bundle_compute, + ncb_param, std::move(ncb_index)); + } + MIDOUT_END(); }; size_t oc_parallelism = OC; if (param.filter_meta.format == param::ConvBias::Format::NCHW88) { @@ -438,12 +593,12 @@ public: kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}}); } auto winograd_compute_kern = - [strategy, bundle_top, bundle_compute, matmul_algo, + [strategy = m_strategy, bundle_top, bundle_compute, matmul_algo, matmul_param, unit_tile_size, unit_oc_size](const NCBKernParam& ncb_param, const NCBKernIndex& ncb_index) mutable { - MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, 0, - 0) { + MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common, + midout_iv("winograd_compute"_hash)) { bundle_top.set(ncb_param.workspace_ptr); bundle_compute.set(bundle_top.get(0)); winograd_compute(strategy, bundle_top, bundle_compute, @@ -562,4 +717,54 @@ public: filter_dtype(filter_dtype), \ dst_dtype(dst_dtype) {} +#define MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_class, _fun, _strategy, \ + _midout_flag, _matmul_format) \ + MEGDNN_MARK_USED_VAR(param); \ + MIDOUT_BEGIN(_midout_flag, midout_iv(#_class #_fun##_hash)) { \ + _strategy strategy(param.src_type, param.filter_type, param.dst_type); \ + return megdnn::winograd::ConvBias<_strategy, _matmul_format>( \ + strategy, m_tile_size, param) \ + ._fun(param, m_matmul_algo); \ + } \ + MIDOUT_END(); + +#define MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(_class, _strategy, _midout_flag, \ + _matmul_format) \ + size_t ConvBiasImpl::_class::get_workspace( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { \ + MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_class, get_workspace_size, \ + _strategy, _midout_flag, \ + _matmul_format); \ + return 0; \ + } \ + size_t ConvBiasImpl::_class::get_preprocess_workspace( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { \ + MEGDNN_WINOGRADS_ALGO_FUN_DEFINE( \ + _class, get_preprocess_workspace_size, _strategy, \ + _midout_flag, _matmul_format); \ + return 0; \ + } \ + SmallVector \ + ConvBiasImpl::_class::deduce_preprocessed_filter_layout( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { \ + MEGDNN_WINOGRADS_ALGO_FUN_DEFINE( \ + _class, deduce_preprocessed_filter_layout, _strategy, \ + _midout_flag, _matmul_format); \ + return {}; \ + } \ + SmallVector \ + ConvBiasImpl::_class::dispatch_preprocess_kerns( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { \ + MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_class, get_preprocess_kerns, \ + _strategy, _midout_flag, \ + _matmul_format); \ + return {}; \ + } \ + SmallVector ConvBiasImpl::_class::dispatch_kerns( \ + fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { \ + MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_class, get_kerns, _strategy, \ + _midout_flag, _matmul_format); \ + return {}; \ + } + // vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/conv_bias/f32/algos.h b/dnn/src/x86/conv_bias/f32/algos.h index 144f5713..5ed6e051 100644 --- a/dnn/src/x86/conv_bias/f32/algos.h +++ b/dnn/src/x86/conv_bias/f32/algos.h @@ -94,7 +94,6 @@ public: AlgoFP32WinogradF63_8x8(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -102,19 +101,8 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; void* type() const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; class ConvBiasImpl::AlgoFP32WinogradF23_8x8 final : public AlgoBase { @@ -122,7 +110,6 @@ public: AlgoFP32WinogradF23_8x8(fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size) : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {} - bool is_reproducible() const override { return true; } const char* name() const override { if (m_name.empty()) { m_name = ConvBiasImpl::algo_name( @@ -130,19 +117,8 @@ public: } return m_name.c_str(); } - bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, - AlgoSelectionStrategy algo_selection_strategy) const override; - size_t get_workspace(fallback::ConvBiasImpl*, - const NCBKernSizeParam& param) const override; - virtual SmallVector dispatch_kerns( - fallback::ConvBiasImpl* opr, - const NCBKernSizeParam& param) const override; void* type() const override; - -private: - fallback::MatrixMulImpl::AlgoBase* m_matmul_algo; - mutable std::string m_name; - uint32_t m_tile_size; + MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(); }; /* ===================== matmul algo ===================== */ diff --git a/dnn/src/x86/conv_bias/f32/winograd_algo.cpp b/dnn/src/x86/conv_bias/f32/winograd_algo.cpp index fcd95642..0a54da96 100644 --- a/dnn/src/x86/conv_bias/f32/winograd_algo.cpp +++ b/dnn/src/x86/conv_bias/f32/winograd_algo.cpp @@ -41,8 +41,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_8x8::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW88 || @@ -67,39 +66,10 @@ bool ConvBiasImpl::AlgoFP32WinogradF63_8x8::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF63_8x8::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_x86_winograd_fp32, 1, 1) { - winograd::winograd_nchw88_6x3_8x8_f strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} - -SmallVector -ConvBiasImpl::AlgoFP32WinogradF63_8x8::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 2) { - winograd::winograd_nchw88_6x3_8x8_f strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_8x8, + winograd::winograd_nchw88_6x3_8x8_f, + megdnn_x86_winograd_fp32, + param::MatrixMul::Format::MK8); /* ======================= AlgoFP32WinogradF23_8*8 ======================== */ @@ -118,8 +88,7 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_8x8::usable( auto&& matmul_param = megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) + strategy, m_tile_size, param) .get_matmul_kern_param(param); return m_matmul_algo->usable(matmul_param) && (opr->param().format == param::ConvBias::Format::NCHW88 || @@ -144,37 +113,9 @@ bool ConvBiasImpl::AlgoFP32WinogradF23_8x8::usable( return false; } -size_t ConvBiasImpl::AlgoFP32WinogradF23_8x8::get_workspace( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_x86_winograd_fp32, 2, 1) { - winograd::winograd_nchw88_2x3_8x8_f strategy( - param.src_type, param.filter_type, param.dst_type); - return megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg) - .get_workspace_size(param, m_matmul_algo); - } - MIDOUT_END(); - return 0; -} +MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_8x8, + winograd::winograd_nchw88_2x3_8x8_f, + megdnn_x86_winograd_fp32, + param::MatrixMul::Format::MK8); -SmallVector -ConvBiasImpl::AlgoFP32WinogradF23_8x8::dispatch_kerns( - fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const { - MEGDNN_MARK_USED_VAR(param); - MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 2) { - winograd::winograd_nchw88_2x3_8x8_f strategy( - param.src_type, param.filter_type, param.dst_type); - auto winograd_impl = - megdnn::winograd::ConvBias( - strategy, m_tile_size, param.nr_threads, param.osz[0], - param.osz[1], param.filter_meta.ocpg); - return winograd_impl.get_kerns(param, m_matmul_algo); - } - MIDOUT_END(); - return {}; -} // vim: syntax=cpp.doxygen diff --git a/dnn/test/arm_common/conv_bias.cpp b/dnn/test/arm_common/conv_bias.cpp index f89bbe23..eb91933b 100644 --- a/dnn/test/arm_common/conv_bias.cpp +++ b/dnn/test/arm_common/conv_bias.cpp @@ -57,6 +57,23 @@ TEST_F(ARM_COMMON, CONV_BIAS_MATMUL) { } } +TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(); + Checker checker(handle()); + + check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); +} + +TEST_F(ARM_COMMON, CONV_BIAS_WINOGRAD_F63_4_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(); + Checker> checker( + handle()); + + check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); +} + #define CONV_BIAS_MATMUL_QU8_MODE(MODE) \ using namespace conv_bias; \ std::vector args = get_quantized_args_with_nlmode(MODE); \ diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp index f64f01c5..7f5f9907 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp @@ -783,6 +783,14 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) { check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(); + Checker> checker( + handle()); + check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_NCHW44) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({3}, 1); @@ -791,6 +799,16 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_NCHW44) { param::ConvBias::Format::NCHW44); } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_F23_4_NCHW44_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_nchw44_conv_bias_args({3}, 1); + Checker> checker( + handle()); + check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4, + param::ConvBias::Format::NCHW44); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { using namespace conv_bias; std::vector args = get_winograd_args(3); @@ -799,6 +817,14 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { check_winograd("1:6:32", checker, args); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(3); + Checker> checker( + handle()); + check_winograd("1:6:32", checker, args); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4) { using namespace conv_bias; std::vector args = get_winograd_mk_packed_args(); @@ -807,6 +833,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4) { check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(); + Checker> checker( + handle()); + + check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44) { using namespace conv_bias; std::vector args = get_nchw44_conv_bias_args({3}, 1); @@ -815,6 +850,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44) { param::ConvBias::Format::NCHW44); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_nchw44_conv_bias_args({3}, 1); + Checker> checker( + handle()); + check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4, + param::ConvBias::Format::NCHW44); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { using namespace conv_bias; std::vector args = get_winograd_args(4); @@ -823,6 +867,14 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { check_winograd("1:5:32", checker, args); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(4); + Checker> checker( + handle()); + check_winograd("1:5:32", checker, args); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F45) { using namespace conv_bias; std::vector args = get_winograd_args(5); @@ -831,6 +883,14 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F45) { check_winograd("1:4:32", checker, args); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F45_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(5); + Checker> checker( + handle()); + check_winograd("1:4:32", checker, args); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { using namespace conv_bias; std::vector args = get_winograd_args(3); @@ -1007,6 +1067,39 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { 1e-3f); } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_MK_PACKED_F32_1_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + std::vector args = get_winograd_mk_packed_args(8); + std::vector args_first_half(args.begin(), + args.begin() + args.size() / 2); + run(handle(), args_first_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, + dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, + 1e-3f); +} + TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_2) { using namespace conv_bias; @@ -1038,6 +1131,38 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_2) { 1e-3f); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_2_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + std::vector args = get_winograd_mk_packed_args(8); + std::vector args_second_half(args.begin() + args.size() / 2, + args.end()); + run(handle(), args_second_half, {2, 6}, dtype::Float32{}, dtype::Float32{}, + dtype::Float32{}, dtype::Float32{}, param::MatrixMul::Format::MK4, + 1e-3f); +} + #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F16) { using namespace conv_bias; @@ -1070,6 +1195,40 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F16) { dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8, 0.25); } + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_MK_PACKED_F16_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + + std::vector args = get_winograd_mk_packed_args(8); + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng); + run(handle(), args, {2}, dtype::Float16{}, dtype::Float16{}, + dtype::Float16{}, dtype::Float16{}, param::MatrixMul::Format::MK8, + 0.25); +} #endif TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { using namespace conv_bias; @@ -1281,6 +1440,223 @@ TEST_F(ARM_COMMON_MULTI_THREADS, epsilon); } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_MK_PACKED_INT8_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; +#else + const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD:%s:8:2:32", matmul_name).c_str())); + + std::vector quantized_args = + get_quantized_winograd_mk_packed_args(8); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), + dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; +#else + const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); + + std::vector quantized_args = get_int8_nchw44_args(3, 4); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), + dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; +#else + const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); + + std::vector quantized_args = + get_int8_nchw44_args(3, 4, false, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), + dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), + dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_F32_MK4_4x16"; +#else + const char* matmul_name = "ARMV7_F32_MK4_4x8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); + std::vector quantized_args = get_int8_nchw44_args(3, 4, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), + dtype::QuantizedS8(0.01887994f), + dtype::QuantizedS32(0.41113496f * 0.01887994f), + dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, + epsilon); +} + +TEST_F(ARM_COMMON_MULTI_THREADS, + WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE_WEIGHT_PREPROCESS) { + using namespace conv_bias; + + Checker> checker( + handle()); + auto run = [&checker](Handle* handle, const std::vector& args, + const std::vector& out_size, DType A_dtype, + DType B_dtype, DType C_dtype, DType D_dtype, + param::MatrixMul::Format format, float eps) { + for (auto&& arg : args) { + for (uint32_t m : out_size) { + checker.set_extra_opr_impl(std::bind( + winograd_algo_extra_impl, std::placeholders::_1, m, + arg.param, handle, format)); + checker.set_dtype(0, A_dtype) + .set_dtype(1, B_dtype) + .set_dtype(2, C_dtype) + .set_dtype(4, D_dtype) + .set_epsilon(eps) + .set_param(arg.param) + .execs({arg.src, arg.filter, arg.bias, {}, {}}); + } + } + }; + + float epsilon = 0.001; +#if MEGDNN_AARCH64 + const char* matmul_name = "AARCH64_F32_MK4_4x16"; +#else + const char* matmul_name = "ARMV7_F32_MK4_4x8"; +#endif + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); + std::vector quantized_args = + get_int8_nchw44_args(3, 4, true, true); + UniformIntRNG int_rng{-50, 50}; + checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); + run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), + dtype::QuantizedS8(0.01887994f), + dtype::QuantizedS32(0.41113496f * 0.01887994f), + dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, + epsilon); +} + #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) { using namespace conv_bias; @@ -1338,6 +1714,72 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_8x8_2) { check_winograd_fp16("8:2:32", checker, args_back_half, rng, 0.25, param::MatrixMul::Format::MK8); } +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(); + Checker> checker( + handle()); + check_winograd_fp16("1:2:32", checker, args, NULL, 0.08); +} +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_F16_F45_1_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(5); + std::vector args_head_half(args.begin(), + args.begin() + args.size() / 2); + Checker> checker( + handle()); + //! fp16 range -1.0 ~ 1.0 + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + check_winograd_fp16("1:4:32", checker, args_head_half, rng, 0.25); +} +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_F16_F45_2_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(5); + std::vector args_back_half(args.begin() + args.size() / 2, + args.end()); + Checker> checker( + handle()); + //! fp16 range -1.0 ~ 1.0 + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + check_winograd_fp16("1:4:32", checker, args_back_half, rng, 0.25); +} +//! FIXME: This test may be failed if run `ARM_COMMON.CONV_BIAS_WINOGRAD*`, but +//! it will pass when run single testcase +TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F63_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_args(3); + Checker> checker( + handle()); + //! fp16 range -1.0 ~ 1.0 + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + check_winograd_fp16("1:6:32", checker, args, rng, 0.3); +} +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_F16_8x8_1_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(8); + std::vector args_head_half(args.begin(), + args.begin() + args.size() / 2); + Checker> checker( + handle()); + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + check_winograd_fp16("8:2:32", checker, args_head_half, rng, 0.25, + param::MatrixMul::Format::MK8); +} +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_F16_8x8_2_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_packed_args(8); + std::vector args_back_half(args.begin() + args.size() / 2, + args.end()); + Checker> checker( + handle()); + Float16PeriodicalRNG* rng = new Float16PeriodicalRNG(0x3c00); + check_winograd_fp16("8:2:32", checker, args_back_half, rng, 0.25, + param::MatrixMul::Format::MK8); +} #endif TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) { using namespace conv_bias; @@ -1354,6 +1796,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_INT8_8X8) { check_winograd("8:2:32", checker, args, param::MatrixMul::Format::MK8); } +TEST_F(ARM_COMMON_MULTI_THREADS, + CONV_BIAS_WINOGRAD_INT8_8X8_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_quantized_winograd_mk_packed_args(8); + Checker> checker( + handle()); + UniformIntRNG rng{-50, 50}; + checker.set_dtype(0, dtype::QuantizedS8(2.5f)) + .set_dtype(1, dtype::QuantizedS8(2.5f)) + .set_dtype(2, dtype::QuantizedS32(6.25f)) + .set_dtype(4, dtype::QuantizedS8(60.25f)) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_rng(2, &rng); + + check_winograd("8:2:32", checker, args, param::MatrixMul::Format::MK8); +} void checker_conv_bias(std::vector args, Handle* handle, RNG* rng, float epsilon, DType type0, DType type1, diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp index 8d70e91e..cbdb9d2b 100644 --- a/dnn/test/x86/conv_bias.cpp +++ b/dnn/test/x86/conv_bias.cpp @@ -1364,7 +1364,8 @@ std::vector get_winograd_mk_nchw88_args() { TensorShape{oc, ic, 3, 3, 8, 8},TensorShape{}); //! bias args.emplace_back(cur_param, TensorShape{2, ic, i, i, 8}, - TensorShape{oc, ic, 3, 3, 8, 8}, TensorShape{2, oc, i, i, 8}); + TensorShape{oc, ic, 3, 3, 8, 8}, + TensorShape{2, oc, i, i, 8}); /*cur_param.sparse = param::ConvBias::Sparse::GROUP; args.emplace_back(cur_param, TensorShape{2, 2 * ic, i, i, 8}, @@ -1401,6 +1402,21 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63) { } } +TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F63_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_nchw88_args(); + Checker> checker( + handle()); + + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD:X86_F32MK8_8X8:8:6").c_str())); + + for (auto&& arg : args) { + checker.set_param(arg.param).execs( + {arg.src, arg.filter, arg.bias, {}, {}}); + } +} + TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) { using namespace conv_bias; std::vector args = get_winograd_mk_nchw88_args(); @@ -1415,6 +1431,21 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23) { } } +TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW88_F23_WEIGHT_PREPROCESS) { + using namespace conv_bias; + std::vector args = get_winograd_mk_nchw88_args(); + Checker> checker( + handle()); + + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker( + ssprintf("WINOGRAD:X86_F32MK8_8X8:8:2").c_str())); + + for (auto&& arg : args) { + checker.set_param(arg.param).execs( + {arg.src, arg.filter, arg.bias, {}, {}}); + } +} + TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) { using namespace conv_bias; std::vector args = get_winograd_mk_nchw88_args(); -- GitLab