diff --git a/dnn/src/fallback/conv_bias/gi/fp32/algos.h b/dnn/src/fallback/conv_bias/gi/fp32/algos.h
index 1de3569fbd0cb2d8101d4d3a3bd46f610ae3699a..3f2ee46ca25710d5f2636b72ccadcff09b2530bb 100644
--- a/dnn/src/fallback/conv_bias/gi/fp32/algos.h
+++ b/dnn/src/fallback/conv_bias/gi/fp32/algos.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/opr_impl.h"
 #include "src/fallback/matrix_mul/opr_impl.h"
 
@@ -249,6 +250,26 @@ public:
     MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_FP32)
 };
 
+//! fp32 NCHW -> NCHW44 conv_bias that relayouts filter/bias to NCHW, delegates
+//! the computation to a plain NCHW conv_bias algorithm and relayouts the
+//! result back to NCHW44 (see f32_direct_nchw_nchw44_agent_algo.cpp).
+class ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT final : public AlgoBase {
+public:
+    AlgoF32DirectNCHWNCHW44AGENT() = default;
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    const char* name() const override { return "F32_CONV_AGENT_NCHW_NCHW44"; }
+    bool usable(
+            const NCBKernSizeParam& param,
+            AlgoSelectionStrategy algo_selection_strategy) const override;
+
+    size_t get_workspace(const NCBKernSizeParam& param) const override;
+    SmallVector dispatch_kerns(const NCBKernSizeParam& param) const override;
+    ConvAlgoTypePack get_algo_type() const override {
+        return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
+    }
+    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32)
+};
+
 class ConvBiasImpl::AlgoF32ChannelWiseNCHW44 final : public AlgoBase {
     SmallVector get_kimpls(const NCBKernSizeParam& param) const;
 
diff --git a/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_agent_algo.cpp b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_agent_algo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7dcae62f1976867cab00c7c8c62b7d65ebfa9591
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_agent_algo.cpp
@@ -0,0 +1,256 @@
+#include "megdnn/opr_param_defs.h"
+#include "megdnn/oprs.h"
+#include "src/common/nchw_nchwxx_valid.h"
+#include "src/common/opr_delegate.h"
+#include "src/fallback/conv_bias/gi/fp32/algos.h"
+#include "src/fallback/elemwise_helper/elemwise_op.h"
+
+#include "midout.h"
+
+using namespace megdnn;
+using namespace fallback;
+
+MIDOUT_DECL(megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent)
+
+namespace {
+
+//! Positions of the layouts inside the array returned by get_layouts().
+enum LayoutIndex : size_t {
+    LAYOUT_SRC = 0,
+    LAYOUT_FILTER,
+    LAYOUT_FILTER44,
+    LAYOUT_BIAS,
+    LAYOUT_BIAS44,
+    LAYOUT_DST,
+    LAYOUT_DST44,
+    LAYOUT_FILTER_RESHAPE,
+    LAYOUT_BIAS_RESHAPE,
+    LAYOUT_DST_RESHAPE,
+};
+
+//! Positions of the sub-buffers inside the bundle returned by get_bundle().
+enum WorkspaceIndex : size_t {
+    WS_FILTER = 0,  //!< filter relayouted to {OC, IC, FH, FW}
+    WS_BIAS,        //!< bias relayouted to {1, OC, 1, 1}
+    WS_CONV,        //!< workspace required by the delegated conv_bias
+    WS_CONV_DST,    //!< {N, OC, OH, OW} output of the delegated conv_bias
+};
+
+//! Build the param of the delegated conv_bias from the kernel size param:
+//! the mode follows filter_meta.should_flip, sparse is DENSE, format is NCHW
+//! and nonlinearity/padding/stride/dilation are copied from \p p.
+param::ConvBias get_param_convbias(const ConvBiasImpl::NCBKernSizeParam& p) {
+    const auto mode = p.filter_meta.should_flip
+                            ? param::ConvBias::Mode::CONVOLUTION
+                            : param::ConvBias::Mode::CROSS_CORRELATION;
+
+    return param::ConvBias{
+            p.nonlineMode,
+            mode,
+            param::ConvBias::Sparse::DENSE,
+            ConvBias::Param::Format::NCHW,
+            p.filter_meta.padding[0],
+            p.filter_meta.padding[1],
+            p.filter_meta.stride[0],
+            p.filter_meta.stride[1],
+            p.filter_meta.dilation[0],
+            p.filter_meta.dilation[1],
+            megdnn::param::ConvBias::ComputeMode::DEFAULT};
+}
+
+//! Return every layout used by this algorithm, ordered as in LayoutIndex.
+//! For filter, bias and dst three layouts are produced: the NCHW44 one, the
+//! plain NCHW one and a 5-d "reshape" view of the NCHW buffer with the channel
+//! dimension split into {OC / 4, 4}, which is the relayout counterpart of the
+//! dimshuffled NCHW44 layout. The bias layouts stay empty unless the bias mode
+//! is BROADCAST_CHANNEL_BIAS.
+TensorLayoutArray get_layouts(const ConvBiasImpl::NCBKernSizeParam& p) {
+    UNPACK_CONV_NCB_KERN_SIZES(p);
+    MEGDNN_MARK_USED_VAR(SH);
+    MEGDNN_MARK_USED_VAR(SW);
+    MEGDNN_MARK_USED_VAR(PH);
+    MEGDNN_MARK_USED_VAR(PW);
+    MEGDNN_MARK_USED_VAR(OW);
+    MEGDNN_MARK_USED_VAR(OH);
+    TensorLayout src_layout({N, IC, IH, IW}, p.src_type);
+    //! 44 filter to chw
+    TensorLayout filter_layout44({OC / 4, FH, FW, IC, 4}, p.filter_type);
+    TensorLayout filter_layout_reshape({OC / 4, 4, IC, FH, FW}, p.filter_type);
+    TensorLayout filter_layout({OC, IC, FH, FW}, p.filter_type);
+
+    TensorLayout bias_layout44{{}, p.bias_type};
+    TensorLayout bias_layout{{}, p.bias_type};
+    TensorLayout bias_layout_reshape{{}, p.bias_type};
+    if (p.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+        bias_layout44 = TensorLayout({1, OC / 4, 1, 1, 4}, p.bias_type);
+        bias_layout_reshape = TensorLayout({1, OC / 4, 4, 1, 1}, p.bias_type);
+        bias_layout = TensorLayout({1, OC, 1, 1}, p.bias_type);
+    }
+    //! chw dst to 44
+    TensorLayout dst_layout = TensorLayout({N, OC, OH, OW}, p.dst_type);
+    TensorLayout dst_layout_reshape = TensorLayout({N, OC / 4, 4, OH, OW}, p.dst_type);
+    TensorLayout dst_layout44 = TensorLayout({N, OC / 4, OH, OW, 4}, p.dst_type);
+
+    //! keep this order in sync with LayoutIndex
+    return {src_layout,   filter_layout,         filter_layout44,
+            bias_layout,  bias_layout44,         dst_layout,
+            dst_layout44, filter_layout_reshape, bias_layout_reshape,
+            dst_layout_reshape};
+}
+
+//! Describe the workspace of this algorithm, ordered as in WorkspaceIndex.
+//! Side effect: overwrites conv_bias_op->param() with get_param_convbias(param)
+//! so that the workspace query is answered for the delegated NCHW conv_bias.
+//! The returned bundle has no base pointer yet; callers must call set().
+WorkspaceBundle get_bundle(
+        const ConvBiasImpl::NCBKernSizeParam& param,
+        const std::unique_ptr& conv_bias_op) {
+    auto layouts = get_layouts(param);
+    auto src_layout = layouts[LAYOUT_SRC];
+    auto filter_layout = layouts[LAYOUT_FILTER];
+    auto bias_layout = layouts[LAYOUT_BIAS];
+    auto dst_layout = layouts[LAYOUT_DST];
+    size_t weight_relayout_workspace = filter_layout.span().dist_byte();
+    size_t bias_relayout_workspace = bias_layout.span().dist_byte();
+    conv_bias_op->param() = get_param_convbias(param);
+    auto dummy = TensorLayout();
+    auto conv_workspace = conv_bias_op->get_workspace_in_bytes(
+            src_layout, filter_layout, bias_layout, dummy, dst_layout, nullptr);
+    auto conv_dst_workspace = dst_layout.span().dist_byte();
+
+    //! keep this order in sync with WorkspaceIndex
+    return {nullptr,
+            {weight_relayout_workspace, bias_relayout_workspace, conv_workspace,
+             conv_dst_workspace}};
+}
+
+//! Whether the agent algorithm supports the given problem: fp32 src/filter/dst
+//! in NCHW44 format, IDENTITY/RELU/SIGMOID/H_SWISH nonlinearity, a single group
+//! with fewer than 4 input channels and a positive multiple of 4 output
+//! channels, a square 2x2/3x3/5x5/7x7 filter, no dilation, equal strides of 1
+//! or 2, cross-correlation only and any bias mode except BiasMode::BIAS.
+bool is_usable(
+        const DTypeEnum src_dtype, const DTypeEnum filter_dtype,
+        const DTypeEnum dst_dtype,
+        const ConvolutionBase::CanonizedFilterMeta& fm,
+        const BiasMode bias_mode, const param::ConvBias::NonlineMode nonline_mode) {
+    const bool ok_type = src_dtype == DTypeEnum::Float32 &&
+                         filter_dtype == DTypeEnum::Float32 &&
+                         dst_dtype == DTypeEnum::Float32 &&
+                         fm.format == param::Convolution::Format::NCHW44;
+    const bool ok_nonline =
+            nonline_mode == param::ConvBias::NonlineMode::IDENTITY ||
+            nonline_mode == param::ConvBias::NonlineMode::RELU ||
+            nonline_mode == param::ConvBias::NonlineMode::SIGMOID ||
+            nonline_mode == param::ConvBias::NonlineMode::H_SWISH;
+    const bool ok_src_dst =
+            fm.icpg < 4 && (fm.ocpg % 4 == 0 && fm.ocpg >= 4) && fm.group == 1;
+
+    const bool ok_filter =
+            fm.spatial_ndim == 2 && fm.spatial[0] == fm.spatial[1] &&
+            (fm.spatial[0] == 2 || fm.spatial[0] == 3 || fm.spatial[0] == 5 ||
+             fm.spatial[0] == 7);
+    //! both strides are required to be equal, so testing stride[0] is enough
+    const bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
+                          fm.stride[0] == fm.stride[1] &&
+                          (fm.stride[0] == 1 || fm.stride[0] == 2);
+    const bool ok_conv = !fm.should_flip && bias_mode != BiasMode::BIAS;
+    return ok_type && ok_nonline && ok_src_dst && ok_filter && ok_slide && ok_conv;
+}
+
+}  // namespace
+
+bool ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::usable(
+        const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
+    return is_usable(
+            param.src_type.enumv(), param.filter_type.enumv(), param.dst_type.enumv(),
+            param.filter_meta, param.bias_mode, param.nonlineMode);
+}
+
+//! Total size of the bundle described by get_bundle(); a temporary conv_bias
+//! operator is created on param.handle only to answer the workspace query.
+size_t ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::get_workspace(
+        const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
+            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::get_workspace"_hash)) {
+        auto conv_bias_op = param.handle->create_operator();
+        return get_bundle(param, conv_bias_op).total_size_in_bytes();
+    }
+    MIDOUT_END();
+    return 0;
+}
+
+//! Emit the kernels of the agent algorithm, in execution order:
+//!   1. relayout the NCHW44 filter (and channel bias, if any) to NCHW in the
+//!      workspace;
+//!   2. the kernels of the heuristically chosen NCHW conv_bias algorithm,
+//!      redirected to read filter/bias from and write dst to the workspace;
+//!   3. relayout the NCHW result from the workspace into the NCHW44 dst.
+SmallVector ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::
+        dispatch_kerns(const NCBKernSizeParam& k_param) const {
+    SmallVector ret_kerns;
+
+    MIDOUT_BEGIN(
+            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
+            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::dispatch_kerns"_hash)) {
+        auto filter_and_bias_dimshuffle = [](const NCBKernParam& kern_param,
+                                             const NCBKernIndex&) {
+            auto layouts = get_layouts(kern_param);
+            auto filter_layout_44 = layouts[LAYOUT_FILTER44];
+            auto bias_layout44 = layouts[LAYOUT_BIAS44];
+            auto filter_layout_reshape = layouts[LAYOUT_FILTER_RESHAPE];
+            auto bias_layout_reshape = layouts[LAYOUT_BIAS_RESHAPE];
+
+            auto conv_bias_op = kern_param.handle->create_operator();
+            auto bundle = get_bundle(kern_param, conv_bias_op);
+            bundle.set(kern_param.workspace_ptr);
+            auto weight_ws = bundle.get(WS_FILTER);
+            auto bias_ws = bundle.get(WS_BIAS);
+
+            //! relayout bias and weight
+            TensorND chw_weight_t = TensorND(weight_ws, filter_layout_reshape);
+            TensorND weight44_t = TensorND(
+                    kern_param.filter_ptr.get_ptr(),
+                    filter_layout_44.dimshuffle({0, 4, 3, 1, 2}));
+            auto relayout_op = inplace_cpu_handle()->create_operator();
+            relayout_op->exec(weight44_t, chw_weight_t);
+
+            //! the bias layouts are empty (ndim == 0) when there is no bias
+            if (bias_layout44.ndim != 0) {
+                TensorND chw_bias_t = TensorND(bias_ws, bias_layout_reshape);
+                TensorND bias44_t = TensorND(
+                        kern_param.bias_ptr.get_ptr(),
+                        bias_layout44.dimshuffle({0, 1, 4, 2, 3}));
+                relayout_op->exec(bias44_t, chw_bias_t);
+            }
+        };
+        ret_kerns.push_back({filter_and_bias_dimshuffle, {1}});
+
+        //! do chw conv: pick an NCHW algorithm and wrap each of its kernels
+        {
+            auto layouts = get_layouts(k_param);
+            auto src_layout = layouts[LAYOUT_SRC];
+            auto filter_layout = layouts[LAYOUT_FILTER];
+            auto bias_layout = layouts[LAYOUT_BIAS];
+            auto dst_layout = layouts[LAYOUT_DST];
+
+            auto conv_bias_op = k_param.handle->create_operator();
+            conv_bias_op->param() = get_param_convbias(k_param);
+            auto dummy_z = TensorND();
+            auto&& conv_bias_algo =
+                    static_cast(conv_bias_op.get())
+                            ->get_algorithm_heuristic(
+                                    src_layout, filter_layout, bias_layout,
+                                    dummy_z.layout, dst_layout,
+                                    std::numeric_limits::max(),
+                                    AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT);
+            //! the delegated algorithm is dispatched for the NCHW format
+            auto new_param = k_param;
+            new_param.filter_meta.format = ConvBias::Param::Format::NCHW;
+            auto&& conv_bias_kerns =
+                    static_cast(conv_bias_algo)->dispatch_kerns(new_param);
+            for (const auto& kernel : conv_bias_kerns) {
+                //! NOTE(review): every invocation creates an operator and
+                //! recomputes the bundle; if the size param seen here always
+                //! equals k_param this could be computed once at dispatch time
+                //! -- confirm before hoisting.
+                auto run = [kernel](
+                                   const NCBKernParam& p,
+                                   const NCBKernIndex& ncb_index) {
+                    auto conv_bias_op = p.handle->create_operator();
+                    auto bundle = get_bundle(p, conv_bias_op);
+                    bundle.set(p.workspace_ptr);
+                    auto weight_ws = bundle.get(WS_FILTER);
+                    auto bias_ws = bundle.get(WS_BIAS);
+                    auto chw_conv_ws = bundle.get(WS_CONV);
+                    auto chw_conv_ws_size = bundle.get_size(WS_CONV);
+                    auto chw_conv_dst_ws = bundle.get(WS_CONV_DST);
+
+                    //! redirect the wrapped kernel to the relayouted buffers
+                    auto param = p;
+                    param.filter_ptr = weight_ws;
+                    param.bias_ptr = bias_ws;
+                    param.dst_ptr = chw_conv_dst_ws;
+                    param.workspace_ptr = chw_conv_ws;
+                    param.workspace_size = chw_conv_ws_size;
+                    kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
+                };
+                ret_kerns.push_back({run, kernel.global_size});
+            }
+        }
+
+        auto dest_dimshuffle = [](const NCBKernParam& kern_param, const NCBKernIndex&) {
+            auto layouts = get_layouts(kern_param);
+            auto dst_layout44 = layouts[LAYOUT_DST44];
+            auto dst_layout_reshape = layouts[LAYOUT_DST_RESHAPE];
+
+            auto conv_bias_op = kern_param.handle->create_operator();
+            auto bundle = get_bundle(kern_param, conv_bias_op);
+            bundle.set(kern_param.workspace_ptr);
+            auto chw_conv_dst_ws = bundle.get(WS_CONV_DST);
+
+            //! relayout dst to dst44 tensor
+            TensorND chw44_dst_t = TensorND(kern_param.dst_ptr.get_ptr(), dst_layout44);
+            auto relayout_op = inplace_cpu_handle()->create_operator();
+            relayout_op->exec(
+                    {chw_conv_dst_ws, dst_layout_reshape.dimshuffle({0, 1, 3, 4, 2})},
+                    chw44_dst_t);
+        };
+        ret_kerns.push_back({dest_dimshuffle, {1}});
+        return ret_kerns;
+    }
+    MIDOUT_END();
+    //! reached only when the midout region above is compiled out
+    return {};
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
index 7fc33115f1b192dd368a7bdc8e5886c2f5d17c97..7dab5bf848e8060861bb854c7153b903c9143b18 100644
--- a/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
+++ b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
@@ -1,16 +1,3 @@
-/**
- * \file
- dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. - */ - #include "megdnn/oprs.h" #include "src/common/nchw_nchwxx_valid.h" #include "src/common/opr_delegate.h" diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index 145b285b659063cd2399deb31d4d99180ebef539..897dbc95a792ef0d5233f958bf82e608035dba17 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -84,7 +84,8 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { AlgoBase::Mapper m_all_algos_map; SmallVector m_gi_winograd_algos; - AlgoF32DirectNCHWNCHW44 f32_direct_stride2_nchw_nchw44; + AlgoF32DirectNCHWNCHW44 f32_nchw_nchw44; + AlgoF32DirectNCHWNCHW44AGENT f32_nchw_nchw44_agent; AlgoF32ChannelWiseNCHW44 f32_chanel_wise_nchw44; AlgoF32DirectNCHW44 f32_direct_nchw44; @@ -94,8 +95,17 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { public: AlgoPack() { - // fallback gi fp32 algo - m_all_algos.emplace_back(&f32_direct_stride2_nchw_nchw44); + //! fallback gi fp32 algo + //! now f32_nchw_nchw44_agent is faster than f32_nchw_nchw44 + //! on x86 and rvv platforms, so we adjust the heuristic order. 
+#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + m_all_algos.emplace_back(&f32_nchw_nchw44); + m_all_algos.emplace_back(&f32_nchw_nchw44_agent); +#else + m_all_algos.emplace_back(&f32_nchw_nchw44_agent); + m_all_algos.emplace_back(&f32_nchw_nchw44); +#endif + m_all_algos.emplace_back(&f32_chanel_wise_nchw44); m_all_algos.emplace_back(&f32_direct_nchw44); m_all_algos.emplace_back(&f32_direct_stride1); @@ -471,7 +481,8 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( param().compute_mode, nr_threads, reinterpret_cast( - preprocessed_filter)}, + preprocessed_filter), + handle()}, bias.dtype, bias.stride[0], bias_mode, @@ -491,6 +502,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( ret.dst_ptr = dst.get_ref_ptr(); ret.workspace_ptr = workspace.raw_ptr; ret.workspace_size = workspace.size; + ret.handle = handle(); return ret; } diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h index c07893adca8bbe94adf3466163ed07df6ac40026..d39ed2d36f1b3946c6e02ebf15636f464584ea1a 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.h +++ b/dnn/src/fallback/conv_bias/opr_impl.h @@ -228,6 +228,7 @@ public: GI_COMMON_DIRECT_STRD2_FP32, GI_COMMON_DIRECT_NCHW44_FP32, GI_COMMON_DIRECT_NCHW_NCHW44_FP32, + GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32, GI_COMMON_CHWNWISE_NCHW44_F32, #if MEGDNN_X86 @@ -389,6 +390,7 @@ private: class AlgoF32DirectStride1; class AlgoF32DirectStride2; class AlgoF32DirectNCHWNCHW44; + class AlgoF32DirectNCHWNCHW44AGENT; class AlgoF32ChannelWiseNCHW44; class AlgoF32DirectNCHW44; diff --git a/dnn/src/fallback/convolution/opr_impl.cpp b/dnn/src/fallback/convolution/opr_impl.cpp index 84f2a7ccd9ad9873d7e13634141f0d52fc2a486d..54ec18aa3998e52cf716ed5c16b3e0e0c3beae40 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ b/dnn/src/fallback/convolution/opr_impl.cpp @@ -242,7 +242,8 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( {dst.stride[0], dst.stride[1], dst.stride[2], 
dst.stride[3]},
             param().compute_mode,
             nr_threads,
-            preprocessed_filter};
+            preprocessed_filter,
+            handle()};
 }
 
 ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
diff --git a/dnn/src/fallback/convolution/opr_impl.h b/dnn/src/fallback/convolution/opr_impl.h
index f456880481da2ac086ba74e5f7b2c8b2e9a7c4b3..b5c1613c2500fd0062f19bfaef04f96c581da3ba 100644
--- a/dnn/src/fallback/convolution/opr_impl.h
+++ b/dnn/src/fallback/convolution/opr_impl.h
@@ -101,6 +101,7 @@ public:
         const PreprocessedFilter* preprocessed_filter;
         //! get the data type category of the param for select the algo
         AlgoDataType deduce_algo_data_type() const;
+        //! handle of the operator that built this param (filled from handle()
+        //! by make_ncb_kern_size_param); lets an algo create sub-operators.
+        //! NOTE(review): non-owning raw pointer with no default initializer --
+        //! confirm every place that constructs this struct sets it.
+        Handle* handle;
     };
 
     //! memory param for kernels with non-contiguous batch
diff --git a/dnn/test/fallback/conv_bias.cpp b/dnn/test/fallback/conv_bias.cpp
index dd9e2e491401a508039c21bf2c3bbfb1322c4ca5..3f6eab53b61c823aa07a01d355624af46e2a5195 100644
--- a/dnn/test/fallback/conv_bias.cpp
+++ b/dnn/test/fallback/conv_bias.cpp
@@ -354,6 +354,31 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
             handle(), "F32_CONV_NCHW_NCHW44");
 }
 
+//! Checks F32_CONV_AGENT_NCHW_NCHW44 for kernel sizes {2, 3, 5, 7} with
+//! ONLY_IDENTITY_NLMODE, generating one test per bias mode (_MODE).
+//! NOTE(review): the literal 2 is presumably the stride (matches "S2" in the
+//! test name) -- confirm against get_nchw44_conv_bias_args.
+#define CB(_MODE, _SUFFIX)                                                           \
+    TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_##_SUFFIX) { \
+        check_conv_bias(                                                             \
+                conv_bias::get_nchw44_conv_bias_args(                                \
+                        {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, {_MODE}, 2, false, true), \
+                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                             \
+    }
+CB(megdnn::BiasMode::NO_BIAS, NO_BIAS);
+CB(megdnn::BiasMode::BROADCAST_CHANNEL_BIAS, BROADCAST_CHANNEL_BIAS);
+#undef CB
+
+//! Checks the same algorithm with ONLY_BR_BIASMODE, generating one test per
+//! nonlinearity (_MODE); the literal 1 is presumably the stride ("S1").
+//! NOTE(review): the generated names read ..._S1_AGENT_IDENTITY_<SUFFIX> even
+//! for RELU/H_SWISH/SIGMOID; the hard-coded "IDENTITY_" looks unintended.
+#define CB(_MODE, _SUFFIX)                                                        \
+    TEST_F(FALLBACK_MULTI_THREADS,                                                \
+           CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_##_SUFFIX) {             \
+        check_conv_bias(                                                          \
+                conv_bias::get_nchw44_conv_bias_args(                             \
+                        {2, 3, 5, 7}, {_MODE}, ONLY_BR_BIASMODE, 1, false, true), \
+                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                          \
+    }
+CB(param::ConvBias::NonlineMode::IDENTITY, IDENTITY);
+CB(param::ConvBias::NonlineMode::RELU, RELU);
+CB(param::ConvBias::NonlineMode::H_SWISH, H_SWISH);
+CB(param::ConvBias::NonlineMode::SIGMOID, SIGMOID);
+#undef CB
+
 std::vector get_nchw44_channel_wise_args(
         std::vector kernel, size_t stride, bool no_bias, bool no_nonlinemode,
         bool no_full_bias) {