From 91aaafd587a2b8c3c300234bdf004848e59288bc Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 11 May 2022 14:37:41 +0800 Subject: [PATCH] feat(fallback): move arm_common pooling f32 algo to fallback gi GitOrigin-RevId: 1bddd6dc2c8219a85b61badabb66015969f2ae7f --- dnn/src/arm_common/pooling/algo.h | 17 +- dnn/src/arm_common/pooling/opr_impl.cpp | 36 -- dnn/src/arm_common/pooling/opr_impl.h | 46 +- dnn/src/fallback/general_intrinsic/gi_float.h | 32 + dnn/src/fallback/gi_intrinsic_helper.h | 126 ++++ dnn/src/fallback/pooling/gi/algo.cpp | 403 ++++++++++++ dnn/src/fallback/pooling/gi/algo.h | 103 ++++ .../pooling/gi}/algo_fp32_pooling_nchw44.cpp | 18 +- .../gi/do_max_pooling_3x3_s2x2_float.cpp | 157 +++++ .../gi/do_max_pooling_3x3_s2x2_float.h | 26 + .../pooling/gi/do_max_pooling_w4x4_s2x2.cpp | 89 +++ .../pooling/gi/do_max_pooling_w4x4_s2x2.h | 24 + .../pooling/gi/kern_fp32_pooling_nchw44.h | 306 ++++++++++ dnn/src/fallback/pooling/gi/pooling_helper.h | 572 ++++++++++++++++++ dnn/src/fallback/pooling/opr_impl.cpp | 186 +++++- dnn/src/fallback/pooling/opr_impl.h | 135 ++++- dnn/src/x86/pooling/algo.h | 4 +- dnn/test/fallback/gi.cpp | 38 ++ dnn/test/fallback/pooling.cpp | 560 +++++++++++++++++ 19 files changed, 2763 insertions(+), 115 deletions(-) create mode 100644 dnn/src/fallback/gi_intrinsic_helper.h create mode 100644 dnn/src/fallback/pooling/gi/algo.cpp create mode 100644 dnn/src/fallback/pooling/gi/algo.h rename dnn/src/{arm_common/pooling => fallback/pooling/gi}/algo_fp32_pooling_nchw44.cpp (93%) create mode 100644 dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp create mode 100644 dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h create mode 100644 dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp create mode 100644 dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h create mode 100644 dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h create mode 100644 dnn/src/fallback/pooling/gi/pooling_helper.h create mode 100644 dnn/test/fallback/pooling.cpp diff --git a/dnn/src/arm_common/pooling/algo.h b/dnn/src/arm_common/pooling/algo.h index d0bbfa39b..df932d1db 100644 --- a/dnn/src/arm_common/pooling/algo.h +++ b/dnn/src/arm_common/pooling/algo.h @@ -12,7 +12,7 @@ #pragma once #include "src/arm_common/pooling/opr_impl.h" #include "src/arm_common/pooling/pooling_helper.h" -#include "src/common//utils.h" +#include "src/common/utils.h" #include "src/naive/handle.h" namespace megdnn { @@ -134,22 +134,15 @@ public: void exec(const PoolingKernParam& param) const override; MEGDNN_DECL_ALGO_TYPE(ARM_Filter5ModexStridexNCHW44) }; -class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase { -public: - AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; - const char* name() const override { - return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44"; - } - bool usable(const PoolingKernSizeParam& param) const override; - void exec(const PoolingKernParam& param) const override; - MEGDNN_DECL_ALGO_TYPE(ARM_Fp32ModexStridexNCHW44) -}; + class PoolingImpl::AlgoFallback final : public AlgoBase { public: AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; const char* name() const override { return "FALLBACK_POOLING"; } bool usable(const PoolingKernSizeParam&) const override { return true; } - void exec(const PoolingKernParam&) const override {} + void exec(const PoolingKernParam&) const override { + megdnn_assert(false, "code issue happened!!"); + } MEGDNN_DECL_ALGO_TYPE(ARM_Fallback) }; 
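The core of this change is the "GI" (general intrinsic) layer: the fp32 pooling kernels that used to live in dnn/src/arm_common and call NEON directly are rewritten against src/fallback/general_intrinsic/gi_float.h, so one source file can compile to NEON on ARM, SSE2 on x86, or plain scalar code elsewhere; the GiPaddFloat32/GiPmaxFloat32 helpers added below follow exactly that three-way branch. The following minimal sketch shows the pattern for illustration only; it uses compiler-defined macros instead of the library's GI_NEON_INTRINSICS/GI_SSE2_INTRINSICS switches and is not the actual gi_float.h implementation.

// Illustrative sketch of a GI-style wrapper (assumption: not the real gi_float.h).
#if defined(__ARM_NEON)
#include <arm_neon.h>
typedef float32x4_t GI_FLOAT32_t;
static inline GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) {
    return vmaxq_f32(a, b);  // maps to a single NEON instruction
}
#elif defined(__SSE2__)
#include <emmintrin.h>
typedef __m128 GI_FLOAT32_t;
static inline GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) {
    return _mm_max_ps(a, b);  // maps to a single SSE instruction
}
#else
#include <algorithm>
struct GI_FLOAT32_t {
    float v[4];
};
static inline GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) {
    GI_FLOAT32_t r;  // portable scalar fallback, lane by lane
    for (int i = 0; i < 4; ++i)
        r.v[i] = std::max(a.v[i], b.v[i]);
    return r;
}
#endif

With that indirection the pooling kernels below only ever spell GiLoadFloat32/GiMaximumFloat32/GiStoreFloat32, and the arm_common AlgoFallback above can simply assert, since the cases it used to catch are now handled by the fallback-level GI algorithms.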
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param); diff --git a/dnn/src/arm_common/pooling/opr_impl.cpp b/dnn/src/arm_common/pooling/opr_impl.cpp index 4a7c27a79..992b7b372 100644 --- a/dnn/src/arm_common/pooling/opr_impl.cpp +++ b/dnn/src/arm_common/pooling/opr_impl.cpp @@ -32,7 +32,6 @@ private: AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4; AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4; AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4; - AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44; AlgoFallback algo_fallback; public: @@ -49,7 +48,6 @@ public: all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4); all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4); all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4); - all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44); all_algos.emplace_back(&algo_fallback); for (auto&& algo : all_algos) { @@ -62,40 +60,6 @@ public: PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; -PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param( - fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) { - auto safe_u32 = [](size_t v) -> uint32_t { - megdnn_assert( - v <= std::numeric_limits::max(), "value too large: %zu", v); - return v; - }; - return {safe_u32(src.shape[0]), - safe_u32(src.shape[1]), - {{safe_u32(src.shape[2]), safe_u32(src.shape[3])}}, - {{safe_u32(dst.shape[2]), safe_u32(dst.shape[3])}}, - {{safe_u32(opr->param().pad_h), safe_u32(opr->param().pad_w)}}, - {{safe_u32(opr->param().window_h), safe_u32(opr->param().window_w)}}, - {{safe_u32(opr->param().stride_h), safe_u32(opr->param().stride_w)}}, - src.dtype, - dst.dtype, - opr->handle(), - opr->param().format, - opr->param().mode}; -}; - -PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( - fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, - _megdnn_workspace workspace) { - PoolingKernParam ret; - static_cast(ret) = - make_pooling_kern_szie_param(opr, src.layout, dst.layout); - ret.src_ptr = src.get_ref_ptr(); - ret.dst_ptr = dst.get_ref_ptr(); - ret.workspace_ptr = workspace.raw_ptr; - ret.workspace_size = workspace.size; - return ret; -}; - size_t PoolingImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& dst) { TensorLayoutArray layouts{src, dst}; diff --git a/dnn/src/arm_common/pooling/opr_impl.h b/dnn/src/arm_common/pooling/opr_impl.h index 9f2590e15..1715fcd38 100644 --- a/dnn/src/arm_common/pooling/opr_impl.h +++ b/dnn/src/arm_common/pooling/opr_impl.h @@ -19,6 +19,10 @@ namespace arm_common { class PoolingImpl final : public fallback::PoolingImpl { private: + //! TODO: remove + //! AlgoFilterxModexStride1/AlgoFilter2ModexStride2 + //! AlgoFilter3AverageStride2/AlgoFilter4MaxStride2/AlgoFilter5MaxStride2 + //! 
after imp gi with float16 and int8 support to dnn/src/fallback/pooling/opr_impl.h class AlgoFilterxModexStride1; class AlgoFilter2ModexStride2; class AlgoFilter3MaxStride2; @@ -31,7 +35,6 @@ private: class AlgoFilter3ModexStridexNCHW44; class AlgoFilter4ModexStridexNCHW44; class AlgoFilter5ModexStridexNCHW44; - class AlgoFp32ModexStridexNCHW44; class AlgoFallback; class AlgoPack; static AlgoPack sm_algo_pack; @@ -45,47 +48,10 @@ public: static size_t constexpr MAX_SPATIAL_DIM = 2; - struct PoolingKernSizeParam { - uint32_t n, ic; - std::array isz, osz; - std::array padding, filter, stride; - DType src_type, dst_type; - Handle* handle; - Param::Format format; - Mode mode; - }; - - struct PoolingKernParam : public PoolingKernSizeParam { - RefPtr src_ptr; - RefPtr dst_ptr; - void* workspace_ptr; - size_t workspace_size; - - template - const T* src() const { - src_type.assert_is_compatible_ctype(); - return static_cast(src_ptr.get_ptr()); - } - - template - T* dst() const { - dst_type.assert_is_compatible_ctype(); - return static_cast(dst_ptr.get_ptr()); - } - - template - T* workspace() const { - return static_cast(workspace_ptr); - } - }; + using PoolingKernSizeParam = fallback::PoolingImpl::PoolingKernSizeParam; - PoolingKernSizeParam make_pooling_kern_szie_param( - fallback::PoolingImpl* opr, const TensorLayout& src, - const TensorLayout& dst); + using PoolingKernParam = fallback::PoolingImpl::PoolingKernParam; - PoolingKernParam make_pooling_kern_param( - fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, - _megdnn_workspace workspace); class AlgoBase : public detail::Algorithm { public: enum class AlgoType : uint32_t { diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index e2910fa69..86f818820 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -1325,3 +1325,35 @@ GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) { return ___gi_vget_high_f32(a); #endif } + +GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) { +#if defined(GI_NEON_INTRINSICS) + return vpadd_f32(a, b); +#elif defined(GI_SSE2_INTRINSICS) + float32x2_t res; + res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1]; + res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1]; + return res; +#else + float32x2_t res; + res[0] = a[0] + a[1]; + res[1] = b[0] + b[1]; + return res; +#endif +} + +GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) { +#if defined(GI_NEON_INTRINSICS) + return vpmax_f32(a, b); +#elif defined(GI_SSE2_INTRINSICS) + float32x2_t res; + res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]); + res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]); + return res; +#else + float32x2_t res; + res[0] = MAX_NAN(a[0], a[1]); + res[1] = MAX_NAN(b[0], b[1]); + return res; +#endif +} diff --git a/dnn/src/fallback/gi_intrinsic_helper.h b/dnn/src/fallback/gi_intrinsic_helper.h new file mode 100644 index 000000000..2b97a283f --- /dev/null +++ b/dnn/src/fallback/gi_intrinsic_helper.h @@ -0,0 +1,126 @@ +/** + * \file dnn/src/fallback/gi_intrinsic_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "src/common/unroll_macro.h" +#include "src/fallback/general_intrinsic/gi_float.h" +namespace megdnn { +namespace { + +template < + int weight_number, int base_offset, int ptr_step, int oc_block, typename Func, + typename T, typename T2, typename... XT> +struct LoadHelper { + static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); +}; + +#define WEIGHT_CB(step) \ + src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); + +#define LOAD_HELPER(step) \ + template < \ + int base_offset, int ptr_step, typename Func, typename T, typename T2, \ + typename... XT> \ + struct LoadHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int, XT... args) { \ + UNROLL_CALL_RAW(step, WEIGHT_CB); \ + } \ + } + +LOAD_HELPER(1); +LOAD_HELPER(2); +LOAD_HELPER(3); +LOAD_HELPER(4); +LOAD_HELPER(5); +LOAD_HELPER(6); +LOAD_HELPER(7); +LOAD_HELPER(8); +LOAD_HELPER(9); +LOAD_HELPER(10); +LOAD_HELPER(11); +LOAD_HELPER(12); +LOAD_HELPER(13); +LOAD_HELPER(14); +LOAD_HELPER(15); +LOAD_HELPER(16); + +#undef LOAD_HELPER +#undef WEIGHT_CB + +///////////////////////////c_dim = 1///////////////////////// +#define WEIGHT_CB(step) src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); + +#define LOAD_HELPER(step) \ + template \ + struct LoadHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \ + UNROLL_CALL_RAW(step, WEIGHT_CB); \ + } \ + } + +LOAD_HELPER(1); +LOAD_HELPER(2); +LOAD_HELPER(3); +LOAD_HELPER(4); +LOAD_HELPER(5); +LOAD_HELPER(6); +LOAD_HELPER(7); +LOAD_HELPER(8); +LOAD_HELPER(9); + +#undef LOAD_HELPER +#undef WEIGHT_CB + +/////////////////////////c_dim = 2/////////////////////////////// +#define WEIGHT_CB(step) \ + src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \ + src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset); + +#define LOAD_HELPER(step) \ + template \ + struct LoadHelper { \ + static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \ + UNROLL_CALL_RAW(step, WEIGHT_CB); \ + } \ + } + +LOAD_HELPER(1); +LOAD_HELPER(2); +LOAD_HELPER(3); +LOAD_HELPER(4); +LOAD_HELPER(5); +LOAD_HELPER(6); +LOAD_HELPER(7); +LOAD_HELPER(8); + +#undef LOAD_HELPER +#undef WEIGHT_CB + +template < + int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, + typename T, typename T2> +GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) { + LoadHelper::impl( + weight, ptr, oc_offset); +} + +template < + int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, + typename T, typename T2, typename... XT> +GI_FORCEINLINE void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { + LoadHelper::impl( + weight, ptr, oc_offset, args...); +} + +} // namespace +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/algo.cpp b/dnn/src/fallback/pooling/gi/algo.cpp new file mode 100644 index 000000000..ef93838c3 --- /dev/null +++ b/dnn/src/fallback/pooling/gi/algo.cpp @@ -0,0 +1,403 @@ +/** + * \file dnn/src/fallback/pooling/gi/algo.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "algo.h" +#include "do_max_pooling_w4x4_s2x2.h" +#include "megdnn/opr_param_defs.h" + +#include "midout.h" + +MIDOUT_DECL(megdnn_fallback_gi_pooling) + +namespace megdnn { +namespace fallback { + +WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param) { + megdnn_assert( + param.src_type.category() == DTypeCategory::FLOAT && + param.format == param::Pooling::Format::NCHW && + (param.mode == param::Pooling::Mode::MAX || + (param.mode == param::Pooling::Mode::AVERAGE && param.filter[0] == 3)) && + param.filter[0] == param.filter[1] && + (param.filter[0] == 3 || param.filter[1] == 5) && param.stride[0] == 2 && + param.stride[1] == 2 && param.isz[0] >= 2 && param.isz[1] >= 2); + //! max pooling nxn stride 2 + auto IW = param.isz[1]; + auto OW = param.osz[1]; + + // In order to process odd size filter, + // Firstly, Store a row of the input separately by odd and even numbers + // Then process them, get a row of the outputs + // We need to store n rows of results + SmallVector needed_mem; + for (size_t i = 0; i < param.filter[0]; ++i) + needed_mem.push_back(OW * param.src_type.size()); + needed_mem.push_back((IW + 1) / 2 * param.src_type.size()); + needed_mem.push_back((IW + 1) / 2 * param.src_type.size()); + WorkspaceBundle ws(nullptr, needed_mem, 16); + return ws; +} + +bool PoolingImpl::AlgoGiFilterxModexStride1::usable( + const PoolingKernSizeParam& param) const { + auto SH = param.stride[0]; + auto SW = param.stride[1]; + auto FH = param.filter[0]; + auto FW = param.filter[1]; + + bool avaible = param.src_type.category() == DTypeCategory::FLOAT && + param.format == Param::Format::NCHW && SH == 1 && SW == 1 && + FH == FW && (FH == 2 || FH == 3); + bool is_mode_ok = (param.mode == Mode::MAX || param.mode == Mode::AVERAGE); + return avaible && is_mode_ok; +} + +void PoolingImpl::AlgoGiFilterxModexStride1::exec(const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + auto FH = param.filter[0]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(Pooler, GiPooler, window, midout_type_id) \ + MIDOUT_BEGIN( \ + megdnn_fallback_gi_pooling, midout_iv(0), midout_iv(midout_type_id), \ + Pooler::MIDOUT_CASE_NUM, GiPooler::MIDOUT_CASE_NUM, window) { \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ + src_dtype = param.src_type](size_t index, size_t) { \ + size_t n = index / C; \ + size_t c = index % C; \ + do_pooling_compact( \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + \ + n * C * OH * OW + c * OH * OW, \ + src_dtype, IH, IW, OH, OW, PH, PW); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END() + +#define DISPATCH_WINDOW(Pooler, GiPooler, dtype, ctype, comp_type, midout_type_id) \ + switch (FH) { \ + case 2: { \ + using _Pooler = Pooler<4, dtype, ctype, comp_type>; \ + using _GiPooler = GiPooler<4, dtype, ctype, comp_type>; \ + DISPATCH_FUNC(_Pooler, _GiPooler, 2, midout_type_id); \ + break; \ + } \ + case 3: { \ + using _Pooler = Pooler<9, dtype, ctype, comp_type>; \ + using _GiPooler = GiPooler<9, dtype, ctype, comp_type>; \ + DISPATCH_FUNC(_Pooler, _GiPooler, 3, midout_type_id); \ + break; \ + } \ + default: \ + megdnn_assert(0, "unsupport pooling filter size"); \ + break; \ + } + +#define 
DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \ + switch (param.mode) { \ + case Mode::MAX: \ + DISPATCH_WINDOW( \ + MaxPooler, GiMaxPooler, dtype, ctype, comp_type, midout_type_id); \ + break; \ + case Mode::AVERAGE: \ + DISPATCH_WINDOW( \ + MeanInPooler, GiMeanPooler, dtype, ctype, comp_type, \ + midout_type_id); \ + break; \ + default: \ + megdnn_assert(0, "unsupport pooling mode"); \ + break; \ + } + + if (param.src_type == dtype::Float32{}) { + DISPATCH_MODE(dt_float32, float, float, 0); + } +#undef DISPATCH_FUNC +#undef DISPATCH_WINDOW +#undef DISPATCH_MODE +} +bool PoolingImpl::AlgoGiFilter2ModexStride2::usable( + const PoolingKernSizeParam& param) const { + auto SH = param.stride[0]; + auto SW = param.stride[1]; + auto FH = param.filter[0]; + auto FW = param.filter[1]; + + bool avaible = param.src_type.category() == DTypeCategory::FLOAT && + param.format == Param::Format::NCHW && FH == FW && SH == SW && + FH == 2 && SH == 2; + bool is_mode_ok = (param.mode == Mode::MAX || param.mode == Mode::AVERAGE); + return avaible && is_mode_ok; +} + +void PoolingImpl::AlgoGiFilter2ModexStride2::exec(const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; +#define DISPATCH_FUNC(Pooler, mode, midout_type_id) \ + MIDOUT_BEGIN( \ + megdnn_fallback_gi_pooling, midout_iv(1), midout_iv(midout_type_id), \ + Pooler::MIDOUT_CASE_NUM) { \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ + src_dtype = param.src_type](size_t index, size_t) { \ + size_t n = index / C; \ + size_t c = index % C; \ + do_pooling_2x2( \ + static_cast(src_ptr.get_ptr()) + \ + n * C * IH * IW + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + \ + n * C * OH * OW + c * OH * OW, \ + src_dtype, IH, IW, OH, OW, PH, PW); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END() + +#define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \ + switch (param.mode) { \ + case Mode::MAX: { \ + using _Pooler = MaxPooler<4, dtype, ctype, comp_type>; \ + DISPATCH_FUNC(_Pooler, Mode::MAX, midout_type_id); \ + break; \ + } \ + case Mode::AVERAGE: { \ + using _Pooler = MeanInPooler<4, dtype, ctype, comp_type>; \ + DISPATCH_FUNC(_Pooler, Mode::AVERAGE, midout_type_id); \ + break; \ + } \ + default: \ + megdnn_assert(0, "unsupport pooling mode"); \ + break; \ + } + + if (param.src_type == dtype::Float32{}) { + DISPATCH_MODE(dt_float32, float, float, 0); + } +#undef DISPATCH_FUNC +#undef DISPATCH_PAD +#undef DISPATCH_MODE +} + +bool PoolingImpl::AlgoGiFilter3MaxStride2::usable( + const PoolingKernSizeParam& param) const { + bool avaible = param.src_type.category() == DTypeCategory::FLOAT && + param.format == Param::Format::NCHW && param.mode == Mode::MAX && + param.filter[0] == 3 && param.filter[1] == 3 && + param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && + param.isz[1] >= 2; + return avaible; +} + +void PoolingImpl::AlgoGiFilter3MaxStride2::exec(const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(type, func, midout_type_id) \ + 
MIDOUT_BEGIN( \ + megdnn_fallback_gi_pooling, midout_iv(2), midout_iv(midout_type_id)) { \ + WorkspaceBundle wbundle = get_bundle(param); \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ + workspace_ptr = param.workspace()]( \ + size_t index, size_t thread_id) { \ + auto ws = wbundle; \ + ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ + size_t n = index / C; \ + size_t c = index % C; \ + do_max_pooling_3x3_s2x2_float_gi( \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END(); + + if (param.src_type == dtype::Float32{}) { + DISPATCH_FUNC(float, float, 0); + } +#undef DISPATCH_FUNC +} +bool PoolingImpl::AlgoGiFilter3AverageStride2::usable( + const PoolingKernSizeParam& param) const { + bool avaible = (param.src_type.category() == DTypeCategory::FLOAT) && + param.format == Param::Format::NCHW && param.mode == Mode::AVERAGE && + param.filter[0] == 3 && param.filter[1] == 3 && + param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && + param.isz[1] >= 2; + return avaible; +} + +void PoolingImpl::AlgoGiFilter3AverageStride2::exec( + const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(type, MEGDNN_SIMD_WIDTH, midout_type_id) \ + MIDOUT_BEGIN( \ + megdnn_fallback_gi_pooling, midout_iv(3), midout_iv(midout_type_id)) { \ + WorkspaceBundle wbundle = get_bundle(param); \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ + workspace_ptr = param.workspace()]( \ + size_t index, size_t thread_id) { \ + auto ws = wbundle; \ + ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ + size_t n = index / C; \ + size_t c = index % C; \ + do_average_pooling_3x3_s2x2_gi( \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END(); + if (param.src_type == dtype::Float32{}) { + DISPATCH_FUNC(dt_float32, 4, 0); + } +#undef DISPATCH_FUNC +} +bool PoolingImpl::AlgoGiFilter4MaxStride2::usable( + const PoolingKernSizeParam& param) const { + auto SH = param.stride[0]; + auto SW = param.stride[1]; + auto FH = param.filter[0]; + auto FW = param.filter[1]; + auto OH = param.osz[0], OW = param.osz[1]; + + bool avaible = param.src_type.category() == DTypeCategory::FLOAT && + param.format == Param::Format::NCHW && param.mode == Mode::MAX && + FH == 4 && FW == 4 && SH == 2 && SW == 2 && OH >= 2 && OW >= 2; + return avaible; +} + +void PoolingImpl::AlgoGiFilter4MaxStride2::exec(const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(type, func, midout_type_id) \ + MIDOUT_BEGIN( \ 
+ megdnn_fallback_gi_pooling, midout_iv(4), midout_iv(midout_type_id)) { \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ + src_dtype = param.src_type](size_t index, size_t) { \ + size_t n = index / C; \ + size_t c = index % C; \ + do_max_pooling_w4x4_s2x2_##func##_gi( \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + src_dtype, IH, IW, OH, OW, PH, PW); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END(); + + if (param.src_type == dtype::Float32{}) { + DISPATCH_FUNC(float, float, 0); + } +#undef DISPATCH_FUNC +} +bool PoolingImpl::AlgoGiFilter5MaxStride2::usable( + const PoolingKernSizeParam& param) const { + auto SH = param.stride[0]; + auto SW = param.stride[1]; + auto FH = param.filter[0]; + auto FW = param.filter[1]; + auto OH = param.osz[0], OW = param.osz[1]; + + bool avaible = param.src_type.category() == DTypeCategory::FLOAT && + param.format == Param::Format::NCHW && param.mode == Mode::MAX && + FH == 5 && FW == 5 && SH == 2 && SW == 2 && OH >= 2 && OW >= 2; + return avaible; +} + +void PoolingImpl::AlgoGiFilter5MaxStride2::exec(const PoolingKernParam& param) const { + auto IH = param.isz[0], IW = param.isz[1]; + auto OH = param.osz[0], OW = param.osz[1]; + auto N = param.n, C = param.ic; + auto PH = param.padding[0]; + auto PW = param.padding[1]; + + auto src_ptr = param.src_ptr; + auto dst_ptr = param.dst_ptr; + +#define DISPATCH_FUNC(dtype, type, midout_type_id, MEGDNN_SIMD_WIDTH) \ + MIDOUT_BEGIN( \ + megdnn_fallback_gi_pooling, midout_iv(5), midout_iv(midout_type_id)) { \ + WorkspaceBundle wbundle = get_bundle(param); \ + auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ + workspace_ptr = param.workspace()]( \ + size_t index, size_t thread_id) { \ + auto ws = wbundle; \ + ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ + size_t n = index / C; \ + size_t c = index % C; \ + do_max_pooling_w5x5_s2x2_gi( \ + static_cast(src_ptr.get_ptr()) + n * C * IH * IW + \ + c * IH * IW, \ + static_cast(dst_ptr.get_ptr()) + n * C * OH * OW + \ + c * OH * OW, \ + IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ + static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ + } \ + MIDOUT_END(); + + if (param.src_type == dtype::Float32{}) { + DISPATCH_FUNC(dt_float32, float, 0, 4); + } +#undef DISPATCH_FUNC +} + +} // namespace fallback +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/algo.h b/dnn/src/fallback/pooling/gi/algo.h new file mode 100644 index 000000000..d7dd448ef --- /dev/null +++ b/dnn/src/fallback/pooling/gi/algo.h @@ -0,0 +1,103 @@ +/** + * \file dnn/src/fallback/pooling/gi/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "src/common/utils.h" +#include "src/fallback/pooling/opr_impl.h" + +#include "pooling_helper.h" + +#include "src/naive/handle.h" +#include "src/naive/pooling/opr_impl.h" + +namespace megdnn { +namespace fallback { + +using AlgoBase = PoolingImpl::AlgoBase; + +class PoolingImpl::AlgoGiFilterxModexStride1 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_STRIDE1"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_FilterxModexStride1) +}; + +class PoolingImpl::AlgoGiFilter2ModexStride2 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_STRIDE2"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Filter2ModexStride2) +}; +class PoolingImpl::AlgoGiFilter3MaxStride2 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_FILTER3_MAX"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Filter3MaxStride2) +}; + +class PoolingImpl::AlgoGiFilter3AverageStride2 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_FILTER3_AVERAGE"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Filter3AverageStride2) +}; + +class PoolingImpl::AlgoGiFilter4MaxStride2 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_FILTER4_MAX"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Filter4MaxStride2) +}; + +class PoolingImpl::AlgoGiFilter5MaxStride2 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_FILTER5_MAX"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Filter5MaxStride2) +}; + +class PoolingImpl::AlgoGiFp32ModexStridexNCHW44 final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "GI_POOLING_FP32_MODEX_STRIDEX_NCHW44"; } + bool usable(const PoolingKernSizeParam& param) const override; + void exec(const PoolingKernParam& param) const override; + MEGDNN_DECL_ALGO_TYPE(GI_Fp32ModexStridexNCHW44) +}; + +class PoolingImpl::AlgoFallback final : public AlgoBase { +public: + AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; + const char* name() const override { return "FALLBACK_NOT_GI_POOLING"; } + bool usable(const PoolingKernSizeParam&) const override { return true; } + void exec(const PoolingKernParam& /*param*/) 
const override { + megdnn_assert(false, "code issue happened!!"); + } + MEGDNN_DECL_ALGO_TYPE(FallbackNotGI) +}; +WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam&); + +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp b/dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp similarity index 93% rename from dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp rename to dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp index 17e2d71e2..a405d60da 100644 --- a/dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp +++ b/dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp @@ -1,5 +1,5 @@ /** - * \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp + * \file dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. @@ -10,17 +10,17 @@ * implied. */ +#include "algo.h" +#include "kern_fp32_pooling_nchw44.h" #include "megdnn/opr_param_defs.h" -#include "src/arm_common/pooling/algo.h" -#include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h" #include "midout.h" -MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44) +MIDOUT_DECL(megdnn_fallback_fp32_pooling_nchw44) namespace megdnn { -namespace arm_common { -bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable( +namespace fallback { +bool PoolingImpl::AlgoGiFp32ModexStridexNCHW44::usable( const PoolingKernSizeParam& param) const { uint32_t sh = param.stride[0]; uint32_t sw = param.stride[1]; @@ -37,7 +37,7 @@ bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable( return avaible && size_ok; } -void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( +void PoolingImpl::AlgoGiFp32ModexStridexNCHW44::exec( const PoolingKernParam& param) const { int ih = param.isz[0]; int iw = param.isz[1]; @@ -55,7 +55,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( #define DISPATCH_FUNC(filter, stride, mode) \ MIDOUT_BEGIN( \ - megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \ + megdnn_fallback_fp32_pooling_nchw44, midout_iv(0), \ midout_iv(#filter #stride #mode##_hash)) { \ auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \ const int c_idx = index; \ @@ -135,7 +135,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( #undef DISPATCH_FUNC } -} // namespace arm_common +} // namespace fallback } // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp b/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp new file mode 100644 index 000000000..f4663cc08 --- /dev/null +++ b/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp @@ -0,0 +1,157 @@ +/** + * \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "src/common/utils.h" + +#include +#include +#include "do_max_pooling_3x3_s2x2_float.h" +#include "src/common/macro_helper.h" + +namespace megdnn { +namespace fallback { + +#define GI_UZP(s0, s1, d0, d1) \ + do { \ + auto tmp__ = GiUzpqFloat32(s0, s1); \ + d0 = tmp__.val[0]; \ + d1 = tmp__.val[1]; \ + } while (0) + +void do_max_pooling_3x3_s2x2_float_gi( + const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, + size_t PH_, size_t PW_, const WorkspaceBundle& ws) { + int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_; + // cache[i] stores the answer of the i-th line after + // pooling along the W dimension. + float* cache[3] = { + static_cast(ws.get(0)), static_cast(ws.get(1)), + static_cast(ws.get(2))}; + float* odd = static_cast(ws.get(3)); + float* even = static_cast(ws.get(4)); + int ih_next = 0; + // "good" area means we can use SIMD to accelerate. + auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) { + // x*2 - P >= 0; 2x >= P; x >= P/2 + O_from = (P + 1) / 2; + // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2 + O_to = (I + P - 3) / 2 + 1; + // we must have I >= 2 to ensure O_from <= O_to + }; + int OW_from, OW_to; + get_good_area(IW, OW, PW, OW_from, OW_to); + auto process_cache = [&](int ih) { + const float* __restrict sptr = src + ih * IW; + auto tmp = cache[2]; + cache[2] = cache[1]; + cache[1] = cache[0]; + cache[0] = tmp; + // cache 0 is used to store the current answer. + auto run_single = [&](int ow) { + int iw = ow * 2 - PW; + float res = std::numeric_limits::lowest(); + if (iw + 0 >= 0 && iw + 0 < IW) { + res = std::max(res, sptr[iw + 0]); + } + if (iw + 1 >= 0 && iw + 1 < IW) { + res = std::max(res, sptr[iw + 1]); + } + if (iw + 2 >= 0 && iw + 2 < IW) { + res = std::max(res, sptr[iw + 2]); + } + cache[0][ow] = res; + }; + // build odd/even + int iw = 0; + int odd_offset = 0, even_offset = 0; + + for (; iw + 2 * 4 <= IW; iw += 2 * 4) { + GI_FLOAT32_t s0, s1, d0, d1; + s0 = GiLoadFloat32(sptr + iw); + s1 = GiLoadFloat32(sptr + iw + 4); + GI_UZP(s0, s1, d0, d1); + GiStoreFloat32(even + even_offset, d0); + GiStoreFloat32(odd + odd_offset, d1); + even_offset += 4; + odd_offset += 4; + } + for (; iw < IW; ++iw) { + if (iw & 1) + odd[odd_offset++] = sptr[iw]; + else + even[even_offset++] = sptr[iw]; + } + int ow = 0; + for (; ow < OW_from; ++ow) + run_single(ow); + if (PW & 1) { + for (; ow + 4 <= OW_to; ow += 4) { + GI_FLOAT32_t d, s0, s1, s2; + s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); + s1 = GiLoadFloat32(even + ow - (PW >> 1)); + s2 = GiLoadFloat32(odd + ow - (PW >> 1)); + d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); + GiStoreFloat32(cache[0] + ow, d); + } + } else { + for (; ow + 4 <= OW_to; ow += 4) { + GI_FLOAT32_t d, s0, s1, s2; + s0 = GiLoadFloat32(even + ow - (PW >> 1)); + s1 = GiLoadFloat32(odd + ow - (PW >> 1)); + s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); + d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); + GiStoreFloat32(cache[0] + ow, d); + } + } + for (; ow < OW; ++ow) + run_single(ow); + }; + for (int oh = 0; oh < OH; ++oh) { + float* __restrict dptr = dst + oh * OW; + int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); + int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3)); + while (ih_next < ih_to) { + process_cache(ih_next++); + } + if (ih_to - ih_from == 3) { + int ow = 0; + for (; ow + 4 <= OW; ow += 4) { + GI_FLOAT32_t d, s0, s1, s2; + s0 = GiLoadFloat32(cache[0] + ow); + s1 = GiLoadFloat32(cache[1] + ow); + s2 = GiLoadFloat32(cache[2] + ow); + d = 
GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); + GiStoreFloat32(dptr + ow, d); + } + for (; ow < OW; ++ow) { + dptr[ow] = std::max(std::max(cache[0][ow], cache[1][ow]), cache[2][ow]); + } + } else { + std::memcpy(dptr, cache[0], sizeof(float) * OW); + for (int i = 1; i < ih_to - ih_from; ++i) { + int ow = 0; + for (; ow + 4 <= OW; ow += 4) { + GI_FLOAT32_t d, s; + s = GiLoadFloat32(cache[i] + ow); + d = GiLoadFloat32(dptr + ow); + d = GiMaximumFloat32(d, s); + GiStoreFloat32(dptr + ow, d); + } + for (; ow < OW; ++ow) { + dptr[ow] = std::max(dptr[ow], cache[i][ow]); + } + } + } + } +} + +} // namespace fallback +} // namespace megdnn diff --git a/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h b/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h new file mode 100644 index 000000000..8ac5719a8 --- /dev/null +++ b/dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h @@ -0,0 +1,26 @@ +/** + * \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/common/utils.h" + +#include "megdnn/arch.h" + +#include "src/fallback/general_intrinsic/gi_float.h" + +namespace megdnn { +namespace fallback { + +void do_max_pooling_3x3_s2x2_float_gi( + const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, + size_t PH_, size_t PW_, const WorkspaceBundle& ws); + +} // namespace fallback +} // namespace megdnn diff --git a/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp b/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp new file mode 100644 index 000000000..1401533b1 --- /dev/null +++ b/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp @@ -0,0 +1,89 @@ +/** + * \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#include "do_max_pooling_w4x4_s2x2.h" +#include "pooling_helper.h" + +namespace megdnn { +namespace fallback { + +void do_max_pooling_w4x4_s2x2_float_gi( + const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH, + const int IW, const int OH, const int OW, const int PH, const int PW) { + const int window = 4; + const int stride = 2; + using Pooler = MaxPooler<16, dt_float32, float, float>; + int oh = 0; + for (; oh < OH && -PH + stride * oh < 0; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } + for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) { + int ow = 0; + for (; ow < OW && -PW + stride * ow < 0; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + dt_float32 last_hf_res = -std::numeric_limits::infinity(); + int ih = -PH + stride * oh, iw = -PW + stride * ow; + if (-PW + stride * ow + window <= IW) { + GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw), + i1 = GiLoadFloat32(src + (ih + 1) * IW + iw), + i2 = GiLoadFloat32(src + (ih + 2) * IW + iw), + i3 = GiLoadFloat32(src + (ih + 3) * IW + iw); + GI_FLOAT32_t sum0 = GiMaximumFloat32( + GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3)); + float32x2_t t = + GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); + dst[oh * OW + ow] = + std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1)); + last_hf_res = GiGetLaneFloat32(t, 1); + ow += 1; + } + for (; ow + 1 < OW && -PW + stride * (ow + 1) + window <= IW; ow += 2) { + iw = -PW + stride * (ow + 1); + GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw), + i1 = GiLoadFloat32(src + (ih + 1) * IW + iw), + i2 = GiLoadFloat32(src + (ih + 2) * IW + iw), + i3 = GiLoadFloat32(src + (ih + 3) * IW + iw); + GI_FLOAT32_t sum0 = GiMaximumFloat32( + GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3)); + float32x2_t t = + GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); + dst[oh * OW + ow + 0] = std::max(GiGetLaneFloat32(t, 0), last_hf_res); + dst[oh * OW + ow + 1] = + std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1)); + last_hf_res = GiGetLaneFloat32(t, 1); + } + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } + for (; oh < OH; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } +} + +} // namespace fallback +} // namespace megdnn +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h b/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h new file mode 100644 index 000000000..83266461f --- /dev/null +++ b/dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h @@ -0,0 +1,24 @@ +/** + * \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "src/fallback/pooling/opr_impl.h" + +namespace megdnn { +namespace fallback { + +void do_max_pooling_w4x4_s2x2_float_gi( + const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH, + const int IW, const int OH, const int OW, const int PH, const int PW); +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h b/dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h new file mode 100644 index 000000000..742148da0 --- /dev/null +++ b/dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h @@ -0,0 +1,306 @@ +/** + * \file dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include +#include "megdnn/opr_param_defs.h" +#include "src/common/unroll_macro.h" +#include "src/fallback/general_intrinsic/gi_float.h" +#include "src/fallback/gi_intrinsic_helper.h" + +namespace megdnn { +namespace fallback { +namespace { + +template < + int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1, + typename T2> +struct CalXsXNchw44 { + static void impl(T1 result, T2 src); +}; + +struct GiD1Qf32 { + static GI_FORCEINLINE GI_FLOAT32_t impl(const float32_t* ptr) { + return GiLoadFloat32(ptr); + } +}; + +template < + int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1, + typename T2> +void calculate_xsx_nchw44(T1 result, T2 src) { + CalXsXNchw44::impl(result, src); +}; + +#define CALCULATE_MAX_CB(step) \ + result[0] = GiMaximumFloat32(result[0], src[0 * stride + step]); \ + result[1] = GiMaximumFloat32(result[1], src[1 * stride + step]); \ + result[2] = GiMaximumFloat32(result[2], src[2 * stride + step]); \ + result[3] = GiMaximumFloat32(result[3], src[3 * stride + step]); + +#define CALCULATE_AVG_CB(step) \ + result[0] = GiAddFloat32(result[0], src[0 * stride + step]); \ + result[1] = GiAddFloat32(result[1], src[1 * stride + step]); \ + result[2] = GiAddFloat32(result[2], src[2 * stride + step]); \ + result[3] = GiAddFloat32(result[3], src[3 * stride + step]); + +#define INSTANCE_CAL(filter) \ + template \ + struct CalXsXNchw44 { \ + static void impl(T1 result, T2 src) { \ + UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \ + } \ + }; \ + template \ + struct CalXsXNchw44 { \ + static void impl(T1 result, T2 src) { \ + UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \ + } \ + }; + +INSTANCE_CAL(2) +INSTANCE_CAL(3) +INSTANCE_CAL(4) +INSTANCE_CAL(5) +INSTANCE_CAL(9) +INSTANCE_CAL(13) + +#undef INSTANCE_CAL +#undef CALCULATE_AVG_CB +#undef CALCULATE_MAX_CB + +template +struct KerPoolingFilterXStrideXNchw44 { + static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw); +}; + +template +struct KerPoolingFilterXStrideXNchw44 { + static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { + constexpr int src_reg_size = ow_step * stride + filter - stride; + constexpr int packed_ic = 4; + constexpr int simd_len = 4; + constexpr float default_float = std::numeric_limits::lowest(); + GI_FLOAT32_t result[ow_step]; + GI_FLOAT32_t src[src_reg_size]; + + result[0] = GiBroadcastFloat32(default_float); + result[1] = GiBroadcastFloat32(default_float); + 
result[2] = GiBroadcastFloat32(default_float); + result[3] = GiBroadcastFloat32(default_float); + + for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { + load_helper( + src, src_ptr + fh_idx * iw * packed_ic, 0); + calculate_xsx_nchw44( + result, src); + } + + GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]); + GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]); + GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]); + GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]); + } +}; + +template +struct KerPoolingFilterXStrideXNchw44< + filter, stride, ow_step, PoolingBase::Mode::AVERAGE> { + static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { + constexpr int src_reg_size = ow_step * stride + filter - stride; + constexpr int packed_ic = 4; + constexpr int simd_len = 4; + constexpr float default_float = 0; + constexpr float div_filter_size = 1.f / (filter * filter); + const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size); + GI_FLOAT32_t result[ow_step]; + GI_FLOAT32_t src[src_reg_size]; + + result[0] = GiBroadcastFloat32(default_float); + result[1] = GiBroadcastFloat32(default_float); + result[2] = GiBroadcastFloat32(default_float); + result[3] = GiBroadcastFloat32(default_float); + + for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { + load_helper( + src, src_ptr + fh_idx * iw * packed_ic, 0); + calculate_xsx_nchw44( + result, src); + } + result[0] = GiMultiplyFloat32(result[0], div_filter_size_vec); + result[1] = GiMultiplyFloat32(result[1], div_filter_size_vec); + result[2] = GiMultiplyFloat32(result[2], div_filter_size_vec); + result[3] = GiMultiplyFloat32(result[3], div_filter_size_vec); + GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]); + GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]); + GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]); + GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]); + } +}; + +template +void ker_pooling_nchw44_remain_pad( + const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, + const int pad_bottom, const int pad_left, const int pad_right, + const int filter); +template <> +void ker_pooling_nchw44_remain_pad( + const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, + const int pad_bottom, const int pad_left, const int pad_right, + const int filter) { + constexpr int ic_step = 4; + const int ih_end = filter - pad_bottom; + const int iw_end = filter - pad_right; + GI_FLOAT32_t result = GiBroadcastFloat32(std::numeric_limits::lowest()); + for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { + for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { + GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step); + result = GiMaximumFloat32(result, src); + } + src_ptr += iw * ic_step; + } + GiStoreFloat32(dst_ptr, result); +} + +template <> +void ker_pooling_nchw44_remain_pad( + const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, + const int pad_bottom, const int pad_left, const int pad_right, + const int filter) { + constexpr int ic_step = 4; + const int ih_end = filter - pad_bottom; + const int iw_end = filter - pad_right; + const float div_filter_size = 1.f / (filter * filter); + const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size); + GI_FLOAT32_t result = GiBroadcastFloat32(0.f); + + for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { + for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { + GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step); + result = 
GiAddFloat32(result, src); + } + src_ptr += iw * ic_step; + } + result = GiMultiplyFloat32(result, div_filter_size_vec); + GiStoreFloat32(dst_ptr, result); +} + +template +static inline void kern_pooling_with_pad_nchw44( + const float32_t* src, float32_t* dst, const int filter, const int ow_start, + const int ow_end, const int iw, const int ow, const int stride_w, const int pw, + const int real_ih_idx, const int oh_idx, const int pad_top, + const int pad_bottom) { + constexpr int ic_step = 4; + constexpr int oc_step = 4; + for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) { + const int iw_idx = ow_idx * stride_w; + const int real_iw_idx = std::max(iw_idx - pw, 0); + const int pad_left = std::max(0, pw - iw_idx); + const int pad_right = std::max(0, iw_idx - pw + filter - iw); + const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step; + const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; + ker_pooling_nchw44_remain_pad( + src + src_offset, dst + dst_offset, iw, pad_top, pad_bottom, pad_left, + pad_right, filter); + } +} + +template +static inline void pooling_fp32_nchw44_pad( + const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph, + int pw) { + constexpr int stride_h = stride; + constexpr int stride_w = stride; + constexpr int ic_step = 4; + constexpr int oc_step = 4; + constexpr int ow_step = 4; + const int ow_pad_left_end = div_ceil(pw, stride_w); + const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w; + const int ow_pad_right_step_end = + (ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step + ow_pad_left_end; + + rep(oh_idx, oh) { + const int ih_idx = oh_idx * stride_h; + const int real_ih_idx = std::max(ih_idx - ph, 0); + const int pad_top = std::max(0, ph - ih_idx); + const int pad_bottom = std::max(0, ih_idx - ph + filter - ih); + if (pad_top > 0 || pad_bottom > 0) { + kern_pooling_with_pad_nchw44( + src, dst, filter, 0, ow, iw, ow, stride_w, pw, real_ih_idx, oh_idx, + pad_top, pad_bottom); + + } else { + kern_pooling_with_pad_nchw44( + src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw, + real_ih_idx, oh_idx, pad_top, pad_bottom); + for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end; + ow_idx += ow_step) { + const int iw_idx = ow_idx * stride_w; + const int real_iw_idx = std::max(iw_idx - pw, 0); + const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step; + const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; + KerPoolingFilterXStrideXNchw44::impl( + src + src_offset, dst + dst_offset, iw); + } + kern_pooling_with_pad_nchw44( + src, dst, filter, ow_pad_right_step_end, ow, iw, ow, stride_w, pw, + real_ih_idx, oh_idx, pad_top, pad_bottom); + } + } +} + +template +static inline void pooling_fp32_nchw44_no_pad( + const float32_t* src, float32_t* dst, int, int iw, int oh, int ow) { + constexpr int stride_h = stride; + constexpr int stride_w = stride; + constexpr int ic_step = 4; + constexpr int oc_step = 4; + constexpr int ow_step = 4; + const int ow_end = ow / ow_step * ow_step; + const int ow_remain = ow - ow_end; + + rep(oh_idx, oh) { + const int ih_idx = oh_idx * stride_h; + const int src_ih_offset = ih_idx * iw; + const int dst_oh_offset = oh_idx * ow; + for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { + const int iw_idx = ow_idx * stride_w; + const int src_offset = (src_ih_offset + iw_idx) * ic_step; + const int dst_offset = (dst_oh_offset + ow_idx) * oc_step; + KerPoolingFilterXStrideXNchw44::impl( + src + src_offset, dst + dst_offset, iw); + } + if (ow_remain > 0) { + 
kern_pooling_with_pad_nchw44( + src, dst, filter, ow_end, ow, iw, ow, stride_w, 0, ih_idx, oh_idx, + 0, 0); + } + } +} + +template +static inline void pooling_fp32_nchw44( + const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph, + int pw) { + if (ph > 0 || pw > 0) { + pooling_fp32_nchw44_pad(src, dst, ih, iw, oh, ow, ph, pw); + } else { + pooling_fp32_nchw44_no_pad(src, dst, ih, iw, oh, ow); + } +} + +} // namespace +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/gi/pooling_helper.h b/dnn/src/fallback/pooling/gi/pooling_helper.h new file mode 100644 index 000000000..0be6942dc --- /dev/null +++ b/dnn/src/fallback/pooling/gi/pooling_helper.h @@ -0,0 +1,572 @@ +/** + * \file dnn/src/fallback/pooling/gi/pooling_helper.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "do_max_pooling_3x3_s2x2_float.h" +#include "megdnn/dtype.h" +#include "src/common/unroll_macro.h" +#include "src/common/utils.h" + +namespace { + +/* ======================= MeanPooler ======================== */ +using namespace megdnn; +/** + * \brief Mean mode for pooling + * \tparam area the pooling area size, FH * FW + * \tparam dtype the input type + * \tparam ctype the inner raw type + * \tparam comp_type compute type + */ +template +struct MeanPoolerCommon { + //! the gi imp register size is 16 bytes(128 bits) + static constexpr int SIMD_WIDTH = 16 / sizeof(ctype); + static constexpr comp_type coef = static_cast(1.0f) / area; + comp_type res; + MeanPoolerCommon() : res(0) {} + void feed(const ctype* val) { res += *val; } +}; +template +constexpr comp_type MeanPoolerCommon::coef; + +template +struct MeanInPooler : MeanPoolerCommon { + using ctype = _ctype; + //! 
`MIDOUT_CASE_NUM` is a unique int id + static constexpr int MIDOUT_CASE_NUM = 1; + MeanInPooler(DType) : MeanPoolerCommon() {} + void post(ctype* dst) { + this->res *= this->coef; + *dst = this->res; + } +}; + +template +struct MeanInRoundPooler : MeanPoolerCommon { + using ctype = _ctype; + void post(ctype* dst) { + this->res *= this->coef; + *dst = std::round(this->res); + } +}; + +template +struct GiMeanPooler; + +template +struct GiMeanPooler { + using ctype = float; + static constexpr int MIDOUT_CASE_NUM = 1; + static constexpr int SIMD_WIDTH = 4; + + static const GI_FLOAT32_t coef; + GI_FLOAT32_t res; + GiMeanPooler(DType) : res(GiBroadcastFloat32(0.0f)) {} + void feed(const float* val) { res = GiAddFloat32(res, GiLoadFloat32(val)); } + void post(float* dst) { + res = GiMultiplyFloat32(res, coef); + GiStoreFloat32(dst, res); + } +}; +template +const GI_FLOAT32_t GiMeanPooler::coef = + GiBroadcastFloat32(1.0f / area); + +/* ======================= MaxPooler ======================== */ + +template +struct MaxPooler { + using ctype = _ctype; + static constexpr int MIDOUT_CASE_NUM = 11; + static constexpr int SIMD_WIDTH = 16 / sizeof(ctype); + + static const ctype outsider; + ctype res; + MaxPooler(DType) : res(DTypeTrait::min()) {} + void feed(const ctype* val) { res = std::max(res, *val); } + void post(ctype* dst) { *dst = res; } +}; +template +const ctype MaxPooler::outsider = + DTypeTrait::min(); + +template +struct GiMaxPooler; + +template +struct GiMaxPooler { + using ctype = float; + static constexpr int MIDOUT_CASE_NUM = 11; + static constexpr int SIMD_WIDTH = 4; + + GI_FLOAT32_t res; + GiMaxPooler(DType) : res(GiBroadcastFloat32(DTypeTrait::min())) {} + void feed(const float* val) { res = GiMaximumFloat32(res, GiLoadFloat32(val)); } + void post(float* dst) { GiStoreFloat32(dst, res); } +}; + +template +void do_pxl_naive( + int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst, + DType src_dtype, const int IH, const int IW, const int OH, const int OW, + const int PH, const int PW, const int SH, const int SW) { + MEGDNN_MARK_USED_VAR(OH); + Pooler pooler(src_dtype); + rep(wh, window) rep(ww, window) { + int ih = -PH + oh * SH + wh; + int iw = -PW + ow * SW + ww; + if (ih >= 0 && iw >= 0 && ih < IH && iw < IW) { + pooler.feed(src + ih * IW + iw); + } + } + pooler.post(dst + oh * OW + ow); +} + +namespace detail { + +template +struct do_pxl_2x2_pack_proxy { + static void gao( + int oh, int ow, const typename Pooler::ctype* src, + typename Pooler::ctype* dst, DType, const int IH, const int IW, + const int OH, const int OW, const int PH, const int PW); +}; + +template <> +struct do_pxl_2x2_pack_proxy< + MeanInPooler<4, dt_float32, float, float>, Pooling::Mode::AVERAGE> { + static void gao( + int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH, + const int IW, const int OH, const int OW, const int PH, const int PW) { + MEGDNN_MARK_USED_VAR(IH); + MEGDNN_MARK_USED_VAR(OH); + static const auto avg_coef = GiBroadcastFloat32(0.25f); + int ih = -PH + 2 * oh; + int iw = -PW + 2 * ow; + auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)), + i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)), + i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)), + i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4)); + auto sum0 = GiAddFloat32(i00, i10), sum1 = GiAddFloat32(i01, i11); + auto vlow = GiPaddFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); + auto vhigh = GiPaddFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1)); + auto comb = 
GiCombineFloat32(vlow, vhigh); + auto result = GiMultiplyFloat32(comb, avg_coef); + GiStoreFloat32(dst + oh * OW + ow, result); + } +}; + +template <> +struct do_pxl_2x2_pack_proxy< + MaxPooler<4, dt_float32, float, float>, Pooling::Mode::MAX> { + static void gao( + int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH, + const int IW, const int OH, const int OW, const int PH, const int PW) { + MEGDNN_MARK_USED_VAR(IH); + MEGDNN_MARK_USED_VAR(OH); + int ih = -PH + 2 * oh; + int iw = -PW + 2 * ow; + auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)), + i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)), + i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)), + i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4)); + auto sum0 = GiMaximumFloat32(i00, i10), sum1 = GiMaximumFloat32(i01, i11); + auto vlow = GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); + auto vhigh = GiPmaxFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1)); + auto comb = GiCombineFloat32(vlow, vhigh); + GiStoreFloat32(dst + oh * OW + ow, comb); + } +}; + +} // namespace detail + +template +void do_pxl_2x2_pack( + int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst, + DType src_dtype, const int IH, const int IW, const int OH, const int OW, + const int PH, const int PW) { + detail::do_pxl_2x2_pack_proxy::gao( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); +} + +template +void do_pxl_compact_packed( + int oh, int ow, const typename GiPooler::ctype* src, + typename GiPooler::ctype* dst, DType src_dtype, const int IH, const int IW, + const int OH, const int OW, const int PH, const int PW) { + MEGDNN_MARK_USED_VAR(IH); + MEGDNN_MARK_USED_VAR(OH); + GiPooler pooler(src_dtype); + rep(wh, window) rep(ww, window) { + int ih = -PH + oh + wh; + int iw = -PW + ow + ww; + pooler.feed(src + ih * IW + iw); + } + pooler.post(dst + oh * OW + ow); +} + +template +void do_pooling_compact( + const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype, + const int IH, const int IW, const int OH, const int OW, const int PH, + const int PW) { + static_assert( + std::is_same::value, + "ctype of Pooler and GiPooler is not the same"); + const int stride = 1; + int oh = 0; + for (; oh < OH && oh - PH < 0; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } + for (; oh < OH && oh - PH + window <= IH; ++oh) { + int ow = 0; + for (; ow < OW && ow - PW < 0; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + for (; ow + GiPooler::SIMD_WIDTH <= OW && + ow + GiPooler::SIMD_WIDTH - 1 - PW + window <= IW; + ow += GiPooler::SIMD_WIDTH) { + do_pxl_compact_packed( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); + } + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } + for (; oh < OH; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } +} + +template +void do_pooling_2x2( + const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype, + const int IH, const int IW, const int OH, const int OW, const int PH, + const int PW) { + const int window = 2; + const int stride = 2; + int oh = 0; + for (; oh < OH && -PH + stride * oh < 0; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, 
PH, PW, stride, + stride); + } + } + for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) { + int ow = 0; + for (; ow < OW && -PW + stride * ow < 0; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + for (; ow + Pooler::SIMD_WIDTH <= OW && + -PW + stride * (ow + Pooler::SIMD_WIDTH - 1) + window <= IW; + ow += Pooler::SIMD_WIDTH) { + do_pxl_2x2_pack( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); + } + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } + for (; oh < OH; ++oh) { + int ow = 0; + for (; ow < OW; ++ow) { + do_pxl_naive( + oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, + stride); + } + } +} + +template +void do_max_pooling_w5x5_s2x2_gi( + const ctype* src, ctype* dst, const int IH, const int IW, const int OH, + const int OW, const int PH, const int PW, const WorkspaceBundle& ws, + const int MEGDNN_SIMD_WIDTH) { + ctype* cache[5] = { + static_cast(ws.get(0)), static_cast(ws.get(1)), + static_cast(ws.get(2)), static_cast(ws.get(3)), + static_cast(ws.get(4))}; + ctype* odd = static_cast(ws.get(5)); + ctype* even = static_cast(ws.get(6)); + int ih_next = 0; + int OW_from = (PW + 1) / 2, OW_to = (IW + PW - 5) / 2 + 1; + auto process_cache = [&](int ih) { + const ctype* __restrict sptr = src + ih * IW; + auto tmp = cache[4]; + for (auto i = 4; i >= 1; --i) + cache[i] = cache[i - 1]; + cache[0] = tmp; + auto run_single = [&](int ow) { + int iw = ow * 2 - PW; + ctype res = std::numeric_limits::lowest(); + for (auto i = 0; i < 5; ++i) + if (iw + i >= 0 && iw + i < IW) + res = std::max(res, sptr[iw + i]); + cache[0][ow] = res; + }; + int iw = 0; + int odd_offset = 0, even_offset = 0; + for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(sptr + iw + 0); + auto s1 = GiLoadFloat32(sptr + iw + MEGDNN_SIMD_WIDTH); + auto d = GiUzpqFloat32(s0, s1); + GiStoreFloat32(even + even_offset, d.val[0]); + GiStoreFloat32(odd + odd_offset, d.val[1]); + even_offset += MEGDNN_SIMD_WIDTH; + odd_offset += MEGDNN_SIMD_WIDTH; + } + for (; iw < IW; ++iw) { + if (iw & 1) + odd[odd_offset++] = sptr[iw]; + else + even[even_offset++] = sptr[iw]; + } + int ow = 0; + for (; ow < OW_from; ++ow) + run_single(ow); + if (PW & 1) { + for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); + auto s1 = GiLoadFloat32(even + ow - (PW >> 1)); + auto s2 = GiLoadFloat32(odd + ow - (PW >> 1)); + auto s3 = GiLoadFloat32(even + ow - (PW >> 1) + 1); + auto s4 = GiLoadFloat32(odd + ow - (PW >> 1) + 1); + auto d = GiMaximumFloat32( + s0, + GiMaximumFloat32( + GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); + GiStoreFloat32(cache[0] + ow, d); + } + } else { + for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(even + ow - (PW >> 1)); + auto s1 = GiLoadFloat32(odd + ow - (PW >> 1)); + auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); + auto s3 = GiLoadFloat32(odd + ow - (PW >> 1) + 1); + auto s4 = GiLoadFloat32(even + ow - (PW >> 1) + 2); + auto d = GiMaximumFloat32( + s0, + GiMaximumFloat32( + GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); + GiStoreFloat32(cache[0] + ow, d); + } + } + for (; ow < OW; ++ow) + run_single(ow); + }; + + for (int oh = 0; oh < OH; ++oh) { + ctype* __restrict dptr = dst + oh * OW; + int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); + int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 5)); + 
while (ih_next < ih_to) + process_cache(ih_next++); + if (ih_to - ih_from == 5) { + int ow = 0; + for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(cache[0] + ow); + auto s1 = GiLoadFloat32(cache[1] + ow); + auto s2 = GiLoadFloat32(cache[2] + ow); + auto s3 = GiLoadFloat32(cache[3] + ow); + auto s4 = GiLoadFloat32(cache[4] + ow); + auto d = GiMaximumFloat32( + s0, + GiMaximumFloat32( + GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); + GiStoreFloat32(dptr + ow, d); + } + for (; ow < OW; ++ow) + dptr[ow] = std::max( + {cache[0][ow], cache[1][ow], cache[2][ow], cache[3][ow], + cache[4][ow]}); + } else { + std::memcpy(dptr, cache[0], sizeof(ctype) * OW); + for (int i = 1; i < ih_to - ih_from; ++i) { + int ow = 0; + for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + auto s = GiLoadFloat32(cache[i] + ow); + auto d = GiLoadFloat32(dptr + ow); + d = GiMaximumFloat32(d, s); + GiStoreFloat32(dptr + ow, d); + } + for (; ow < OW; ++ow) + dptr[ow] = std::max(dptr[ow], cache[i][ow]); + } + } + } +} + +template +void do_average_pooling_3x3_s2x2_gi( + const ctype* src, ctype* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, + size_t PH_, size_t PW_, const WorkspaceBundle& ws, + const int MEGDNN_SIMD_WIDTH) { + int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_; + // cache[i] stores the answer of the i-th line after + // pooling along the W dimension. + ctype* cache[3] = { + static_cast(ws.get(0)), static_cast(ws.get(1)), + static_cast(ws.get(2))}; + ctype* odd = static_cast(ws.get(3)); + ctype* even = static_cast(ws.get(4)); + int ih_next = 0; + // "good" area means we can use SIMD to accelerate. + auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) { + // x*2 - P >= 0; 2x >= P; x >= P/2 + O_from = (P + 1) / 2; + // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2 + O_to = (I + P - 3) / 2 + 1; + // we must have I >= 2 to ensure O_from <= O_to + }; + int OW_from, OW_to; + get_good_area(IW, OW, PW, OW_from, OW_to); + auto process_cache = [&](int ih) { + const ctype* __restrict sptr = src + ih * IW; + auto tmp = cache[2]; + cache[2] = cache[1]; + cache[1] = cache[0]; + cache[0] = tmp; + // cache 0 is used to store the current answer. 
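+        // run_single is the scalar path for border columns: each of the three
+        // taps is bounds-checked against [0, IW). The vectorized loops below
+        // only cover the interior [OW_from, OW_to) "good area", reading from
+        // the de-interleaved even/odd column buffers prepared below.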
+ auto run_single = [&](int ow) { + int iw = ow * 2 - PW; + ctype res = 0; + if (iw + 0 >= 0 && iw + 0 < IW) { + res += sptr[iw + 0]; + } + if (iw + 1 >= 0 && iw + 1 < IW) { + res += sptr[iw + 1]; + } + if (iw + 2 >= 0 && iw + 2 < IW) { + res += sptr[iw + 2]; + } + cache[0][ow] = res; + }; + // build odd/even + int iw = 0; + int odd_offset = 0, even_offset = 0; + + for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) { + auto s0 = GiLd2qFloat32(sptr + iw); + GiStoreFloat32(even + even_offset, s0.val[0]); + GiStoreFloat32(odd + odd_offset, s0.val[1]); + even_offset += MEGDNN_SIMD_WIDTH; + odd_offset += MEGDNN_SIMD_WIDTH; + } + for (; iw < IW; ++iw) { + if (iw & 1) + odd[odd_offset++] = sptr[iw]; + else + even[even_offset++] = sptr[iw]; + } + int ow = 0; + for (; ow < OW_from; ++ow) + run_single(ow); + if (PW & 1) { + for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); + auto s1 = GiLoadFloat32(even + ow - (PW >> 1)); + auto s2 = GiLoadFloat32(odd + ow - (PW >> 1)); + auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); + GiStoreFloat32(cache[0] + ow, d); + } + } else { + for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(even + ow - (PW >> 1)); + auto s1 = GiLoadFloat32(odd + ow - (PW >> 1)); + auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); + auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); + GiStoreFloat32(cache[0] + ow, d); + } + } + for (; ow < OW; ++ow) + run_single(ow); + }; + for (int oh = 0; oh < OH; ++oh) { + ctype* __restrict dptr = dst + oh * OW; + int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); + int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3)); + while (ih_next < ih_to) { + process_cache(ih_next++); + } + ctype factor = (1.0f / 9); + auto coef = GiBroadcastFloat32(factor); + if (ih_to - ih_from == 3) { + int ow = 0; + for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + auto s0 = GiLoadFloat32(cache[0] + ow); + auto s1 = GiLoadFloat32(cache[1] + ow); + auto s2 = GiLoadFloat32(cache[2] + ow); + auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); + d = GiMultiplyFloat32(d, coef); + GiStoreFloat32(dptr + ow, d); + } +#if MEGDNN_FIX_AARCH32_BUG +// FIXME: as llvm may cause cannot select error if enable vectorize +#pragma clang loop vectorize(disable) +#endif + for (; ow < OW; ++ow) { + dptr[ow] = (cache[0][ow] + cache[1][ow] + cache[2][ow]) * factor; + } + } else { + std::memcpy(dptr, cache[0], sizeof(ctype) * OW); + int i = 1; + for (; i < ih_to - ih_from; ++i) { + int ow = 0; + for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + auto s = GiLoadFloat32(cache[i] + ow); + auto d = GiLoadFloat32(dptr + ow); + d = GiAddFloat32(d, s); + GiStoreFloat32(dptr + ow, d); + } + for (; ow < OW; ++ow) { + dptr[ow] = (dptr[ow] + cache[i][ow]); + } + } + int ow = 0; + for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { + auto d = GiLoadFloat32(dptr + ow); + d = GiMultiplyFloat32(d, coef); + GiStoreFloat32(dptr + ow, d); + } +#if MEGDNN_FIX_AARCH32_BUG +// FIXME: as llvm may cause cannot select error if enable vectorize +#pragma clang loop vectorize(disable) +#endif + for (; ow < OW; ++ow) { + dptr[ow] *= factor; + } + } + } +} +} // anonymous namespace + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/opr_impl.cpp b/dnn/src/fallback/pooling/opr_impl.cpp index 3e9f85245..8a43db461 100644 --- a/dnn/src/fallback/pooling/opr_impl.cpp +++ b/dnn/src/fallback/pooling/opr_impl.cpp @@ -6,18 +6,186 @@ * * Unless 
required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/fallback/pooling/opr_impl.h" - -#include -#include "src/common/utils.h" -#include "src/naive/handle.h" +#include "src/common/algo_chooser.h" +#include "src/common/metahelper.h" +#include "src/fallback/pooling/gi/algo.h" #include "midout.h" MIDOUT_DECL(megdnn_fallback_pooling) +using namespace megdnn; +using namespace fallback; + +class PoolingImpl::AlgoPack : NonCopyableObj { +private: + AlgoBase::Mapper m_all_algos_map; + AlgoGiFilterxModexStride1 algo_gi_filterx_modex_stride1; + AlgoGiFilter2ModexStride2 algo_gi_filter2_modex_stride2; + AlgoGiFilter3MaxStride2 algo_gi_filter3_max_stride2; + AlgoGiFilter3AverageStride2 algo_gi_filter3_average_stride2; + AlgoGiFilter4MaxStride2 algo_gi_filter4_max_stride2; + AlgoGiFilter5MaxStride2 algo_gi_filter5_max_stride2; + AlgoGiFp32ModexStridexNCHW44 algo_gi_fp32_modex_stridex_nchw44; + AlgoFallback algo_fallback; + +public: + AlgoPack() { + all_algos.emplace_back(&algo_gi_filterx_modex_stride1); + all_algos.emplace_back(&algo_gi_filter2_modex_stride2); + all_algos.emplace_back(&algo_gi_filter3_max_stride2); + all_algos.emplace_back(&algo_gi_filter3_average_stride2); + all_algos.emplace_back(&algo_gi_filter4_max_stride2); + all_algos.emplace_back(&algo_gi_filter5_max_stride2); + all_algos.emplace_back(&algo_gi_fp32_modex_stridex_nchw44); + all_algos.emplace_back(&algo_fallback); + + for (auto&& algo : all_algos) { + m_all_algos_map.emplace(algo->info().desc, algo); + } + } + SmallVector all_algos; + const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } +}; + +PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; + +PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param( + fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) { + auto safe_u32 = [](size_t v) -> uint32_t { + megdnn_assert( + v <= std::numeric_limits::max(), "value too large: %zu", v); + return v; + }; + return {safe_u32(src.shape[0]), + safe_u32(src.shape[1]), + {{safe_u32(src.shape[2]), safe_u32(src.shape[3])}}, + {{safe_u32(dst.shape[2]), safe_u32(dst.shape[3])}}, + {{safe_u32(opr->param().pad_h), safe_u32(opr->param().pad_w)}}, + {{safe_u32(opr->param().window_h), safe_u32(opr->param().window_w)}}, + {{safe_u32(opr->param().stride_h), safe_u32(opr->param().stride_w)}}, + src.dtype, + dst.dtype, + opr->handle(), + opr->param().format, + opr->param().mode}; +}; + +PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( + fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace) { + PoolingKernParam ret; + static_cast(ret) = + make_pooling_kern_szie_param(opr, src.layout, dst.layout); + ret.src_ptr = src.get_ref_ptr(); + ret.dst_ptr = dst.get_ref_ptr(); + ret.workspace_ptr = workspace.raw_ptr; + ret.workspace_size = workspace.size; + return ret; +}; + +MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl); + +std::vector PoolingImpl::get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst) { + auto param = make_pooling_kern_szie_param(this, src, dst); + std::vector ret; + ret.reserve(algo_pack().all_algos.size()); + for (auto i : algo_pack().all_algos) { + if (i->usable(param)) { + ret.push_back(i); + } + } + return ret; +} + +size_t 
PoolingImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) { + TensorLayoutArray layouts{src, dst}; + AlgorithmCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), + &this->param(), sizeof(this->param())}; + auto rst = AlgorithmCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + + auto param = make_pooling_kern_szie_param(this, src, dst); + auto algo = static_cast(fallback::PoolingImpl::get_algorithm_heuristic( + src, dst, std::numeric_limits::max(), AlgoAttribute::DEFAULT, + AlgoAttribute::DEFAULT)); + if (!is_fallback_non_gi_algo(algo)) { + size_t fallback_gi_workspace = 0; + + //! When multi-thread, every thread has its own workspace + size_t nr_threads = static_cast(handle()) + ->megcore_dispatcher() + ->nr_threads(); + if (param.src_type.category() == DTypeCategory::FLOAT && + param.filter[0] == param.filter[1] && + (param.filter[0] == 3 || param.filter[0] == 5) && + param.format == Param::Format::NCHW && + (param.mode == Mode::MAX || + (param.mode == Mode::AVERAGE && param.filter[0] == 3)) && + param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && + param.isz[1] >= 2) { + WorkspaceBundle ws = get_bundle(param); + fallback_gi_workspace = ws.total_size_in_bytes() * nr_threads; + } + + return fallback_gi_workspace; + } else { + auto naive_worksapce = + naive::PoolingForwardImpl::get_workspace_in_bytes(src, dst); + return naive_worksapce; + } +} + +void PoolingImpl::exec( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, dst.layout, workspace.size); + auto param = make_pooling_kern_param(this, src, dst, workspace); + auto algo = static_cast(fallback::PoolingImpl::get_algorithm_heuristic( + src.layout, dst.layout, std::numeric_limits::max(), + AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT)); + if (!is_fallback_non_gi_algo(algo)) { + algo->exec(param); + } else { + exec_fallback(src, dst, workspace); + } +} + +std::vector PoolingImpl::get_all_algorithms_safe( + const TensorLayout& src, const TensorLayout& dst) { + auto ret_safe = get_all_algorithms(src, dst); + megdnn_assert(!ret_safe.empty(), "no usable pooling fwd algorithm"); + return ret_safe; +} + +Algorithm* PoolingImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); + + auto param = make_pooling_kern_szie_param(this, src, dst); + for (auto&& iter : sm_algo_pack.all_algos) { + if (iter->is_available_attribute(param, positive_attr, negative_attr)) { + return iter; + } + } + megdnn_throw(ssprintf( + "require algorithm with attribute(%s) and without " + "attribute(%s), but can't get suitable algo.\n", + Algorithm::attribute_str(positive_attr).c_str(), + Algorithm::attribute_str(negative_attr).c_str())); + return nullptr; +} +//! 
fallback not gi imp namespace megdnn { namespace fallback { namespace pooling { @@ -140,9 +308,6 @@ void w2x2_s2x2_avg_int8( } // namespace fallback } // namespace megdnn -namespace megdnn { -namespace fallback { - void PoolingImpl::exec_w3x3_s1x1( _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param) { auto N = src.layout.shape[0], C = src.layout.shape[1]; @@ -179,7 +344,7 @@ void PoolingImpl::exec_w2x2_s2x2_avg_int8( } } -void PoolingImpl::exec( +void PoolingImpl::exec_fallback( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { Param param = this->param(); check_exec(src.layout, dst.layout, workspace.size); @@ -219,7 +384,4 @@ void PoolingImpl::exec( naive::PoolingForwardImpl::exec(src, dst, workspace); } -} // namespace fallback -} // namespace megdnn - // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/pooling/opr_impl.h b/dnn/src/fallback/pooling/opr_impl.h index a44277d8a..de0d12ffb 100644 --- a/dnn/src/fallback/pooling/opr_impl.h +++ b/dnn/src/fallback/pooling/opr_impl.h @@ -10,6 +10,7 @@ * implied. */ #pragma once +#include #include "megdnn/oprs/base.h" #include "src/naive/pooling/opr_impl.h" @@ -17,19 +18,143 @@ namespace megdnn { namespace fallback { class PoolingImpl : public naive::PoolingForwardImpl { +private: + class AlgoGiFilterxModexStride1; + class AlgoGiFilter2ModexStride2; + class AlgoGiFilter3MaxStride2; + class AlgoGiFilter3AverageStride2; + class AlgoGiFilter4MaxStride2; + class AlgoGiFilter5MaxStride2; + class AlgoGiFp32ModexStridexNCHW44; + class AlgoFallback; + class AlgoPack; + static AlgoPack sm_algo_pack; + + void exec_w3x3_s1x1( + _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param); + void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); + void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); + public: using naive::PoolingForwardImpl::PoolingForwardImpl; using Param = param::Pooling; + void exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) override; -private: - void exec_w3x3_s1x1( - _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param); - void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); - void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); + void exec_fallback( + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace); + + size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override; + + static size_t constexpr MAX_SPATIAL_DIM = 2; + + struct PoolingKernSizeParam { + uint32_t n, ic; + std::array isz, osz; + std::array padding, filter, stride; + DType src_type, dst_type; + Handle* handle; + Param::Format format; + Mode mode; + }; + + struct PoolingKernParam : public PoolingKernSizeParam { + RefPtr src_ptr; + RefPtr dst_ptr; + void* workspace_ptr; + size_t workspace_size; + + template + const T* src() const { + src_type.assert_is_compatible_ctype(); + return static_cast(src_ptr.get_ptr()); + } + + template + T* dst() const { + dst_type.assert_is_compatible_ctype(); + return static_cast(dst_ptr.get_ptr()); + } + + template + T* workspace() const { + return static_cast(workspace_ptr); + } + }; + + PoolingKernSizeParam make_pooling_kern_szie_param( + fallback::PoolingImpl* opr, const TensorLayout& src, + const TensorLayout& dst); + + PoolingKernParam make_pooling_kern_param( + fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, + _megdnn_workspace workspace); + class AlgoBase : public detail::Algorithm { + 
public: + enum class AlgoType : uint32_t { + GI_FilterxModexStride1, + GI_Filter2ModexStride2, + GI_Filter3MaxStride2, + GI_Filter3AverageStride2, + GI_Filter4MaxStride2, + GI_Filter5MaxStride2, + GI_Filter2ModexStridexNCHW44, + GI_Filter3ModexStridexNCHW44, + GI_Filter4ModexStridexNCHW44, + GI_Filter5ModexStridexNCHW44, + GI_Fp32ModexStridexNCHW44, + FallbackNotGI + }; + + using Mapper = std::unordered_map; + AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; } + virtual ~AlgoBase() = default; + virtual bool usable(const PoolingKernSizeParam& param) const = 0; + virtual void exec(const PoolingKernParam& param) const = 0; + + uint32_t type() const override { return INVALID_ALGO_TYPE; }; + bool is_available_attribute( + const PoolingKernSizeParam& param, + const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE, + const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { + return contain_attribute_all(positive_attr) && + !contain_attribute_any(negative_attr) && usable(param); + } + }; + + const char* get_algorithm_set_name() const override { + return "FALLBACK_POOLING_FORWARD"; + } + + Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; + + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst) override; + std::vector get_all_algorithms_safe( + const TensorLayout& src, const TensorLayout& dst) override; + + Algorithm* get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) override; + + AlgorithmInfo get_algorithm_info_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + return fallback::PoolingImpl::get_algorithm_heuristic( + src, dst, workspace_limit_in_bytes, positive_attr, negative_attr) + ->info(); + } + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + bool is_fallback_non_gi_algo(Algorithm* algo) { + return strcmp(algo->name(), "FALLBACK_NOT_GI_POOLING") == 0; + } }; } // namespace fallback } // namespace megdnn + // vim: syntax=cpp.doxygen diff --git a/dnn/src/x86/pooling/algo.h b/dnn/src/x86/pooling/algo.h index a24de56a8..a893c65e1 100644 --- a/dnn/src/x86/pooling/algo.h +++ b/dnn/src/x86/pooling/algo.h @@ -103,7 +103,9 @@ public: AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; const char* name() const override { return m_algo_name.c_str(); } bool is_available(const SizeArgs&) const override { return true; } - void exec(const ExecArgs&) const override {} + void exec(const ExecArgs&) const override { + megdnn_assert(false, "code issue happened!!"); + } MEGDNN_DECL_ALGO_TYPE(X86_Fallback) }; diff --git a/dnn/test/fallback/gi.cpp b/dnn/test/fallback/gi.cpp index d49e1685a..4a431912c 100644 --- a/dnn/test/fallback/gi.cpp +++ b/dnn/test/fallback/gi.cpp @@ -3161,6 +3161,44 @@ TEST_F(FALLBACK, GiGetHighFloat32) { ASSERT_EQ(*(r + 1), s0[3]); } +TEST_F(FALLBACK, GiPaddFloat32) { + float32x2_t src0, src1, ret; + std::vector s0{1.1f, -3.1415f}; + std::vector s1{2.3f, 3.14777f}; + memcpy(&src0, s0.data(), sizeof(float32x2_t)); + memcpy(&src1, s1.data(), sizeof(float32x2_t)); + + ret = GiPaddFloat32(src0, src1); + + std::vector naive; + naive.push_back(s0[0] + s0[1]); + naive.push_back(s1[0] + s1[1]); + + auto r = (float*)&ret; + ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3); + ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3); 
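+
+    // GiPaddFloat32 is a pairwise add across lanes: {s0[0] + s0[1], s1[0] + s1[1]};
+    // the 2x2 average pooling kernel uses it to reduce adjacent column sums.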
+} + +TEST_F(FALLBACK, GiPmaxFloat32) { + float32x2_t src0, src1, ret; + std::vector s0{1.1f, -3.1415f}; + std::vector s1{2.3f, 3.14777f}; + memcpy(&src0, s0.data(), sizeof(float32x2_t)); + memcpy(&src1, s1.data(), sizeof(float32x2_t)); + + ret = GiPmaxFloat32(src0, src1); + + std::vector naive; + auto t0 = MAX_NAN(s0[0], s0[1]); + auto t1 = MAX_NAN(s1[0], s1[1]); + naive.push_back(t0); + naive.push_back(t1); + + auto r = (float*)&ret; + ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3); + ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3); +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/fallback/pooling.cpp b/dnn/test/fallback/pooling.cpp new file mode 100644 index 000000000..e4b213ba4 --- /dev/null +++ b/dnn/test/fallback/pooling.cpp @@ -0,0 +1,560 @@ +/** + * \file dnn/test/fallback/pooling.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ +#include "test/fallback/fixture.h" + +#include "test/common/benchmarker.h" +#include "test/common/checker.h" +#include "test/common/pooling.h" +#include "test/common/rng.h" +#include "test/common/task_record_check.h" + +namespace megdnn { +namespace test { + +namespace { +std::vector> get_nchw44_pool_args( + size_t filter, size_t stride) { + constexpr size_t ic_step = 4; + std::vector> args; + + for (size_t n : {1, 2}) + for (size_t c : {4, 8}) + for (size_t ih : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) + for (size_t iw : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) + for (size_t ph : {0, 1, 2}) + for (size_t pw : {0, 1, 2}) + for (auto mode : + {param::Pooling::Mode::MAX, + param::Pooling::Mode::AVERAGE}) + if (ih + 2 * ph >= filter && iw + 2 * pw >= filter && + filter > ph && filter > pw) { + param::Pooling param; + param.mode = mode; + param.format = param::Pooling::Format::NCHW44; + param.pad_h = ph; + param.pad_w = pw; + param.stride_h = param.stride_w = stride; + param.window_h = param.window_w = filter; + args.emplace_back(std::make_pair( + param, + TensorShapeArray{ + {n, c / ic_step, ih, iw, ic_step}, + {}})); + } + return args; +} + +void run_pooling_check( + Handle* handle, std::vector> args, + bool is_int8) { + Checker checker(handle); + UniformIntRNG rng_int8{INT8_MIN >> 1, INT8_MAX >> 1}; + UniformIntRNG rng_fp32{-10, 10}; + if (is_int8) { + checker.set_dtype(0, dtype::QuantizedS8(1.1f)); + checker.set_rng(0, &rng_int8); + } else { + checker.set_rng(0, &rng_fp32); + } + for (auto arg : args) { + checker.set_param(arg.first).exec(arg.second); + } +} +} // namespace + +TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_NCHW44_FP32) { + for (auto filter : {2, 3, 4, 5}) + for (auto stride : {1, 2}) { + run_pooling_check(handle(), get_nchw44_pool_args(filter, stride), false); + } +} + +TEST_F(FALLBACK, POOLING_GI) { + using Param = param::Pooling; + // clang-format off + for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + 
param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + // clang-format on +} + +TEST_F(FALLBACK, POOLING_GI_RECORD) { + using Param = param::Pooling; + TaskRecordChecker checker(0); + // clang-format off + for (size_t ih: {2, 3, 5, 7, 11, 13, 17}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p: {1, 2}) + { + Param param; + param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } + // clang-format on +} + +TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_RECORD) { + using Param = param::Pooling; + TaskRecordChecker checker(0); + for (size_t ih : {2, 3, 5, 7, 11, 13, 17}) + for (size_t iw : {2, 3, 5, 7, 11, 13, 17}) + for (size_t p : {1, 2}) { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + 
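+                // the 5x5-window case below only runs when the padded input is
+                // at least 5 in each spatial dim, matching the guard that follows.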
+ param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } +} + +TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_W9_w13_NCHW44) { + UniformIntRNG rng{-10, 10}; + Checker checker(handle()); + checker.set_rng(0, &rng); + // clang-format off + for (size_t ih: {20, 15}) + for (size_t iw: {15, 20}) + for (size_t kernel: {9, 13}) + for (size_t pad: {4, 6}) + for(auto mode: {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE}) + if (kernel > pad) + { + param::Pooling param; + param.mode = mode; + param.format = param::Pooling::Format::NCHW44; + param.pad_h = pad; + param.pad_w = pad; + param.stride_h = param.stride_w = 1; + param.window_h = param.window_w = kernel ; + checker.set_param(param).exec(TensorShapeArray{{2, 8, ih, iw, 4}, {}}); + } + // clang-format on +} + +TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_FALLBACK) { + using Param = param::Pooling; + for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p : {1, 2}) { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } +} + +TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI) { + using Param = param::Pooling; + for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) + for (size_t p : {1, 2}) { + Param param; + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + Checker checker(handle()); + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::AVERAGE; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = p; + if (ih + p * 2 >= 5 && iw + p * 2 >= 5) + checker.set_param(param).exec({{2, 3, ih, iw}, {}}); + } +} + +#if MEGDNN_WITH_BENCHMARK +namespace { +void benchmark_nchw44_fp32(Handle* handle) { + using Param = param::Pooling; + auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter, size_t stride, + size_t pad, Param::Mode mode) { + Param param; + param.window_h = param.window_w = filter; + param.stride_h = param.stride_w = stride; + param.pad_h = param.pad_w = pad; + param.format = Param::Format::NCHW; + param.mode = mode; + TensorShape nchw_shape = {n, c, h, w}; + TensorShape nchw44_shape = {n, c / 4, h, w, 4}; + TensorLayout dst_layout; + auto opr = handle->create_operator(); + opr->param() = param; + opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout); + float calc_amount = + dst_layout.total_nr_elems() * param.window_h * param.window_w; + + Benchmarker benchmarker_float_nchw(handle); + Benchmarker benchmarker_float_nchw44(handle); + Benchmarker 
benchmarker_int_nchw44(handle); + size_t RUN = 500; + auto t1 = benchmarker_float_nchw.set_display(false) + .set_times(RUN) + .set_param(param) + .exec({nchw_shape, {}}); + + param.format = Param::Format::NCHW44; + auto t2 = benchmarker_int_nchw44.set_display(false) + .set_times(RUN) + .set_param(param) + .execl({{nchw44_shape, dtype::QuantizedS8(1.0)}, + {{}, dtype::QuantizedS8(1.0)}}); + auto t3 = benchmarker_float_nchw44.set_display(false) + .set_times(RUN) + .set_param(param) + .exec({nchw44_shape, {}}); + + printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n" + "nchw_fp32={%.3f ms, %.3f Mflops}, " + "nchw44_int={%.3f ms, %.3f Mflops}, " + "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n", + n, c, h, w, filter, stride, pad, t1 / RUN, + calc_amount / (t1 / RUN * 1000), t2 / RUN, + calc_amount / (t2 / RUN * 1000), t3 / RUN, + calc_amount / (t3 / RUN * 1000), t1 / t3); + }; + // Resnet50 + run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX); + run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE); + + // VGG16 + run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX); + run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX); + run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX); + run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX); + run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX); +} +} // namespace + +TEST_F(FALLBACK, BENCHMARK_POOLING_GI_NCHW44_FP32) { + benchmark_nchw44_fp32(handle()); +} + +TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44_FP32) { + benchmark_nchw44_fp32(handle()); +} +TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W4x4_S2x2) { + using Param = param::Pooling; + auto run = [&](const TensorShapeArray& shapes, Param param) { + std::cout << "N:" << shapes[0][0] << " " + << "IC:" << shapes[0][1] << " " + << "IH:" << shapes[0][2] << " " + << "IW:" << shapes[0][3] << std::endl; + auto handle_naive = create_cpu_handle(2); + Benchmarker benchmarker_naive(handle_naive.get()); + Benchmarker benchmarker_float(handle()); + size_t RUN = 10; + auto t1 = benchmarker_naive.set_display(false) + .set_times(RUN) + .set_param(param) + .exec(shapes); + auto t2 = benchmarker_float.set_display(false) + .set_times(RUN) + .set_param(param) + .exec(shapes); + TensorLayout dst_layout; + auto opr = handle()->create_operator(); + opr->param() = param; + opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout); + float calc_amount = + dst_layout.total_nr_elems() * param.window_h * param.window_w; + printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN, + calc_amount / (t1 / RUN * 1000), t2 / RUN, + calc_amount / (t2 / RUN * 1000)); + }; + Param param; + param.window_h = param.window_w = 4; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 1; + std::cout << "4x4 with 2x2 stride max pooling:" << std::endl; + run({{1, 24, 160, 128}, {}}, param); + run({{1, 4, 240, 135}, {}}, param); + run({{1, 32, 120, 67}, {}}, param); + run({{1, 64, 60, 33}, {}}, param); +} + +TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W5x5_S2x2) { + using Param = param::Pooling; + auto run = [&](const TensorShapeArray& shapes, Param param) { + std::cout << "N:" << shapes[0][0] << " " + << "IC:" << shapes[0][1] << " " + << "IH:" << shapes[0][2] << " " + << "IW:" << shapes[0][3] << std::endl; + auto handle_naive = create_cpu_handle(2); + Benchmarker benchmarker_naive(handle_naive.get()); + Benchmarker benchmarker_float(handle()); + size_t RUN = 10; + auto t1 = benchmarker_naive.set_display(false) + .set_times(RUN) + .set_param(param) + 
.exec(shapes); + auto t2 = benchmarker_float.set_display(false) + .set_times(RUN) + .set_param(param) + .exec(shapes); + TensorLayout dst_layout; + auto opr = handle()->create_operator(); + opr->param() = param; + opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout); + float calc_amount = + dst_layout.total_nr_elems() * param.window_h * param.window_w; + printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN, + calc_amount / (t1 / RUN * 1000), t2 / RUN, + calc_amount / (t2 / RUN * 1000)); + }; + Param param; + param.window_h = param.window_w = 5; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 1; + std::cout << "5x5 with 2x2 stride max pooling:" << std::endl; + run({{1, 24, 160, 128}, {}}, param); + run({{1, 4, 240, 135}, {}}, param); + run({{1, 32, 120, 67}, {}}, param); + run({{1, 64, 60, 33}, {}}, param); +} +namespace { +template +void benchmark_impl( + const typename Opr::Param& param, std::vector> shapes, + size_t RUNS, TaskExecutorConfig&& multi_thread_config, + TaskExecutorConfig&& single_thread_config, DType data_type) { + std::vector multi_thread_times, single_thread_times; + { + auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config); + auto benchmarker = Benchmarker(multi_thread_hanle.get()); + benchmarker.set_times(RUNS).set_display(false).set_param(param); + benchmarker.set_dtype(0, data_type); + for (auto shape : shapes) { + multi_thread_times.push_back(benchmarker.exec(shape) / RUNS); + } + } + { + auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config); + auto benchmarker = Benchmarker(single_thread_handle.get()); + benchmarker.set_times(RUNS).set_display(false).set_param(param); + benchmarker.set_dtype(0, data_type); + for (auto shape : shapes) { + single_thread_times.push_back(benchmarker.exec(shape) / RUNS); + } + } + printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread); + printf("core_ids:"); + for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) { + printf("%zu ", multi_thread_config.affinity_core_set[i]); + } + printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]); + for (size_t i = 0; i < shapes.size(); i++) { + auto shape = shapes[i]; + printf("Case: "); + for (auto sh : shape) + printf("%s ", sh.to_string().c_str()); + printf("%zu threads time: %f,\n single thread time: " + "%f. 
spead up = %f, speedup/cores=%f\n", + multi_thread_config.nr_thread, multi_thread_times[i], + single_thread_times[i], single_thread_times[i] / multi_thread_times[i], + single_thread_times[i] / multi_thread_times[i] / + multi_thread_config.nr_thread); + } +} +} // namespace + +TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI) { + constexpr size_t RUNS = 50; + + using Param = param::Pooling; + Param param; + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 1; + + std::vector> shapes; + + shapes.push_back({{32, 32, 215, 215}, {}}); + shapes.push_back({{32, 32, 128, 128}, {}}); + shapes.push_back({{8, 256, 100, 100}, {}}); + shapes.push_back({{1, 256, 100, 100}, {}}); + shapes.push_back({{1, 32, 100, 100}, {}}); + shapes.push_back({{1, 256, 80, 80}, {}}); + shapes.push_back({{1, 256, 60, 60}, {}}); + shapes.push_back({{1, 256, 30, 30}, {}}); + + param.window_h = param.window_w = 3; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 1; + printf("Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", param.window_h, + param.window_w, param.stride_h, static_cast(param.mode)); + benchmark_impl( + param, shapes, RUNS, {4, {0, 1, 2, 3}}, {1, {0}}, dtype::Float32()); + benchmark_impl( + param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, dtype::Float32()); + benchmark_impl( + param, shapes, RUNS, {2, {0, 1}}, {1, {0}}, dtype::Float32()); +} + +TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44) { + constexpr size_t RUNS = 50; + + using Param = param::Pooling; + Param param; + param.pad_h = param.pad_w = 0; + param.mode = Param::Mode::MAX; + std::vector> shapes; + std::vector> filter_and_stride = { + {2, 1}, {2, 2}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, {5, 1}, {5, 2}}; + + for (auto mode : {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE}) { + for (auto filter : filter_and_stride) { + shapes.push_back({{1, 32 * 4, 215, 215}, {}}); + shapes.push_back({{1, 32 * 4, 128, 128}, {}}); + shapes.push_back({{1, 16 * 4, 56, 56}, {}}); + + param.mode = mode; + param.window_h = param.window_w = filter[0]; + param.stride_h = param.stride_w = filter[1]; + param.format = Param::Format::NCHW; + printf("NCHW Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", + param.window_h, param.window_h, param.stride_h, + static_cast(param.mode)); + benchmark_impl( + param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, + dtype::QuantizedS8(1.1f)); + shapes.clear(); + shapes.push_back({{1, 32, 215, 215, 4}, {}}); + shapes.push_back({{1, 32, 128, 128, 4}, {}}); + shapes.push_back({{1, 16, 56, 56, 4}, {}}); + + param.format = Param::Format::NCHW44; + printf("NCHW44 Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", + param.window_h, param.window_w, param.stride_h, + static_cast(param.mode)); + benchmark_impl( + param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, + dtype::QuantizedS8(1.1f)); + shapes.clear(); + } + } +} +#endif + +} // namespace test +} // namespace megdnn + // vim: syntax=cpp.doxygen -- GitLab