提交 91aaafd5 编写于 作者: M Megvii Engine Team

feat(fallback): move arm_common pooling f32 algo to fallback gi

GitOrigin-RevId: 1bddd6dc2c8219a85b61badabb66015969f2ae7f
上级 bde2efa3
......@@ -12,7 +12,7 @@
#pragma once
#include "src/arm_common/pooling/opr_impl.h"
#include "src/arm_common/pooling/pooling_helper.h"
#include "src/common//utils.h"
#include "src/common/utils.h"
#include "src/naive/handle.h"
namespace megdnn {
......@@ -134,22 +134,15 @@ public:
void exec(const PoolingKernParam& param) const override;
MEGDNN_DECL_ALGO_TYPE(ARM_Filter5ModexStridexNCHW44)
};
//! fp32 NCHW44 pooling algo with generic mode/stride dispatch; usable()/exec()
//! are defined in the corresponding .cpp.
class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override {
        return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44";
    }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(ARM_Fp32ModexStridexNCHW44)
};
class PoolingImpl::AlgoFallback final : public AlgoBase {
public:
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
const char* name() const override { return "FALLBACK_POOLING"; }
bool usable(const PoolingKernSizeParam&) const override { return true; }
void exec(const PoolingKernParam&) const override {}
void exec(const PoolingKernParam&) const override {
megdnn_assert(false, "code issue happened!!");
}
MEGDNN_DECL_ALGO_TYPE(ARM_Fallback)
};
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param);
......
......@@ -32,7 +32,6 @@ private:
AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4;
AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4;
AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4;
AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44;
AlgoFallback algo_fallback;
public:
......@@ -49,7 +48,6 @@ public:
all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4);
all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4);
all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44);
all_algos.emplace_back(&algo_fallback);
for (auto&& algo : all_algos) {
......@@ -62,40 +60,6 @@ public:
PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;
//! Build a shape-only PoolingKernSizeParam from the operator's param() and the
//! src/dst tensor layouts. shape[0]/shape[1] are taken as N/C and
//! shape[2]/shape[3] as H/W — assumes an NCHW-like layout; for NCHW44 the C
//! dimension is additionally packed by 4 (TODO confirm with callers).
PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param(
        fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) {
    // Checked narrowing: every size is stored as uint32_t in the kern param,
    // so assert the size_t value fits before casting.
    auto safe_u32 = [](size_t v) -> uint32_t {
        megdnn_assert(
                v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v);
        return v;
    };
    return {safe_u32(src.shape[0]),                                   // n
            safe_u32(src.shape[1]),                                   // ic
            {{safe_u32(src.shape[2]), safe_u32(src.shape[3])}},       // isz (H, W)
            {{safe_u32(dst.shape[2]), safe_u32(dst.shape[3])}},       // osz (H, W)
            {{safe_u32(opr->param().pad_h), safe_u32(opr->param().pad_w)}},
            {{safe_u32(opr->param().window_h), safe_u32(opr->param().window_w)}},
            {{safe_u32(opr->param().stride_h), safe_u32(opr->param().stride_w)}},
            src.dtype,
            dst.dtype,
            opr->handle(),
            opr->param().format,
            opr->param().mode};
};
//! Build a full PoolingKernParam: the size parameters (via
//! make_pooling_kern_szie_param) plus src/dst ref-pointers and workspace info.
PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param(
        fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
        _megdnn_workspace workspace) {
    PoolingKernParam ret;
    // Fill the PoolingKernSizeParam base sub-object first, then the pointers.
    static_cast<PoolingKernSizeParam&>(ret) =
            make_pooling_kern_szie_param(opr, src.layout, dst.layout);
    ret.src_ptr = src.get_ref_ptr();
    ret.dst_ptr = dst.get_ref_ptr();
    ret.workspace_ptr = workspace.raw_ptr;
    ret.workspace_size = workspace.size;
    return ret;
};
size_t PoolingImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
TensorLayoutArray layouts{src, dst};
......
......@@ -19,6 +19,10 @@ namespace arm_common {
class PoolingImpl final : public fallback::PoolingImpl {
private:
//! TODO: remove
//! AlgoFilterxModexStride1/AlgoFilter2ModexStride2
//! AlgoFilter3AverageStride2/AlgoFilter4MaxStride2/AlgoFilter5MaxStride2
//! after imp gi with float16 and int8 support to dnn/src/fallback/pooling/opr_impl.h
class AlgoFilterxModexStride1;
class AlgoFilter2ModexStride2;
class AlgoFilter3MaxStride2;
......@@ -31,7 +35,6 @@ private:
class AlgoFilter3ModexStridexNCHW44;
class AlgoFilter4ModexStridexNCHW44;
class AlgoFilter5ModexStridexNCHW44;
class AlgoFp32ModexStridexNCHW44;
class AlgoFallback;
class AlgoPack;
static AlgoPack sm_algo_pack;
......@@ -45,47 +48,10 @@ public:
static size_t constexpr MAX_SPATIAL_DIM = 2;
//! Shape-only parameters describing one pooling problem instance; all sizes
//! are narrowed to uint32_t by make_pooling_kern_szie_param.
struct PoolingKernSizeParam {
    uint32_t n, ic;                                        // batch, channels
    std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;        // input/output (H, W)
    std::array<uint32_t, MAX_SPATIAL_DIM> padding, filter, stride;
    DType src_type, dst_type;
    Handle* handle;
    Param::Format format;
    Mode mode;
};
//! Size parameters plus the runtime pointers needed to actually execute the
//! kernel (src/dst ref-pointers and the workspace slice).
struct PoolingKernParam : public PoolingKernSizeParam {
    RefPtr src_ptr;
    RefPtr dst_ptr;
    void* workspace_ptr;
    size_t workspace_size;

    //! Typed source accessor; asserts src_type is compatible with T first.
    template <typename T>
    const T* src() const {
        src_type.assert_is_compatible_ctype<T>();
        return static_cast<const T*>(src_ptr.get_ptr());
    }

    //! Typed destination accessor; asserts dst_type is compatible with T first.
    template <typename T>
    T* dst() const {
        dst_type.assert_is_compatible_ctype<T>();
        return static_cast<T*>(dst_ptr.get_ptr());
    }

    //! Raw workspace pointer cast to T*; no dtype check is possible here.
    template <typename T>
    T* workspace() const {
        return static_cast<T*>(workspace_ptr);
    }
};
using PoolingKernSizeParam = fallback::PoolingImpl::PoolingKernSizeParam;
PoolingKernSizeParam make_pooling_kern_szie_param(
fallback::PoolingImpl* opr, const TensorLayout& src,
const TensorLayout& dst);
using PoolingKernParam = fallback::PoolingImpl::PoolingKernParam;
PoolingKernParam make_pooling_kern_param(
fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
class AlgoBase : public detail::Algorithm {
public:
enum class AlgoType : uint32_t {
......
......@@ -1325,3 +1325,35 @@ GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) {
return ___gi_vget_high_f32(a);
#endif
}
//! Pairwise add of two 2-lane float vectors, mirroring NEON vpadd_f32:
//! result[0] = a[0] + a[1], result[1] = b[0] + b[1].
GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) {
#if defined(GI_NEON_INTRINSICS)
    return vpadd_f32(a, b);
#elif defined(GI_SSE2_INTRINSICS)
    // SSE2 path: float32x2_t is an emulated type, compute lane by lane.
    float32x2_t res;
    res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1];
    res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1];
    return res;
#else
    // Plain scalar fallback.
    float32x2_t res;
    res[0] = a[0] + a[1];
    res[1] = b[0] + b[1];
    return res;
#endif
}
//! Pairwise max of two 2-lane float vectors, mirroring NEON vpmax_f32:
//! result[0] = max(a[0], a[1]), result[1] = max(b[0], b[1]).
//! Non-NEON paths use MAX_NAN (defined elsewhere) — presumably to match
//! NEON's NaN behavior; confirm against the MAX_NAN definition.
GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
#if defined(GI_NEON_INTRINSICS)
    return vpmax_f32(a, b);
#elif defined(GI_SSE2_INTRINSICS)
    float32x2_t res;
    res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]);
    res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]);
    return res;
#else
    float32x2_t res;
    res[0] = MAX_NAN(a[0], a[1]);
    res[1] = MAX_NAN(b[0], b[1]);
    return res;
#endif
}
/**
* \file dnn/src/fallback/gi_intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/common/unroll_macro.h"
#include "src/fallback/general_intrinsic/gi_float.h"
namespace megdnn {
namespace {
//! Primary template: loads `weight_number` SIMD vectors from `ptr`
//! (element stride `ptr_step`, starting at `base_offset`) into `weight`,
//! using Func::impl as the load operation. Declaration only — the partial
//! specializations for oc_block = 0/1/2 below provide the bodies.
template <
        int weight_number, int base_offset, int ptr_step, int oc_block, typename Func,
        typename T, typename T2, typename... XT>
struct LoadHelper {
    static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args);
};
//! oc_block == 0 specializations: `src` is a flat vector array and extra
//! arguments (XT...) are forwarded to the load Func. Instantiated for
//! 1..16 loads via UNROLL_CALL_RAW.
#define WEIGHT_CB(step) \
    src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);

#define LOAD_HELPER(step)                                                          \
    template <                                                                     \
            int base_offset, int ptr_step, typename Func, typename T, typename T2, \
            typename... XT>                                                        \
    struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> {        \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int, XT... args) {         \
            UNROLL_CALL_RAW(step, WEIGHT_CB);                                      \
        }                                                                          \
    }

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);
LOAD_HELPER(10);
LOAD_HELPER(11);
LOAD_HELPER(12);
LOAD_HELPER(13);
LOAD_HELPER(14);
LOAD_HELPER(15);
LOAD_HELPER(16);
#undef LOAD_HELPER
#undef WEIGHT_CB
///////////////////////////c_dim = 1/////////////////////////
//! c_dim == 1 specializations: one output-channel block, results go into
//! src[0][...]; no oc_offset and no extra Func arguments.
#define WEIGHT_CB(step) src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);

#define LOAD_HELPER(step)                                                            \
    template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \
    struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> {                 \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int) {                       \
            UNROLL_CALL_RAW(step, WEIGHT_CB);                                        \
        }                                                                            \
    }

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
LOAD_HELPER(9);
#undef LOAD_HELPER
#undef WEIGHT_CB
/////////////////////////c_dim = 2///////////////////////////////
//! c_dim == 2 specializations: two output-channel blocks; the second block is
//! loaded `oc_offset` elements after the first.
#define WEIGHT_CB(step)                                                \
    src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);    \
    src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);

#define LOAD_HELPER(step)                                                            \
    template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \
    struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> {                 \
        static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) {             \
            UNROLL_CALL_RAW(step, WEIGHT_CB);                                        \
        }                                                                            \
    }

LOAD_HELPER(1);
LOAD_HELPER(2);
LOAD_HELPER(3);
LOAD_HELPER(4);
LOAD_HELPER(5);
LOAD_HELPER(6);
LOAD_HELPER(7);
LOAD_HELPER(8);
#undef LOAD_HELPER
#undef WEIGHT_CB
//! Convenience wrapper over LoadHelper for callers without extra Func
//! arguments; c_dim selects the 0/1/2 output-channel-block specialization.
template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
        typename T, typename T2>
GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) {
    LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl(
            weight, ptr, oc_offset);
}
//! Variant of load_helper that forwards extra arguments (XT...) to the load
//! Func — only the oc_block == 0 specializations accept them.
template <
        int weight_number, int base_offset, int ptr_step, int c_dim, typename Func,
        typename T, typename T2, typename... XT>
GI_FORCEINLINE void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) {
    LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2, XT...>::impl(
            weight, ptr, oc_offset, args...);
}
} // namespace
} // namespace megdnn
// vim: syntax=cpp.doxygen
此差异已折叠。
/**
* \file dnn/src/fallback/pooling/gi/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/common/utils.h"
#include "src/fallback/pooling/opr_impl.h"
#include "pooling_helper.h"
#include "src/naive/handle.h"
#include "src/naive/pooling/opr_impl.h"
namespace megdnn {
namespace fallback {
using AlgoBase = PoolingImpl::AlgoBase;
//! GI (generic intrinsic) pooling algo: any filter size, stride 1;
//! usable()/exec() are defined in the matching .cpp.
class PoolingImpl::AlgoGiFilterxModexStride1 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_STRIDE1"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_FilterxModexStride1)
};
//! GI pooling algo: 2x2 filter, stride 2.
class PoolingImpl::AlgoGiFilter2ModexStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_STRIDE2"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter2ModexStride2)
};
//! GI pooling algo: 3x3 MAX pooling, stride 2.
class PoolingImpl::AlgoGiFilter3MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER3_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter3MaxStride2)
};
//! GI pooling algo: 3x3 AVERAGE pooling, stride 2.
class PoolingImpl::AlgoGiFilter3AverageStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER3_AVERAGE"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter3AverageStride2)
};
//! GI pooling algo: 4x4 MAX pooling, stride 2.
class PoolingImpl::AlgoGiFilter4MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER4_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter4MaxStride2)
};
//! GI pooling algo: 5x5 MAX pooling, stride 2.
class PoolingImpl::AlgoGiFilter5MaxStride2 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FILTER5_MAX"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Filter5MaxStride2)
};
//! GI pooling algo: fp32 NCHW44 layout, generic mode/stride (the GI port of
//! the arm_common AlgoFp32ModexStridexNCHW44 this commit removes).
class PoolingImpl::AlgoGiFp32ModexStridexNCHW44 final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "GI_POOLING_FP32_MODEX_STRIDEX_NCHW44"; }
    bool usable(const PoolingKernSizeParam& param) const override;
    void exec(const PoolingKernParam& param) const override;
    MEGDNN_DECL_ALGO_TYPE(GI_Fp32ModexStridexNCHW44)
};
//! Sentinel algo that always reports usable but must never actually run:
//! exec() asserts, so a selection reaching it indicates a dispatch bug
//! upstream (the naive/non-GI path should have been chosen instead).
class PoolingImpl::AlgoFallback final : public AlgoBase {
public:
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
    const char* name() const override { return "FALLBACK_NOT_GI_POOLING"; }
    bool usable(const PoolingKernSizeParam&) const override { return true; }
    void exec(const PoolingKernParam& /*param*/) const override {
        megdnn_assert(false, "code issue happened!!");
    }
    MEGDNN_DECL_ALGO_TYPE(FallbackNotGI)
};
WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam&);
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
* \file dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......@@ -10,17 +10,17 @@
* implied.
*/
#include "algo.h"
#include "kern_fp32_pooling_nchw44.h"
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/pooling/algo.h"
#include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h"
#include "midout.h"
MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44)
MIDOUT_DECL(megdnn_fallback_fp32_pooling_nchw44)
namespace megdnn {
namespace arm_common {
bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable(
namespace fallback {
bool PoolingImpl::AlgoGiFp32ModexStridexNCHW44::usable(
const PoolingKernSizeParam& param) const {
uint32_t sh = param.stride[0];
uint32_t sw = param.stride[1];
......@@ -37,7 +37,7 @@ bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable(
return avaible && size_ok;
}
void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
void PoolingImpl::AlgoGiFp32ModexStridexNCHW44::exec(
const PoolingKernParam& param) const {
int ih = param.isz[0];
int iw = param.isz[1];
......@@ -55,7 +55,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
#define DISPATCH_FUNC(filter, stride, mode) \
MIDOUT_BEGIN( \
megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \
megdnn_fallback_fp32_pooling_nchw44, midout_iv(0), \
midout_iv(#filter #stride #mode##_hash)) { \
auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \
const int c_idx = index; \
......@@ -135,7 +135,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec(
#undef DISPATCH_FUNC
}
} // namespace arm_common
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/common/utils.h"
#include <algorithm>
#include <vector>
#include "do_max_pooling_3x3_s2x2_float.h"
#include "src/common/macro_helper.h"
namespace megdnn {
namespace fallback {
//! De-interleave two 4-lane vectors: d0 receives the even lanes of (s0,s1),
//! d1 the odd lanes (wraps GiUzpqFloat32).
#define GI_UZP(s0, s1, d0, d1)              \
    do {                                    \
        auto tmp__ = GiUzpqFloat32(s0, s1); \
        d0 = tmp__.val[0];                  \
        d1 = tmp__.val[1];                  \
    } while (0)
//! 3x3 max pooling with 2x2 stride over one fp32 plane.
//!
//! Strategy: for each input row, first max-pool along W into a per-row cache
//! line; then, for each output row, take the max over the (up to 3) cached
//! rows. `ws` must provide 5 buffers: ws.get(0..2) row caches, ws.get(3/4)
//! odd/even de-interleave buffers.
void do_max_pooling_3x3_s2x2_float_gi(
        const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_,
        size_t PH_, size_t PW_, const WorkspaceBundle& ws) {
    int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_;
    // cache[i] stores the answer of the i-th line after
    // pooling along the W dimension.
    float* cache[3] = {
            static_cast<float*>(ws.get(0)), static_cast<float*>(ws.get(1)),
            static_cast<float*>(ws.get(2))};
    float* odd = static_cast<float*>(ws.get(3));
    float* even = static_cast<float*>(ws.get(4));
    int ih_next = 0;
    // "good" area means we can use SIMD to accelerate.
    auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) {
        // x*2 - P >= 0; 2x >= P; x >= P/2
        O_from = (P + 1) / 2;
        // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2
        O_to = (I + P - 3) / 2 + 1;
        // we must have I >= 2 to ensure O_from <= O_to
    };
    int OW_from, OW_to;
    get_good_area(IW, OW, PW, OW_from, OW_to);
    // Pool input row `ih` along W into cache[0], rotating the cache so that
    // cache[0]/[1]/[2] hold the newest..oldest processed rows.
    auto process_cache = [&](int ih) {
        const float* __restrict sptr = src + ih * IW;
        auto tmp = cache[2];
        cache[2] = cache[1];
        cache[1] = cache[0];
        cache[0] = tmp;
        // cache 0 is used to store the current answer.
        // Scalar path for border columns: max over the in-bounds taps only.
        auto run_single = [&](int ow) {
            int iw = ow * 2 - PW;
            float res = std::numeric_limits<float>::lowest();
            if (iw + 0 >= 0 && iw + 0 < IW) {
                res = std::max(res, sptr[iw + 0]);
            }
            if (iw + 1 >= 0 && iw + 1 < IW) {
                res = std::max(res, sptr[iw + 1]);
            }
            if (iw + 2 >= 0 && iw + 2 < IW) {
                res = std::max(res, sptr[iw + 2]);
            }
            cache[0][ow] = res;
        };
        // build odd/even
        int iw = 0;
        int odd_offset = 0, even_offset = 0;
        // De-interleave the row 8 floats at a time; with stride 2, the window
        // taps for consecutive outputs land in the odd/even streams.
        for (; iw + 2 * 4 <= IW; iw += 2 * 4) {
            GI_FLOAT32_t s0, s1, d0, d1;
            s0 = GiLoadFloat32(sptr + iw);
            s1 = GiLoadFloat32(sptr + iw + 4);
            GI_UZP(s0, s1, d0, d1);
            GiStoreFloat32(even + even_offset, d0);
            GiStoreFloat32(odd + odd_offset, d1);
            even_offset += 4;
            odd_offset += 4;
        }
        // Scalar tail of the de-interleave.
        for (; iw < IW; ++iw) {
            if (iw & 1)
                odd[odd_offset++] = sptr[iw];
            else
                even[even_offset++] = sptr[iw];
        }
        int ow = 0;
        for (; ow < OW_from; ++ow)
            run_single(ow);
        // In the "good" area, each output is the max of three consecutive
        // values drawn alternately from the odd/even streams; the parity of
        // PW decides which stream the window starts in.
        if (PW & 1) {
            for (; ow + 4 <= OW_to; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1);
                s1 = GiLoadFloat32(even + ow - (PW >> 1));
                s2 = GiLoadFloat32(odd + ow - (PW >> 1));
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        } else {
            for (; ow + 4 <= OW_to; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(even + ow - (PW >> 1));
                s1 = GiLoadFloat32(odd + ow - (PW >> 1));
                s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1);
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(cache[0] + ow, d);
            }
        }
        for (; ow < OW; ++ow)
            run_single(ow);
    };
    for (int oh = 0; oh < OH; ++oh) {
        float* __restrict dptr = dst + oh * OW;
        int ih_from = std::min(IH, std::max(0, oh * 2 - PH));
        int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3));
        // Advance the row cache until it covers this output row's window.
        while (ih_next < ih_to) {
            process_cache(ih_next++);
        }
        if (ih_to - ih_from == 3) {
            // Full 3-row window: vectorized max over the three cached rows.
            int ow = 0;
            for (; ow + 4 <= OW; ow += 4) {
                GI_FLOAT32_t d, s0, s1, s2;
                s0 = GiLoadFloat32(cache[0] + ow);
                s1 = GiLoadFloat32(cache[1] + ow);
                s2 = GiLoadFloat32(cache[2] + ow);
                d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2);
                GiStoreFloat32(dptr + ow, d);
            }
            for (; ow < OW; ++ow) {
                dptr[ow] = std::max(std::max(cache[0][ow], cache[1][ow]), cache[2][ow]);
            }
        } else {
            // Clipped window (top/bottom border): start from the newest row
            // and fold in the remaining cached rows.
            std::memcpy(dptr, cache[0], sizeof(float) * OW);
            for (int i = 1; i < ih_to - ih_from; ++i) {
                int ow = 0;
                for (; ow + 4 <= OW; ow += 4) {
                    GI_FLOAT32_t d, s;
                    s = GiLoadFloat32(cache[i] + ow);
                    d = GiLoadFloat32(dptr + ow);
                    d = GiMaximumFloat32(d, s);
                    GiStoreFloat32(dptr + ow, d);
                }
                for (; ow < OW; ++ow) {
                    dptr[ow] = std::max(dptr[ow], cache[i][ow]);
                }
            }
        }
    }
}
} // namespace fallback
} // namespace megdnn
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/common/utils.h"
#include "megdnn/arch.h"
#include "src/fallback/general_intrinsic/gi_float.h"
namespace megdnn {
namespace fallback {
void do_max_pooling_3x3_s2x2_float_gi(
const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_,
size_t PH_, size_t PW_, const WorkspaceBundle& ws);
} // namespace fallback
} // namespace megdnn
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "do_max_pooling_w4x4_s2x2.h"
#include "pooling_helper.h"
namespace megdnn {
namespace fallback {
//! 4x4 max pooling with 2x2 stride over one fp32 plane.
//!
//! Rows/columns whose window crosses the border fall back to the naive
//! per-pixel pooler; interior outputs are computed with SIMD. With window 4
//! and stride 2, adjacent output windows overlap by 2 columns, so each
//! pairwise-max half-result (`last_hf_res`) is reused by the next output.
void do_max_pooling_w4x4_s2x2_float_gi(
        const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH,
        const int IW, const int OH, const int OW, const int PH, const int PW) {
    const int window = 4;
    const int stride = 2;
    using Pooler = MaxPooler<16, dt_float32, float, float>;
    int oh = 0;
    // Output rows whose window starts above the input: naive path.
    for (; oh < OH && -PH + stride * oh < 0; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // Output rows whose 4-row window lies fully inside the input.
    for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) {
        int ow = 0;
        // Left-border columns: naive path.
        for (; ow < OW && -PW + stride * ow < 0; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
        dt_float32 last_hf_res = -std::numeric_limits<dt_float32>::infinity();
        int ih = -PH + stride * oh, iw = -PW + stride * ow;
        // Prime the pipeline: compute the first interior output and remember
        // the max of its right half (columns iw+2..iw+3) for reuse.
        if (-PW + stride * ow + window <= IW) {
            GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw),
                         i1 = GiLoadFloat32(src + (ih + 1) * IW + iw),
                         i2 = GiLoadFloat32(src + (ih + 2) * IW + iw),
                         i3 = GiLoadFloat32(src + (ih + 3) * IW + iw);
            GI_FLOAT32_t sum0 = GiMaximumFloat32(
                    GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3));
            // Pairwise max folds 4 lanes into 2: t = {max(l0,l1), max(l2,l3)}.
            float32x2_t t =
                    GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
            dst[oh * OW + ow] =
                    std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1));
            last_hf_res = GiGetLaneFloat32(t, 1);
            ow += 1;
        }
        // Steady state: each 4-wide load serves two outputs — `ow` combines
        // the carried half-result with the new left pair, `ow + 1` uses both
        // new pairs.
        for (; ow + 1 < OW && -PW + stride * (ow + 1) + window <= IW; ow += 2) {
            iw = -PW + stride * (ow + 1);
            GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw),
                         i1 = GiLoadFloat32(src + (ih + 1) * IW + iw),
                         i2 = GiLoadFloat32(src + (ih + 2) * IW + iw),
                         i3 = GiLoadFloat32(src + (ih + 3) * IW + iw);
            GI_FLOAT32_t sum0 = GiMaximumFloat32(
                    GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3));
            float32x2_t t =
                    GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0));
            dst[oh * OW + ow + 0] = std::max(GiGetLaneFloat32(t, 0), last_hf_res);
            dst[oh * OW + ow + 1] =
                    std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1));
            last_hf_res = GiGetLaneFloat32(t, 1);
        }
        // Right-border columns: naive path.
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
    // Output rows whose window extends below the input: naive path.
    for (; oh < OH; ++oh) {
        int ow = 0;
        for (; ow < OW; ++ow) {
            do_pxl_naive<Pooler, window>(
                    oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride,
                    stride);
        }
    }
}
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/fallback/pooling/opr_impl.h"
namespace megdnn {
namespace fallback {
void do_max_pooling_w4x4_s2x2_float_gi(
const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH,
const int IW, const int OH, const int OW, const int PH, const int PW);
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <limits>
#include "megdnn/opr_param_defs.h"
#include "src/common/unroll_macro.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/gi_intrinsic_helper.h"
namespace megdnn {
namespace fallback {
namespace {
//! Primary template for the filter/stride-specific NCHW44 reduction step;
//! declaration only — bodies come from the INSTANCE_CAL specializations below.
template <
        int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1,
        typename T2>
struct CalXsXNchw44 {
    static void impl(T1 result, T2 src);
};

//! Load functor for load_helper: plain 4-float GI load from a float pointer.
struct GiD1Qf32 {
    static GI_FORCEINLINE GI_FLOAT32_t impl(const float32_t* ptr) {
        return GiLoadFloat32(ptr);
    }
};
//! Dispatch to the CalXsXNchw44 specialization for the given
//! filter/stride/mode and fold the loaded `src` vectors into `result`.
//! (Dropped the stray semicolon after the function body — it triggers
//! -Wextra-semi / pedantic warnings and has no effect.)
template <
        int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1,
        typename T2>
void calculate_xsx_nchw44(T1 result, T2 src) {
    CalXsXNchw44<filter, stride, ow_step, mode, T1, T2>::impl(result, src);
}
//! Per-tap reduction bodies for the 4 output accumulators: MAX takes the
//! element-wise max, AVERAGE sums (the division by the window size happens
//! later in the caller).
#define CALCULATE_MAX_CB(step)                                       \
    result[0] = GiMaximumFloat32(result[0], src[0 * stride + step]); \
    result[1] = GiMaximumFloat32(result[1], src[1 * stride + step]); \
    result[2] = GiMaximumFloat32(result[2], src[2 * stride + step]); \
    result[3] = GiMaximumFloat32(result[3], src[3 * stride + step]);

#define CALCULATE_AVG_CB(step)                                   \
    result[0] = GiAddFloat32(result[0], src[0 * stride + step]); \
    result[1] = GiAddFloat32(result[1], src[1 * stride + step]); \
    result[2] = GiAddFloat32(result[2], src[2 * stride + step]); \
    result[3] = GiAddFloat32(result[3], src[3 * stride + step]);

//! Instantiate CalXsXNchw44 for ow_step == 4 and both pooling modes; each
//! instantiation unrolls `filter` reduction taps.
#define INSTANCE_CAL(filter)                                                     \
    template <int stride, typename T1, typename T2>                              \
    struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> {     \
        static void impl(T1 result, T2 src) {                                    \
            UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB);                           \
        }                                                                        \
    };                                                                           \
    template <int stride, typename T1, typename T2>                              \
    struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, T2> { \
        static void impl(T1 result, T2 src) {                                    \
            UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB);                           \
        }                                                                        \
    };

INSTANCE_CAL(2)
INSTANCE_CAL(3)
INSTANCE_CAL(4)
INSTANCE_CAL(5)
INSTANCE_CAL(9)
INSTANCE_CAL(13)
#undef INSTANCE_CAL
#undef CALCULATE_AVG_CB
#undef CALCULATE_MAX_CB
//! Primary template for the full-window NCHW44 pooling kernel; declaration
//! only — the MAX/AVERAGE specializations below provide the bodies.
template <int filter, int stride, int ow_step, PoolingBase::Mode mode>
struct KerPoolingFilterXStrideXNchw44 {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw);
};
//! MAX-mode kernel: pools `ow_step` consecutive output packs (NCHW44, 4
//! channels each) from a window that lies fully inside the input.
//! NOTE(review): the body hard-codes 4 result registers, so only
//! ow_step == 4 is valid here — confirm callers never pass another value.
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, PoolingBase::Mode::MAX> {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
        // Vectors needed per input row to produce ow_step outputs.
        constexpr int src_reg_size = ow_step * stride + filter - stride;
        constexpr int packed_ic = 4;
        constexpr int simd_len = 4;
        // Identity element for max.
        constexpr float default_float = std::numeric_limits<float>::lowest();
        GI_FLOAT32_t result[ow_step];
        GI_FLOAT32_t src[src_reg_size];
        result[0] = GiBroadcastFloat32(default_float);
        result[1] = GiBroadcastFloat32(default_float);
        result[2] = GiBroadcastFloat32(default_float);
        result[3] = GiBroadcastFloat32(default_float);
        // One filter row at a time: load the row's vectors, fold them in.
        for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
            load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>(
                    src, src_ptr + fh_idx * iw * packed_ic, 0);
            calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::MAX>(
                    result, src);
        }
        GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]);
        GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]);
        GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]);
        GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]);
    }
};
//! AVERAGE-mode kernel: sums the window then multiplies by 1/(filter*filter).
//! Valid only for full (unclipped) windows — the border path handles clipped
//! windows separately. Same ow_step == 4 hard-coding as the MAX kernel.
template <int filter, int stride, int ow_step>
struct KerPoolingFilterXStrideXNchw44<
        filter, stride, ow_step, PoolingBase::Mode::AVERAGE> {
    static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) {
        // Vectors needed per input row to produce ow_step outputs.
        constexpr int src_reg_size = ow_step * stride + filter - stride;
        constexpr int packed_ic = 4;
        constexpr int simd_len = 4;
        constexpr float default_float = 0;
        constexpr float div_filter_size = 1.f / (filter * filter);
        const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size);
        GI_FLOAT32_t result[ow_step];
        GI_FLOAT32_t src[src_reg_size];
        result[0] = GiBroadcastFloat32(default_float);
        result[1] = GiBroadcastFloat32(default_float);
        result[2] = GiBroadcastFloat32(default_float);
        result[3] = GiBroadcastFloat32(default_float);
        for (int fh_idx = 0; fh_idx < filter; ++fh_idx) {
            load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>(
                    src, src_ptr + fh_idx * iw * packed_ic, 0);
            calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::AVERAGE>(
                    result, src);
        }
        // Convert the accumulated sums into averages.
        result[0] = GiMultiplyFloat32(result[0], div_filter_size_vec);
        result[1] = GiMultiplyFloat32(result[1], div_filter_size_vec);
        result[2] = GiMultiplyFloat32(result[2], div_filter_size_vec);
        result[3] = GiMultiplyFloat32(result[3], div_filter_size_vec);
        GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]);
        GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]);
        GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]);
        GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]);
    }
};
template <PoolingBase::Mode mode>
void ker_pooling_nchw44_remain_pad(
const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
const int pad_bottom, const int pad_left, const int pad_right,
const int filter);
//! MAX pooling of a single NCHW44 output element whose window may be clipped
//! by padding: scans only the in-bounds taps of the filter window.
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::MAX>(
        const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
        const int pad_bottom, const int pad_left, const int pad_right,
        const int filter) {
    constexpr int ic_step = 4;  // one NCHW44 pack = 4 channels
    const int row_end = filter - pad_bottom;
    const int col_end = filter - pad_right;
    // Accumulate the max over every in-bounds tap of the window.
    GI_FLOAT32_t acc = GiBroadcastFloat32(std::numeric_limits<float>::lowest());
    const float32_t* row_ptr = src_ptr;
    for (int row = pad_top; row < row_end; ++row, row_ptr += iw * ic_step) {
        for (int col = pad_left; col < col_end; ++col) {
            acc = GiMaximumFloat32(
                    acc, GiLoadFloat32(row_ptr + (col - pad_left) * ic_step));
        }
    }
    GiStoreFloat32(dst_ptr, acc);
}
//! AVERAGE pooling of a single padded NCHW44 output element. The divisor is
//! always filter*filter even when the window is clipped — i.e. padded zeros
//! are counted in the average; presumably intentional for this AVERAGE mode,
//! confirm against the mode's documented semantics.
template <>
void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::AVERAGE>(
        const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top,
        const int pad_bottom, const int pad_left, const int pad_right,
        const int filter) {
    constexpr int ic_step = 4;
    const int ih_end = filter - pad_bottom;
    const int iw_end = filter - pad_right;
    const float div_filter_size = 1.f / (filter * filter);
    const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size);
    GI_FLOAT32_t result = GiBroadcastFloat32(0.f);
    // Sum only the in-bounds taps; clipped taps contribute nothing.
    for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) {
        for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) {
            GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step);
            result = GiAddFloat32(result, src);
        }
        src_ptr += iw * ic_step;
    }
    result = GiMultiplyFloat32(result, div_filter_size_vec);
    GiStoreFloat32(dst_ptr, result);
}
template <PoolingBase::Mode mode>
static inline void kern_pooling_with_pad_nchw44(
const float32_t* src, float32_t* dst, const int filter, const int ow_start,
const int ow_end, const int iw, const int ow, const int stride_w, const int pw,
const int real_ih_idx, const int oh_idx, const int pad_top,
const int pad_bottom) {
constexpr int ic_step = 4;
constexpr int oc_step = 4;
for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) {
const int iw_idx = ow_idx * stride_w;
const int real_iw_idx = std::max(iw_idx - pw, 0);
const int pad_left = std::max(0, pw - iw_idx);
const int pad_right = std::max(0, iw_idx - pw + filter - iw);
const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step;
const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
ker_pooling_nchw44_remain_pad<mode>(
src + src_offset, dst + dst_offset, iw, pad_top, pad_bottom, pad_left,
pad_right, filter);
}
}
//! Pool one whole output plane when padding may be present.  Rows whose
//! windows touch the top/bottom border use the scalar padded-window kernel for
//! the entire row; interior rows use it only for the left/right border
//! columns and run the vectorized ow_step-wide kernel in between.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_pad(
        const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph,
        int pw) {
    constexpr int stride_h = stride;
    constexpr int stride_w = stride;
    constexpr int ic_step = 4;
    constexpr int oc_step = 4;
    constexpr int ow_step = 4;
    //! First output column whose window no longer crosses the left padding.
    const int ow_pad_left_end = div_ceil(pw, stride_w);
    //! Bound of output columns whose window stays clear of the right padding.
    const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w;
    //! Pad-free middle section rounded down to a multiple of ow_step.
    const int ow_pad_right_step_end =
            (ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step + ow_pad_left_end;
    rep(oh_idx, oh) {
        const int ih_idx = oh_idx * stride_h;
        //! Clamp the window's top row into the input and record the clipping.
        const int real_ih_idx = std::max(ih_idx - ph, 0);
        const int pad_top = std::max(0, ph - ih_idx);
        const int pad_bottom = std::max(0, ih_idx - ph + filter - ih);
        if (pad_top > 0 || pad_bottom > 0) {
            //! Row intersects vertical padding: padded-window path everywhere.
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, 0, ow, iw, ow, stride_w, pw, real_ih_idx, oh_idx,
                    pad_top, pad_bottom);
        } else {
            //! Left border columns.
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw,
                    real_ih_idx, oh_idx, pad_top, pad_bottom);
            //! Pad-free middle: vectorized kernel, ow_step outputs per call.
            for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end;
                 ow_idx += ow_step) {
                const int iw_idx = ow_idx * stride_w;
                const int real_iw_idx = std::max(iw_idx - pw, 0);
                const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step;
                const int dst_offset = (oh_idx * ow + ow_idx) * oc_step;
                KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl(
                        src + src_offset, dst + dst_offset, iw);
            }
            //! Remaining columns: middle remainder plus the right border.
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, ow_pad_right_step_end, ow, iw, ow, stride_w, pw,
                    real_ih_idx, oh_idx, pad_top, pad_bottom);
        }
    }
}
//! Pool one output plane when there is no padding: the bulk of each row uses
//! the vectorized ow_step-wide kernel, the tail columns reuse the generic
//! window kernel with all pad arguments set to zero.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44_no_pad(
        const float32_t* src, float32_t* dst, int, int iw, int oh, int ow) {
    constexpr int stride_h = stride;
    constexpr int stride_w = stride;
    constexpr int ic_step = 4;
    constexpr int oc_step = 4;
    constexpr int ow_step = 4;
    //! Largest multiple of ow_step that fits in ow; the rest is the tail.
    const int ow_end = ow / ow_step * ow_step;
    const int ow_remain = ow - ow_end;
    rep(oh_idx, oh) {
        const int ih_idx = oh_idx * stride_h;
        const int src_ih_offset = ih_idx * iw;
        const int dst_oh_offset = oh_idx * ow;
        for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) {
            const int iw_idx = ow_idx * stride_w;
            const int src_offset = (src_ih_offset + iw_idx) * ic_step;
            const int dst_offset = (dst_oh_offset + ow_idx) * oc_step;
            KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl(
                    src + src_offset, dst + dst_offset, iw);
        }
        if (ow_remain > 0) {
            //! Tail columns; no clipping happens since there is no padding.
            kern_pooling_with_pad_nchw44<mode>(
                    src, dst, filter, ow_end, ow, iw, ow, stride_w, 0, ih_idx, oh_idx,
                    0, 0);
        }
    }
}
//! Entry point for fp32 NCHW44 pooling: dispatch between the padded and the
//! pad-free implementation.
template <int filter, int stride, PoolingBase::Mode mode>
static inline void pooling_fp32_nchw44(
        const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph,
        int pw) {
    if (ph <= 0 && pw <= 0) {
        pooling_fp32_nchw44_no_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow);
    } else {
        pooling_fp32_nchw44_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow, ph, pw);
    }
}
} // namespace
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
此差异已折叠。
......@@ -6,18 +6,186 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
*/
#include "src/fallback/pooling/opr_impl.h"
#include <cstring>
#include "src/common/utils.h"
#include "src/naive/handle.h"
#include "src/common/algo_chooser.h"
#include "src/common/metahelper.h"
#include "src/fallback/pooling/gi/algo.h"
#include "midout.h"
MIDOUT_DECL(megdnn_fallback_pooling)
using namespace megdnn;
using namespace fallback;
//! Registry of all fallback pooling algorithms.  The heuristic picks the
//! first usable algorithm in registration order, so the order below matters.
class PoolingImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;
    AlgoGiFilterxModexStride1 algo_gi_filterx_modex_stride1;
    AlgoGiFilter2ModexStride2 algo_gi_filter2_modex_stride2;
    AlgoGiFilter3MaxStride2 algo_gi_filter3_max_stride2;
    AlgoGiFilter3AverageStride2 algo_gi_filter3_average_stride2;
    AlgoGiFilter4MaxStride2 algo_gi_filter4_max_stride2;
    AlgoGiFilter5MaxStride2 algo_gi_filter5_max_stride2;
    AlgoGiFp32ModexStridexNCHW44 algo_gi_fp32_modex_stridex_nchw44;
    AlgoFallback algo_fallback;

public:
    AlgoPack() {
        //! Register every algorithm and index it by its descriptor in one pass.
        AlgoBase* const candidates[] = {
                &algo_gi_filterx_modex_stride1,
                &algo_gi_filter2_modex_stride2,
                &algo_gi_filter3_max_stride2,
                &algo_gi_filter3_average_stride2,
                &algo_gi_filter4_max_stride2,
                &algo_gi_filter5_max_stride2,
                &algo_gi_fp32_modex_stridex_nchw44,
                &algo_fallback};
        for (AlgoBase* algo : candidates) {
            all_algos.emplace_back(algo);
            m_all_algos_map.emplace(algo->info().desc, algo);
        }
    }
    SmallVector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;
//! Flatten the operator's param and the src/dst layouts into the
//! PoolingKernSizeParam consumed by the algo implementations.
PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param(
        fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) {
    //! All fields are stored as uint32_t; reject anything that would truncate.
    auto checked = [](size_t v) -> uint32_t {
        megdnn_assert(
                v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v);
        return static_cast<uint32_t>(v);
    };
    const auto& p = opr->param();
    return {checked(src.shape[0]),
            checked(src.shape[1]),
            {{checked(src.shape[2]), checked(src.shape[3])}},
            {{checked(dst.shape[2]), checked(dst.shape[3])}},
            {{checked(p.pad_h), checked(p.pad_w)}},
            {{checked(p.window_h), checked(p.window_w)}},
            {{checked(p.stride_h), checked(p.stride_w)}},
            src.dtype,
            dst.dtype,
            opr->handle(),
            p.format,
            p.mode};
}
//! Extend the size param with the tensor and workspace pointers needed at
//! execution time.
PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param(
        fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
        _megdnn_workspace workspace) {
    PoolingKernParam kern_param;
    //! Fill the PoolingKernSizeParam base slice first, then the runtime state.
    static_cast<PoolingKernSizeParam&>(kern_param) =
            make_pooling_kern_szie_param(opr, src.layout, dst.layout);
    kern_param.src_ptr = src.get_ref_ptr();
    kern_param.dst_ptr = dst.get_ref_ptr();
    kern_param.workspace_ptr = workspace.raw_ptr;
    kern_param.workspace_size = workspace.size;
    return kern_param;
}
MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl);
std::vector<Algorithm*> PoolingImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) {
auto param = make_pooling_kern_szie_param(this, src, dst);
std::vector<Algorithm*> ret;
ret.reserve(algo_pack().all_algos.size());
for (auto i : algo_pack().all_algos) {
if (i->usable(param)) {
ret.push_back(i);
}
}
return ret;
}
//! Query the workspace size for this problem.  A cached algorithm-policy hit
//! returns the cached size directly; otherwise the heuristic is run, and GI
//! algorithms compute a per-thread bundle for the shapes that need a
//! temporary buffer while the non-GI path defers to the naive impl.
size_t PoolingImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& dst) {
    TensorLayoutArray layouts{src, dst};
    AlgorithmCache::Key key{this->handle(), this->get_opr_type(),
                            layouts.data(), layouts.size(),
                            &this->param(), sizeof(this->param())};
    auto rst = AlgorithmCache::instance().get(key);
    if (rst.policy.algo.valid()) {
        return rst.workspace;
    }
    auto param = make_pooling_kern_szie_param(this, src, dst);
    auto algo = static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic(
            src, dst, std::numeric_limits<size_t>::max(), AlgoAttribute::DEFAULT,
            AlgoAttribute::DEFAULT));
    if (!is_fallback_non_gi_algo(algo)) {
        size_t fallback_gi_workspace = 0;
        //! When multi-thread, every thread has its own workspace
        size_t nr_threads = static_cast<naive::HandleImpl*>(handle())
                                    ->megcore_dispatcher()
                                    ->nr_threads();
        //! Only the float 3x3/5x5 stride-2 NCHW kernels (max, or 3x3 average)
        //! allocate a temporary bundle; all other GI cases need no workspace.
        if (param.src_type.category() == DTypeCategory::FLOAT &&
            param.filter[0] == param.filter[1] &&
            (param.filter[0] == 3 || param.filter[0] == 5) &&
            param.format == Param::Format::NCHW &&
            (param.mode == Mode::MAX ||
             (param.mode == Mode::AVERAGE && param.filter[0] == 3)) &&
            param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 &&
            param.isz[1] >= 2) {
            WorkspaceBundle ws = get_bundle(param);
            fallback_gi_workspace = ws.total_size_in_bytes() * nr_threads;
        }
        return fallback_gi_workspace;
    } else {
        //! non-GI algorithm: the legacy path reuses the naive impl's
        //! workspace requirement.
        auto naive_worksapce =
                naive::PoolingForwardImpl::get_workspace_in_bytes(src, dst);
        return naive_worksapce;
    }
}
//! Execute pooling: pick an algorithm heuristically; GI algorithms run
//! through the algo framework, the non-GI choice takes the legacy
//! exec_fallback() path.
void PoolingImpl::exec(
        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    auto kern_param = make_pooling_kern_param(this, src, dst, workspace);
    auto* chosen =
            static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic(
                    src.layout, dst.layout, std::numeric_limits<size_t>::max(),
                    AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT));
    if (is_fallback_non_gi_algo(chosen)) {
        exec_fallback(src, dst, workspace);
    } else {
        chosen->exec(kern_param);
    }
}
//! Same as get_all_algorithms(), but an empty result set is a hard error.
std::vector<Algorithm*> PoolingImpl::get_all_algorithms_safe(
        const TensorLayout& src, const TensorLayout& dst) {
    auto algos = get_all_algorithms(src, dst);
    megdnn_assert(!algos.empty(), "no usable pooling fwd algorithm");
    return algos;
}
//! Return the first registered algorithm that is usable for this problem and
//! satisfies the attribute constraints; throws when none matches.  The
//! workspace limit is ignored by this backend.
Algorithm* PoolingImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& dst,
        size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
    const auto kern_size_param = make_pooling_kern_szie_param(this, src, dst);
    for (AlgoBase* candidate : sm_algo_pack.all_algos) {
        if (candidate->is_available_attribute(
                    kern_size_param, positive_attr, negative_attr)) {
            return candidate;
        }
    }
    megdnn_throw(ssprintf(
            "require algorithm with attribute(%s) and without "
            "attribute(%s), but can't get suitable algo.\n",
            Algorithm::attribute_str(positive_attr).c_str(),
            Algorithm::attribute_str(negative_attr).c_str()));
    return nullptr;
}
//! fallback not gi imp
namespace megdnn {
namespace fallback {
namespace pooling {
......@@ -140,9 +308,6 @@ void w2x2_s2x2_avg_int8(
} // namespace fallback
} // namespace megdnn
namespace megdnn {
namespace fallback {
void PoolingImpl::exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param) {
auto N = src.layout.shape[0], C = src.layout.shape[1];
......@@ -179,7 +344,7 @@ void PoolingImpl::exec_w2x2_s2x2_avg_int8(
}
}
void PoolingImpl::exec(
void PoolingImpl::exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
Param param = this->param();
check_exec(src.layout, dst.layout, workspace.size);
......@@ -219,7 +384,4 @@ void PoolingImpl::exec(
naive::PoolingForwardImpl::exec(src, dst, workspace);
}
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -10,6 +10,7 @@
* implied.
*/
#pragma once
#include <unordered_map>
#include "megdnn/oprs/base.h"
#include "src/naive/pooling/opr_impl.h"
......@@ -17,19 +18,143 @@ namespace megdnn {
namespace fallback {
class PoolingImpl : public naive::PoolingForwardImpl {
private:
class AlgoGiFilterxModexStride1;
class AlgoGiFilter2ModexStride2;
class AlgoGiFilter3MaxStride2;
class AlgoGiFilter3AverageStride2;
class AlgoGiFilter4MaxStride2;
class AlgoGiFilter5MaxStride2;
class AlgoGiFp32ModexStridexNCHW44;
class AlgoFallback;
class AlgoPack;
static AlgoPack sm_algo_pack;
void exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param);
void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
public:
using naive::PoolingForwardImpl::PoolingForwardImpl;
using Param = param::Pooling;
void exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
private:
void exec_w3x3_s1x1(
_megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param);
void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst);
void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace);
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override;
static size_t constexpr MAX_SPATIAL_DIM = 2;
struct PoolingKernSizeParam {
uint32_t n, ic;
std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
std::array<uint32_t, MAX_SPATIAL_DIM> padding, filter, stride;
DType src_type, dst_type;
Handle* handle;
Param::Format format;
Mode mode;
};
struct PoolingKernParam : public PoolingKernSizeParam {
RefPtr src_ptr;
RefPtr dst_ptr;
void* workspace_ptr;
size_t workspace_size;
template <typename T>
const T* src() const {
src_type.assert_is_compatible_ctype<T>();
return static_cast<const T*>(src_ptr.get_ptr());
}
template <typename T>
T* dst() const {
dst_type.assert_is_compatible_ctype<T>();
return static_cast<T*>(dst_ptr.get_ptr());
}
template <typename T>
T* workspace() const {
return static_cast<T*>(workspace_ptr);
}
};
PoolingKernSizeParam make_pooling_kern_szie_param(
fallback::PoolingImpl* opr, const TensorLayout& src,
const TensorLayout& dst);
PoolingKernParam make_pooling_kern_param(
fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
class AlgoBase : public detail::Algorithm {
public:
enum class AlgoType : uint32_t {
GI_FilterxModexStride1,
GI_Filter2ModexStride2,
GI_Filter3MaxStride2,
GI_Filter3AverageStride2,
GI_Filter4MaxStride2,
GI_Filter5MaxStride2,
GI_Filter2ModexStridexNCHW44,
GI_Filter3ModexStridexNCHW44,
GI_Filter4ModexStridexNCHW44,
GI_Filter5ModexStridexNCHW44,
GI_Fp32ModexStridexNCHW44,
FallbackNotGI
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; }
virtual ~AlgoBase() = default;
virtual bool usable(const PoolingKernSizeParam& param) const = 0;
virtual void exec(const PoolingKernParam& param) const = 0;
uint32_t type() const override { return INVALID_ALGO_TYPE; };
bool is_available_attribute(
const PoolingKernSizeParam& param,
const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
return contain_attribute_all(positive_attr) &&
!contain_attribute_any(negative_attr) && usable(param);
}
};
const char* get_algorithm_set_name() const override {
return "FALLBACK_POOLING_FORWARD";
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
std::vector<Algorithm*> get_all_algorithms_safe(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return fallback::PoolingImpl::get_algorithm_heuristic(
src, dst, workspace_limit_in_bytes, positive_attr, negative_attr)
->info();
}
static const AlgoPack& algo_pack() { return sm_algo_pack; }
bool is_fallback_non_gi_algo(Algorithm* algo) {
return strcmp(algo->name(), "FALLBACK_NOT_GI_POOLING") == 0;
}
};
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -103,7 +103,9 @@ public:
AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; };
const char* name() const override { return m_algo_name.c_str(); }
bool is_available(const SizeArgs&) const override { return true; }
void exec(const ExecArgs&) const override {}
void exec(const ExecArgs&) const override {
megdnn_assert(false, "code issue happened!!");
}
MEGDNN_DECL_ALGO_TYPE(X86_Fallback)
};
......
......@@ -3161,6 +3161,44 @@ TEST_F(FALLBACK, GiGetHighFloat32) {
ASSERT_EQ(*(r + 1), s0[3]);
}
TEST_F(FALLBACK, GiPaddFloat32) {
    //! pairwise add: ret = {src0[0] + src0[1], src1[0] + src1[1]}
    std::vector<float> s0{1.1f, -3.1415f};
    std::vector<float> s1{2.3f, 3.14777f};
    float32x2_t src0, src1;
    memcpy(&src0, s0.data(), sizeof(float32x2_t));
    memcpy(&src1, s1.data(), sizeof(float32x2_t));
    float32x2_t ret = GiPaddFloat32(src0, src1);

    const float expect0 = s0[0] + s0[1];
    const float expect1 = s1[0] + s1[1];
    auto got = (float*)&ret;
    ASSERT_LT(std::abs(expect0 - got[0]), 1e-3);
    ASSERT_LT(std::abs(expect1 - got[1]), 1e-3);
}
TEST_F(FALLBACK, GiPmaxFloat32) {
    //! pairwise max: ret = {max(src0[0], src0[1]), max(src1[0], src1[1])}
    std::vector<float> s0{1.1f, -3.1415f};
    std::vector<float> s1{2.3f, 3.14777f};
    float32x2_t src0, src1;
    memcpy(&src0, s0.data(), sizeof(float32x2_t));
    memcpy(&src1, s1.data(), sizeof(float32x2_t));
    float32x2_t ret = GiPmaxFloat32(src0, src1);

    //! MAX_NAN mirrors the intrinsic's NaN handling in the reference value.
    const float expect0 = MAX_NAN(s0[0], s0[1]);
    const float expect1 = MAX_NAN(s1[0], s1[1]);
    auto got = (float*)&ret;
    ASSERT_LT(std::abs(expect0 - got[0]), 1e-3);
    ASSERT_LT(std::abs(expect1 - got[1]), 1e-3);
}
} // namespace test
} // namespace megdnn
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册