Commit f12b75c0 authored by: Megvii Engine Team

perf(dnn/fallback): optimize some corner cases in reduce

GitOrigin-RevId: 1185594301dfb58d07b8b92212a8c7881fb204de
Parent 35cf0422
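The key change: when the reduced axis is innermost (C == 1) and its length B is 2, 3, or 4, the kernel now loads B interleaved vectors with a single de-interleaving load (vld2q/vld3q/vld4q on NEON, GiLoadUzip*V{2,3,4} in the fallback GI layer) and feeds each component vector to the reducer, producing SIMD_WIDTH outputs per iteration instead of falling back to per-element accumulation. Below is a minimal sketch of that idea for float32 and B == 2, assuming ARM NEON is available; the function name and the plain sum reduction are illustrative, not the library's API.

#include <arm_neon.h>
#include <cstddef>

// Sum-reduce a row-major [A, 2] float32 tensor along its last axis.
void reduce_sum_last_axis_b2(const float* src, float* dst, size_t A) {
    size_t a = 0;
    for (; a + 4 <= A; a += 4) {
        // vld2q_f32 de-interleaves 8 consecutive floats into two vectors:
        // val[0] holds column 0 of four rows, val[1] holds column 1.
        float32x4x2_t v = vld2q_f32(src + a * 2);
        vst1q_f32(dst + a, vaddq_f32(v.val[0], v.val[1]));
    }
    for (; a < A; ++a)  // scalar tail for the remaining rows
        dst[a] = src[a * 2] + src[a * 2 + 1];
}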
......@@ -97,6 +97,7 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
}
MeanReducer() = default;
void feed(const ctype* val) { res = vaddq_f16(vld1q_f16(val), res); }
void feed_vector(const float16x8_t& vval) { res = vaddq_f16(vval, res); }
void feed_remain(const ctype* val) { remain += *val; }
void post(ctype* dst) {
res = vmulq_n_f16(res, coef);
......@@ -127,8 +128,8 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
vcoef = vdupq_n_f32(coef);
}
MeanReducer() = default;
void feed(const uint8_t* val) {
const uint8x16_t vval = vld1q_u8(val);
void feed(const uint8_t* val) { feed_vector(vld1q_u8(val)); }
void feed_vector(const uint8x16_t& vval) {
const uint16x8_t vval_low = vmovl_u8(vget_low_u8(vval));
const uint16x8_t vval_high = vmovl_u8(vget_high_u8(vval));
......@@ -219,8 +220,8 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
remain = vdupq_n_##_stype(_init); \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype##8x16_t vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void inline feed_vector(const __stype##8x16_t & vval) { \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -286,8 +287,8 @@ REDUCER_MAX_MIN_C1(
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void inline feed_vector(const __stype& vval) { \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -326,8 +327,8 @@ struct ProductReducer;
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
void feed(const ctype* val) { feed_vector(vld1q_##_stype(val)); } \
void feed_vector(const __stype& vval) { \
res = v##_act##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
......@@ -435,8 +436,8 @@ struct SumSqrReducer<__fp16, __fp16, __fp16, false> {
fp16_fix_t remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) { res = vdupq_n_f16(0.0f); }
SumSqrReducer() = default;
void feed(const __fp16* val) {
float16x8_t vval = vld1q_f16(val);
void feed(const __fp16* val) { feed_vector(vld1q_f16(val)); }
void inline feed_vector(const float16x8_t& vval) {
res = vaddq_f16(vmulq_f16(vval, vval), res);
}
void feed_remain(const __fp16* val) { remain += (*val) * (*val); }
......@@ -504,6 +505,60 @@ struct Exec<Reducer, false> {
}
};
template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
static void do_reduce(
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
};
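// For C == 1 and a small reduce length B in {2, 3, 4}: load B interleaved
// vectors at once with vld2q/vld3q/vld4q, feed each de-interleaved component
// to the reducer, and emit SIMD_WIDTH results per iteration; leftover rows
// go through the scalar feed_remain path.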
#define ImplementC1SmallB(_ctype, _simd_prefix, _simd_suffix) \
template <typename Reducer, size_t B> \
struct ExecC1SmallB<Reducer, _ctype, B> { \
static void do_reduce( \
const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
size_t) { \
size_t a = 0; \
for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
if (B == 4) { \
_simd_prefix##x4_t data_v4 = vld4q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v4.val[0]); \
reducer.feed_vector(data_v4.val[1]); \
reducer.feed_vector(data_v4.val[2]); \
reducer.feed_vector(data_v4.val[3]); \
} \
if (B == 3) { \
_simd_prefix##x3_t data_v3 = vld3q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v3.val[0]); \
reducer.feed_vector(data_v3.val[1]); \
reducer.feed_vector(data_v3.val[2]); \
} \
if (B == 2) { \
_simd_prefix##x2_t data_v2 = vld2q_##_simd_suffix(src_ptr); \
reducer.feed_vector(data_v2.val[0]); \
reducer.feed_vector(data_v2.val[1]); \
} \
reducer.post(dst); \
dst += Reducer::SIMD_WIDTH; \
} \
for (; a < A; a++) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
for (size_t i = 0; i < B; i++) \
reducer.feed_remain(src_ptr + i); \
reducer.post_remain(dst); \
dst++; \
} \
} \
}
ImplementC1SmallB(uint8_t, uint8x16, u8);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
ImplementC1SmallB(__fp16, float16x8, f16);
#endif
} // anonymous namespace
void ReduceImpl::exec(
......@@ -513,31 +568,40 @@ void ReduceImpl::exec(
reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false;
using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
#define DISPATCH_FUNC(Reducer, _dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<_dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<_dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (src.layout.dtype.category() != DTypeCategory::FLOAT) { \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
} \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<_dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_arm_common_reduce, ctype, _dtype, comp_type, midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
}
#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \
......
......@@ -36,7 +36,7 @@ void conv_stride2::do_conv_2x2_stride2(
rep(i, nn) {
GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7
......@@ -44,7 +44,7 @@ void conv_stride2::do_conv_2x2_stride2(
_outp = GiSimdFmaLane(_outp, _r00, _k0123, 0);
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
......@@ -94,8 +94,8 @@ void conv_stride2::do_conv_3x3_stride2(
rep(i, nn) {
GI_FLOAT32_t _outp = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r0 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r0n = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r0 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r0n = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r00 = GiGetSubVectorFloat32V2(_r0, 0); // 0 2 4 6
GI_FLOAT32_t _r01 = GiGetSubVectorFloat32V2(_r0, 1); // 1 3 5 7
......@@ -106,8 +106,8 @@ void conv_stride2::do_conv_3x3_stride2(
_outp = GiSimdFmaLane(_outp, _r01, _k0123, 1);
_outp = GiSimdFmaLane(_outp, _r02, _k0123, 2);
GI_FLOAT32_V2_t _r1 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r1n = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r1 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r1n = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r1, 0);
GI_FLOAT32_t _r11 = GiGetSubVectorFloat32V2(_r1, 1);
......@@ -118,8 +118,8 @@ void conv_stride2::do_conv_3x3_stride2(
_outp = GiSimdFmaLane(_outp, _r11, _k3456, 1);
_outp = GiSimdFmaLane(_outp, _r12, _k3456, 2);
GI_FLOAT32_V2_t _r2 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r2n = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r2 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r2n = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r2, 0);
GI_FLOAT32_t _r21 = GiGetSubVectorFloat32V2(_r2, 1);
......@@ -176,8 +176,8 @@ void conv_stride2::do_conv_5x5_stride2(
rep(i, nn) {
GI_FLOAT32_t _sum = GiLoadFloat32(outptr);
GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r0_8101214 =
GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14
GI_FLOAT32_t _r0_9111315 =
......@@ -190,8 +190,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r03 = GiExtqFloat32(_r01, _r0_9111315, 1); // 3 5 7 9
GI_FLOAT32_t _r04 = GiExtqFloat32(_r00, _r0_8101214, 2); // 4 6 8 10
GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
......@@ -200,8 +200,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r13 = GiExtqFloat32(_r11, _r1_9111315, 1);
GI_FLOAT32_t _r14 = GiExtqFloat32(_r10, _r1_8101214, 2);
GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
......@@ -210,8 +210,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r23 = GiExtqFloat32(_r21, _r2_9111315, 1);
GI_FLOAT32_t _r24 = GiExtqFloat32(_r20, _r2_8101214, 2);
GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
......@@ -220,8 +220,8 @@ void conv_stride2::do_conv_5x5_stride2(
GI_FLOAT32_t _r33 = GiExtqFloat32(_r31, _r3_9111315, 1);
GI_FLOAT32_t _r34 = GiExtqFloat32(_r30, _r3_8101214, 2);
GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
......@@ -315,8 +315,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k0123 = GiLoadFloat32(k0);
GI_FLOAT32_t _k4567 = GiLoadFloat32(k0 + 4);
GI_FLOAT32_V2_t _r00_02461357 = GiLd2qFloat32(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLd2qFloat32(r0 + 8);
GI_FLOAT32_V2_t _r00_02461357 = GiLoadUzipFloat32V2(r0);
GI_FLOAT32_V2_t _r00nx2 = GiLoadUzipFloat32V2(r0 + 8);
GI_FLOAT32_t _r0_8101214 =
GiGetSubVectorFloat32V2(_r00nx2, 0); // 8 10 12 14
GI_FLOAT32_t _r0_9111315 =
......@@ -342,8 +342,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k78910 = GiLoadFloat32(k1);
GI_FLOAT32_t _k11121314 = GiLoadFloat32(k1 + 4);
GI_FLOAT32_V2_t _r10_02461357 = GiLd2qFloat32(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLd2qFloat32(r1 + 8);
GI_FLOAT32_V2_t _r10_02461357 = GiLoadUzipFloat32V2(r1);
GI_FLOAT32_V2_t _r10nx2 = GiLoadUzipFloat32V2(r1 + 8);
GI_FLOAT32_t _r1_8101214 = GiGetSubVectorFloat32V2(_r10nx2, 0);
GI_FLOAT32_t _r1_9111315 = GiGetSubVectorFloat32V2(_r10nx2, 1);
GI_FLOAT32_t _r10 = GiGetSubVectorFloat32V2(_r10_02461357, 0);
......@@ -365,8 +365,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k14151617 = GiLoadFloat32(k2);
GI_FLOAT32_t _k18192021 = GiLoadFloat32(k2 + 4);
GI_FLOAT32_V2_t _r20_02461357 = GiLd2qFloat32(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLd2qFloat32(r2 + 8);
GI_FLOAT32_V2_t _r20_02461357 = GiLoadUzipFloat32V2(r2);
GI_FLOAT32_V2_t _r20nx2 = GiLoadUzipFloat32V2(r2 + 8);
GI_FLOAT32_t _r2_8101214 = GiGetSubVectorFloat32V2(_r20nx2, 0);
GI_FLOAT32_t _r2_9111315 = GiGetSubVectorFloat32V2(_r20nx2, 1);
GI_FLOAT32_t _r20 = GiGetSubVectorFloat32V2(_r20_02461357, 0);
......@@ -388,8 +388,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k21222324 = GiLoadFloat32(k3);
GI_FLOAT32_t _k25262728 = GiLoadFloat32(k3 + 4);
GI_FLOAT32_V2_t _r30_02461357 = GiLd2qFloat32(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLd2qFloat32(r3 + 8);
GI_FLOAT32_V2_t _r30_02461357 = GiLoadUzipFloat32V2(r3);
GI_FLOAT32_V2_t _r30nx2 = GiLoadUzipFloat32V2(r3 + 8);
GI_FLOAT32_t _r3_8101214 = GiGetSubVectorFloat32V2(_r30nx2, 0);
GI_FLOAT32_t _r3_9111315 = GiGetSubVectorFloat32V2(_r30nx2, 1);
GI_FLOAT32_t _r30 = GiGetSubVectorFloat32V2(_r30_02461357, 0);
......@@ -411,8 +411,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k28293031 = GiLoadFloat32(k4);
GI_FLOAT32_t _k32333435 = GiLoadFloat32(k4 + 4);
GI_FLOAT32_V2_t _r40_02461357 = GiLd2qFloat32(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLd2qFloat32(r4 + 8);
GI_FLOAT32_V2_t _r40_02461357 = GiLoadUzipFloat32V2(r4);
GI_FLOAT32_V2_t _r40nx2 = GiLoadUzipFloat32V2(r4 + 8);
GI_FLOAT32_t _r4_8101214 = GiGetSubVectorFloat32V2(_r40nx2, 0);
GI_FLOAT32_t _r4_9111315 = GiGetSubVectorFloat32V2(_r40nx2, 1);
GI_FLOAT32_t _r40 = GiGetSubVectorFloat32V2(_r40_02461357, 0);
......@@ -434,8 +434,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k35363738 = GiLoadFloat32(k5);
GI_FLOAT32_t _k39404142 = GiLoadFloat32(k5 + 4);
GI_FLOAT32_V2_t _r50_02461357 = GiLd2qFloat32(r5);
GI_FLOAT32_V2_t _r50nx2 = GiLd2qFloat32(r5 + 8);
GI_FLOAT32_V2_t _r50_02461357 = GiLoadUzipFloat32V2(r5);
GI_FLOAT32_V2_t _r50nx2 = GiLoadUzipFloat32V2(r5 + 8);
GI_FLOAT32_t _r5_8101214 = GiGetSubVectorFloat32V2(_r50nx2, 0);
GI_FLOAT32_t _r5_9111315 = GiGetSubVectorFloat32V2(_r50nx2, 1);
GI_FLOAT32_t _r50 = GiGetSubVectorFloat32V2(_r50_02461357, 0);
......@@ -457,8 +457,8 @@ void conv_stride2::do_conv_7x7_stride2(
GI_FLOAT32_t _k42434445 = GiLoadFloat32(k6);
GI_FLOAT32_t _k45464748 = GiLoadFloat32(k6 + 3);
GI_FLOAT32_V2_t _r60_02461357 = GiLd2qFloat32(r6);
GI_FLOAT32_V2_t _r60nx2 = GiLd2qFloat32(r6 + 8);
GI_FLOAT32_V2_t _r60_02461357 = GiLoadUzipFloat32V2(r6);
GI_FLOAT32_V2_t _r60nx2 = GiLoadUzipFloat32V2(r6 + 8);
GI_FLOAT32_t _r6_8101214 = GiGetSubVectorFloat32V2(_r60nx2, 0);
GI_FLOAT32_t _r6_9111315 = GiGetSubVectorFloat32V2(_r60nx2, 1);
GI_FLOAT32_t _r60 = GiGetSubVectorFloat32V2(_r60_02461357, 0);
......
......@@ -151,6 +151,8 @@ typedef int32x4x2_t GI_INT32_V2_t;
typedef int32x4x4_t GI_INT32_V4_t;
typedef int16x8x2_t GI_INT16_V2_t;
typedef int8x16x2_t GI_INT8_V2_t;
typedef int8x16x3_t GI_INT8_V3_t;
typedef int8x16x4_t GI_INT8_V4_t;
typedef int64x2_t GI_INT64_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
......@@ -302,6 +304,8 @@ typedef vint32m1x2_t GI_INT32_V2_t;
typedef vint32m1x4_t GI_INT32_V4_t;
typedef vint16m1x2_t GI_INT16_V2_t;
typedef vint8m1x2_t GI_INT8_V2_t;
typedef vint8m1x3_t GI_INT8_V3_t;
typedef vint8m1x4_t GI_INT8_V4_t;
//! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as
//! a workaround, we use vfloat32m1_t instead
typedef vfloat32m1_t float32x2_t;
......@@ -390,6 +394,14 @@ typedef struct {
GI_INT8_t val[2];
} GI_INT8_V2_t;
typedef struct {
GI_INT8_t val[3];
} GI_INT8_V3_t;
typedef struct {
GI_INT8_t val[4];
} GI_INT8_V4_t;
#endif
//! variable length type intrinsic can not be a member of c++ class
//! caused by can not do sizeof at build stage, for example RVV and SVE
......@@ -417,6 +429,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index)
#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index)
#define GiGetSubVectorInt8V3(s, index) vget_i8m1x3_i8m1(s, index)
#define GiGetSubVectorInt8V4(s, index) vget_i8m1x4_i8m1(s, index)
//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s)
......@@ -570,6 +584,8 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorInt16V2(s, index) s.val[index]
#define GiGetSubVectorInt8V2(s, index) s.val[index]
#define GiGetSubVectorInt8V3(s, index) s.val[index]
#define GiGetSubVectorInt8V4(s, index) s.val[index]
//! insert subvector
#define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s
......
......@@ -381,31 +381,6 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) {
#endif
}
GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_V2_t v;
v.val[0] = GiLoadFloat32(Buffer);
v.val[1] = GiLoadFloat32((Buffer + 4));
v = GiUzpqFloat32(v.val[0], v.val[1]);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
#else
GI_FLOAT32_V2_t ret;
ret.val[0][0] = Buffer[0];
ret.val[0][1] = Buffer[2];
ret.val[0][2] = Buffer[4];
ret.val[0][3] = Buffer[6];
ret.val[1][0] = Buffer[1];
ret.val[1][1] = Buffer[3];
ret.val[1][2] = Buffer[5];
ret.val[1][3] = Buffer[7];
return ret;
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GiExtqFloat32(a, b, n) vextq_f32(a, b, n)
#elif defined(GI_SSE2_INTRINSICS)
......@@ -1709,6 +1684,31 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
#endif
}
GI_FORCEINLINE GI_FLOAT32_V2_t GiLoadUzipFloat32V2(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_V2_t v;
v.val[0] = GiLoadFloat32(Buffer);
v.val[1] = GiLoadFloat32((Buffer + 4));
v = GiUzpqFloat32(v.val[0], v.val[1]);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
#else
GI_FLOAT32_V2_t ret;
ret.val[0][0] = Buffer[0];
ret.val[0][1] = Buffer[2];
ret.val[0][2] = Buffer[4];
ret.val[0][3] = Buffer[6];
ret.val[1][0] = Buffer[1];
ret.val[1][1] = Buffer[3];
ret.val[1][2] = Buffer[5];
ret.val[1][3] = Buffer[7];
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) {
#if defined(GI_NEON_INTRINSICS)
......
......@@ -86,6 +86,152 @@ GI_INT8_t GiLoadInt8(const void* Buffer) {
#endif
}
GI_FORCEINLINE
GI_INT8_V2_t GiLoadUzipInt8V2(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld2q_s8((int8_t*)Buffer);
#elif defined(GI_SSE42_INTRINSICS)
GI_INT8_t v0, v1;
v0 = _mm_loadu_si128((const __m128i*)Buffer);
v1 = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
GI_INT8_V2_t ret;
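    // Within each 16-byte vector, the shuffle gathers even-indexed bytes into
    // the low half and odd-indexed bytes into the high half; the 64-bit
    // unpacks then join the two halves into the de-interleaved val[0]/val[1].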
v0 = _mm_shuffle_epi8(
v0, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
v1 = _mm_shuffle_epi8(
v1, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
ret.val[0] = _mm_unpacklo_epi64(v0, v1);
ret.val[1] = _mm_unpackhi_epi64(v0, v1);
return ret;
#elif defined(GI_RVV_INTRINSICS)
return vlseg2e8_v_i8m1x2((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
int8_t data[2 * GI_SIMD_LEN_BYTE];
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
data[i] = ptr[2 * i];
data[GI_SIMD_LEN_BYTE + i] = ptr[2 * i + 1];
}
GI_INT8_V2_t ret;
ret.val[0] = GiLoadInt8(data);
ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_V3_t GiLoadUzipInt8V3(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld3q_s8((int8_t*)Buffer);
#elif defined(GI_SSE42_INTRINSICS)
GI_INT8_V3_t v;
__m128i tmp0, tmp1, tmp2, tmp3;
static const int8_t mask8_0[16] = {0, 3, 6, 9, 12, 15, 1, 4,
7, 10, 13, 2, 5, 8, 11, 14};
static const int8_t mask8_1[16] = {2, 5, 8, 11, 14, 0, 3, 6,
9, 12, 15, 1, 4, 7, 10, 13};
static const int8_t mask8_2[16] = {1, 4, 7, 10, 13, 2, 5, 8,
11, 14, 0, 3, 6, 9, 12, 15};
v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0);
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1);
tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2);
tmp3 = _mm_slli_si128(tmp0, 10);
tmp3 = _mm_alignr_epi8(tmp1, tmp3, 10); // a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
tmp3 = _mm_slli_si128(tmp3, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp3, 5); // a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
v.val[0] = _mm_slli_si128(tmp2, 11); // 0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
v.val[0] = _mm_or_si128(v.val[0], tmp3);
tmp3 = _mm_slli_si128(tmp0, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
tmp3 = _mm_srli_si128(tmp3, 11); // a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1, 5); // b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
v.val[1] = _mm_or_si128(v.val[1], tmp3);
v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
v.val[1] = _mm_srli_si128(v.val[1], 5); // a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_srli_si128(tmp2, 5); // c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_slli_si128(tmp3, 11); // 0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
v.val[1] = _mm_or_si128(v.val[1], tmp3);
tmp3 = _mm_srli_si128(tmp2, 10); // c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
tmp3 = _mm_slli_si128(tmp3, 10); // 0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
v.val[2] = _mm_srli_si128(tmp1, 11); // b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
v.val[2] = _mm_slli_si128(v.val[2], 5); // 0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
v.val[2] =
_mm_or_si128(v.val[2], tmp3); // 0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
tmp0 = _mm_srli_si128(tmp0, 11); // a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
v.val[2] = _mm_or_si128(v.val[2], tmp0);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg3e8_v_i8m1x3((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
int8_t data[3 * GI_SIMD_LEN_BYTE];
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
data[i] = ptr[3 * i];
data[GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 1];
data[2 * GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 2];
}
GI_INT8_V3_t ret;
ret.val[0] = GiLoadInt8(data);
ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE);
ret.val[2] = GiLoadInt8(data + 2 * GI_SIMD_LEN_BYTE);
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_V4_t GiLoadUzipInt8V4(const void* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld4q_s8((int8_t*)Buffer);
#elif defined(GI_SSE2_INTRINSICS)
GI_INT8_V4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
v.val[0] = _mm_loadu_si128((const __m128i*)Buffer);
v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16));
v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32));
v.val[3] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 48));
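    // Four unpack passes (8-bit, 8-bit, 32-bit, 8-bit) act as a byte
    // transpose, so val[k] ends up holding every 4th byte starting at offset
    // k, the same layout that vld4q_s8 produces on NEON.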
tmp0 = _mm_unpacklo_epi8(v.val[0], v.val[1]);
tmp1 = _mm_unpacklo_epi8(v.val[2], v.val[3]);
tmp2 = _mm_unpackhi_epi8(v.val[0], v.val[1]);
tmp3 = _mm_unpackhi_epi8(v.val[2], v.val[3]);
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2]);
tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2]);
tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3]);
tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3]);
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2);
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2);
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3);
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3);
return v;
#elif defined(GI_RVV_INTRINSICS)
return vlseg4e8_v_i8m1x4((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t));
#else
GI_INT8_V4_t ret;
const int8_t* ptr = (int8_t*)Buffer;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
ret.val[0][i] = ptr[4 * i];
ret.val[1][i] = ptr[4 * i + 1];
ret.val[2][i] = ptr[4 * i + 2];
ret.val[3][i] = ptr[4 * i + 3];
}
return ret;
#endif
}
GI_FORCEINLINE
void GiStoreInt32(void* Buffer, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
......
......@@ -471,7 +471,7 @@ void do_average_pooling_3x3_s2x2_gi(
int odd_offset = 0, even_offset = 0;
for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) {
auto s0 = GiLd2qFloat32(sptr + iw);
auto s0 = GiLoadUzipFloat32V2(sptr + iw);
GiStoreFloat32(even + even_offset, GiGetSubVectorFloat32V2(s0, 0));
GiStoreFloat32(odd + odd_offset, GiGetSubVectorFloat32V2(s0, 1));
even_offset += MEGDNN_SIMD_WIDTH;
......
......@@ -186,8 +186,15 @@ bool ReduceImpl::exec_optimized(
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
......
......@@ -46,8 +46,8 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef));
}
MeanReducer() = default;
void feed(const int8_t* val) {
const GI_INT8_t vval = GiLoadInt8(val);
void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); }
void feed_vector(const GI_INT8_t vval) {
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
......@@ -121,9 +121,10 @@ struct MeanReducer<dt_float32, float, float, false> {
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
MeanReducer() = default;
void feed(const float* val) {
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
void inline feed_vector(const GI_FLOAT32_t& val) {
res = GiFloat32Type2FixLenType(
GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res)));
GiAddFloat32(val, GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += *val; }
void post(float* dst) {
......@@ -172,31 +173,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \
void inline feed_vector(const GI_FLOAT32_t& val) { \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), val)); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
......@@ -246,10 +247,10 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8_t vval = GiLoadInt8(val); \
void feed(const int8_t* val) { feed_vector(GiLoadInt8(val)); } \
void inline feed_vector(GI_INT8_t val) { \
res = GiInt8Type2FixLenType( \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \
Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), val)); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
......@@ -304,32 +305,32 @@ REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f);
REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
#define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32_FIXLEN_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); } \
void inline feed_vector(GI_FLOAT32_t val) { \
res = GiFloat32Type2FixLenType( \
Gi##_Mode##Float32(val, GiFixLenType2GiFloat32Type(res))); \
} \
void feed_remain(const float* val) { \
using namespace std; \
auto op = _op<float>(); \
remain = op(remain, (*val)); \
} \
void post(float* dst) { \
GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \
} \
void post_remain(float* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f);
......@@ -378,15 +379,16 @@ struct SumSqrReducer<dt_float32, float, float, false> {
res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f));
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32_t vval = GiLoadFloat32(val);
void feed(const float* val) { feed_vector(GiLoadFloat32(val)); }
void inline feed_vector(GI_FLOAT32_t src) {
res = GiFloat32Type2FixLenType(GiAddFloat32(
GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res)));
GiMultiplyFloat32(src, src), GiFixLenType2GiFloat32Type(res)));
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); }
void post_remain(float* dst) { *dst = remain; }
};
/**************************************do reduce*************************/
template <typename Reducer, bool C1>
......@@ -446,6 +448,57 @@ struct Exec<Reducer, false> {
}
};
template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
static void do_reduce(
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
};
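// Fallback counterpart of the small-B path: de-interleave with the
// GiLoadUzip*V{2,3,4} loads and feed each component vector to the reducer,
// emitting SIMD_WIDTH results per pass; leftover rows use the scalar path.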
#define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \
template <typename Reducer, size_t B> \
struct ExecC1SmallB<Reducer, _ctype, B> { \
static void do_reduce( \
const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
size_t) { \
size_t a = 0; \
for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
if (B == 4) { \
GI_##_gi_type##_V4_t data_v4 = GiLoadUzip##_gi_ins##V4(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 1)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 2)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V4(data_v4, 3)); \
} \
if (B == 3) { \
GI_##_gi_type##_V3_t data_v3 = GiLoadUzip##_gi_ins##V3(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 1)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V3(data_v3, 2)); \
} \
if (B == 2) { \
GI_##_gi_type##_V2_t data_v2 = GiLoadUzip##_gi_ins##V2(src_ptr); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 0)); \
reducer.feed_vector(GiGetSubVector##_gi_ins##V2(data_v2, 1)); \
} \
reducer.post(dst); \
dst += Reducer::SIMD_WIDTH; \
} \
for (; a < A; a++) { \
Reducer reducer(src_dtype, B); \
auto src_ptr = src + a * B; \
for (size_t i = 0; i < B; i++) \
reducer.feed_remain(src_ptr + i); \
reducer.post_remain(dst); \
dst++; \
} \
} \
}
ImplementC1SmallB(float, FLOAT32, Float32);
ImplementC1SmallB(int8_t, INT8, Int8);
} // namespace
// vim: syntax=cpp.doxygen
......@@ -565,7 +565,8 @@ struct ResizeAreaFastVec_SIMD_32f {
if (cn == 1) {
for (; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) {
GI_FLOAT32_V2_t v_row0 = GiLd2qFloat32(S0), v_row1 = GiLd2qFloat32(S1);
GI_FLOAT32_V2_t v_row0 = GiLoadUzipFloat32V2(S0),
v_row1 = GiLoadUzipFloat32V2(S1);
GI_FLOAT32_t v_dst0 = GiAddFloat32(
GiGetSubVectorFloat32V2(v_row0, 0),
......
......@@ -29,7 +29,7 @@ TEST_F(ARM_COMMON, REDUCE) {
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -51,7 +51,7 @@ TEST_F(ARM_COMMON, REDUCE) {
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......
......@@ -1356,12 +1356,12 @@ TEST_F(FALLBACK, GiSt1Float32) {
ASSERT_EQ(ret[1], s0[1]);
}
TEST_F(FALLBACK, GiLd2qFloat32) {
TEST_F(FALLBACK, GiLoadUzipFloat32) {
GI_FLOAT32_V2_t ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f};
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2);
ret = GiLd2qFloat32(s0.data());
ret = GiLoadUzipFloat32V2(s0.data());
std::vector<float> naive0;
std::vector<float> naive1;
......@@ -2590,6 +2590,61 @@ TEST_F(FALLBACK, GiLoadInt8) {
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V2) {
std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128,
127, 2, 55, 3, -1, 7, 8, -18, 17, 12, 35,
7, 8, 10, 22, -108, 27, 21, 45, 13, -11};
GI_INT8_V2_t ret;
force_memset_ret((void*)&ret, 2 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V2(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[2 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[2 * i + 1]);
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V3) {
std::vector<int8_t> s0{9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127,
2, 55, 3, -1, 7, 8, -18, 17, 12, 35, 7, 8,
10, 22, -108, 27, 21, 45, 13, -11, 11, 14, -11, 12,
111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10};
GI_INT8_V3_t ret;
force_memset_ret((void*)&ret, 3 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V3(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[3 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[3 * i + 1]);
ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[3 * i + 2]);
}
}
TEST_F(FALLBACK, GiLoadUzipInt8V4) {
std::vector<int8_t> s0{
9, 2, -128, 127, 2, 45, 3, 0, 11, 2, -128, 127, 2, 55, 3, -1,
7, 8, -18, 17, 12, 35, 7, 8, 10, 22, -108, 27, 21, 45, 13, -11,
11, 14, -11, 12, 111, 32, 6, 9, 16, 29, -118, 67, 28, 15, 19, -10,
9, 4, -108, 27, 22, 43, 13, 10, 31, 12, -108, 117, 22, 25, 31, -10,
};
GI_INT8_V4_t ret;
force_memset_ret((void*)&ret, 4 * GI_SIMD_LEN_BYTE);
ret = GiLoadUzipInt8V4(s0.data());
auto p = (int8_t*)&ret;
for (size_t i = 0; i < SIMD_LEN_8; i++) {
ASSERT_EQ(p[i], s0[4 * i]);
ASSERT_EQ(p[SIMD_LEN_8 + i], s0[4 * i + 1]);
ASSERT_EQ(p[2 * SIMD_LEN_8 + i], s0[4 * i + 2]);
ASSERT_EQ(p[3 * SIMD_LEN_8 + i], s0[4 * i + 3]);
}
}
TEST_F(FALLBACK, GiStoreInt32) {
GI_INT32_t src0;
std::vector<int32_t> s0{1, 2, -200, 999};
......
#include "test/fallback/fixture.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"
......@@ -27,9 +28,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
dtype::Float32(), dtype::Float16(), dtype::QuantizedS8(1.3f),
dtype::Quantized8Asymm(1.3f, static_cast<uint8_t>(3))})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t A : {1, 3, 5, 20}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -49,9 +50,9 @@ TEST_F(FALLBACK, REDUCE_FULL) {
for (auto mode : {Mode::SUM, Mode::PRODUCT, Mode::SUM_SQR})
for (auto dtype : std::vector<DType>{dtype::Float32(), dtype::Float16()})
for (int32_t axis : {0, 1, 2}) {
for (size_t A : {1, 3, 5}) {
for (size_t A : {1, 3, 5, 20}) {
for (size_t B : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {4, 6, 9, 16, 33, 45}) {
for (size_t C : {2, 3, 4, 6, 9, 16, 33, 45}) {
TensorShape shape{A, B, C};
Param param(mode, axis);
Config config(param, dtype, shape);
......@@ -301,4 +302,56 @@ TEST_F(FALLBACK, REDUCE_RECORD) {
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
Benchmarker<Convolution> benchmarker_conv(handle());
benchmarker_reduce.set_display(false);
benchmarker_conv.set_display(false);
constexpr size_t RUNS = 50;
benchmarker_reduce.set_times(RUNS);
benchmarker_conv.set_times(RUNS);
param::Reduce param;
param.axis = 3;
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
param::Convolution param_conv;
benchmarker_conv.set_param(param_conv);
{
TensorLayout src({24, 240, 128, 2}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 2, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 2, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 1: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
{
TensorLayout src({24, 240, 128, 3}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 2: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
{
TensorLayout src({24, 240, 128, 4}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 4, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 4, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("case 3: reduce use time %fms, convolution use time %fms\n", reduce,
conv);
}
};
run();
}
#endif
// vim: syntax=cpp.doxygen
......@@ -467,6 +467,33 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) {
}
#endif
TEST_F(X86, BENCHMARK_REDUCE_VS_CONV) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
Benchmarker<Convolution> benchmarker_conv(handle());
benchmarker_reduce.set_display(false);
benchmarker_conv.set_display(false);
constexpr size_t RUNS = 50;
benchmarker_reduce.set_times(RUNS);
benchmarker_conv.set_times(RUNS);
param::Reduce param;
param.axis = 3;
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
param::Convolution param_conv;
benchmarker_conv.set_param(param_conv);
TensorLayout src({24, 240, 128, 3}, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;
TensorLayout conv_src({24, 3, 240, 128}, dtype::Float32());
TensorLayout conv_weight({1, 3, 1, 1}, dtype::Float32());
auto conv = benchmarker_conv.execs({conv_src, conv_weight, {}}) / RUNS;
printf("reduce use time %fms, convolution use time %fms\n", reduce, conv);
};
run();
}
#endif
} // namespace test
......