From 1b9438079452a3615f4c9f3dac22e083727ea511 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 1 Aug 2022 15:52:38 +0800 Subject: [PATCH] fix(dnn): fix reduce sum/mean error when b is large GitOrigin-RevId: d1bae619b1835ebe7ef7656766700720c3a99d37 --- dnn/src/fallback/reduce/opr_impl.cpp | 99 ++++++---- dnn/src/fallback/reduce/opr_impl.h | 3 + dnn/src/fallback/reduce/reducer.h | 277 ++++++++++++++++++++++++++- dnn/test/fallback/reduce.cpp | 72 +++++++ 4 files changed, 410 insertions(+), 41 deletions(-) diff --git a/dnn/src/fallback/reduce/opr_impl.cpp b/dnn/src/fallback/reduce/opr_impl.cpp index c87bfca0a..5bc78dd63 100644 --- a/dnn/src/fallback/reduce/opr_impl.cpp +++ b/dnn/src/fallback/reduce/opr_impl.cpp @@ -5,7 +5,6 @@ #include "midout.h" #include "reducer.h" -#include "src/common/reduce_helper.h" MIDOUT_DECL(megdnn_fb_reduce_op) MIDOUT_DECL(megdnn_fb_reduce_c) @@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT { namespace megdnn { namespace fallback { +size_t ReduceImpl::get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(dst); + + if (src.dtype.enumv() == DTypeEnum::Float32 && + (param().mode == Mode::MEAN || param().mode == Mode::SUM || + param().mode == Mode::SUM_SQR)) { + size_t A, B, C; + reduce::get_ABC(src, A, B, C, param().axis); + if (C == 1) { + // Using B = 247 as an example, you can understand why these parameters exist + size_t _60xT_in_4 = (60 * 3) / 4; // T = 3 + size_t _60xX_in_4 = 4; // 0 < X < T, X = 1,2. + size_t _XXxT_in_4 = 4; + return ((B / _60xT_in_4 + _60xX_in_4 + _XXxT_in_4) * sizeof(float)); + } + } + return naive::ReduceForwardImpl::get_workspace_in_bytes(src, dst); +} + void ReduceImpl::exec( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { check_exec(src.layout, dst.layout, workspace.size); @@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback( } bool ReduceImpl::exec_optimized( - _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) { + _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { size_t A, B, C; reduce::get_ABC(src.layout, A, B, C, param().axis); bool execed = false; using Mode = param::Reduce::Mode; -#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ - if (C == 1) { \ - using _Reducer = Reducer; \ - using _ReducerC1SmallB = Reducer; \ - std::function \ - do_reduce = Exec<_Reducer, true>::do_reduce; \ - if (B == 2) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ - if (B == 3) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ - if (B == 4) \ - do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ - MIDOUT_BEGIN( \ - megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ - midout_iv(0)) { \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr()), \ - reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ - execed = true; \ - } \ - MIDOUT_END(); \ - } else { \ - using _Reducer = Reducer; \ - std::function \ - do_reduce = Exec<_Reducer, false>::do_reduce; \ - MIDOUT_BEGIN( \ - megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ - midout_iv(1)) { \ - MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ - reinterpret_cast(src.raw_ptr()), \ - reinterpret_cast(dst.raw_ptr()), src_type, A, B, C)); \ - execed = true; \ - } \ - MIDOUT_END(); \ + +#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \ + if (C == 1) { \ + using _Reducer = Reducer; \ + using _ReducerC1SmallB = 
Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, true>::do_reduce; \ + if (B == 2) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \ + if (B == 3) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \ + if (B == 4) \ + do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(0)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C, \ + workspace)); \ + execed = true; \ + } \ + MIDOUT_END(); \ + } else { \ + using _Reducer = Reducer; \ + std::function \ + do_reduce = Exec<_Reducer, false>::do_reduce; \ + MIDOUT_BEGIN( \ + megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \ + midout_iv(1)) { \ + MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \ + reinterpret_cast(src.raw_ptr()), \ + reinterpret_cast(dst.raw_ptr()), src_type, A, B, C, \ + workspace)); \ + execed = true; \ + } \ + MIDOUT_END(); \ } #define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \ diff --git a/dnn/src/fallback/reduce/opr_impl.h b/dnn/src/fallback/reduce/opr_impl.h index 7271ad486..6ac00d137 100644 --- a/dnn/src/fallback/reduce/opr_impl.h +++ b/dnn/src/fallback/reduce/opr_impl.h @@ -1,4 +1,5 @@ #pragma once +#include "src/common/reduce_helper.h" #include "src/naive/reduce/opr_impl.h" namespace megdnn { @@ -13,6 +14,8 @@ public: _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); void exec_fallback( _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace); + size_t get_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& dst) override; }; } // namespace fallback diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h index ced89d3f6..f269314e8 100644 --- a/dnn/src/fallback/reduce/reducer.h +++ b/dnn/src/fallback/reduce/reducer.h @@ -1,5 +1,6 @@ #pragma once +#include "src/common/unroll_macro.h" #include "src/common/utils.h" #include "src/fallback/general_intrinsic/gi_float.h" #include "src/fallback/general_intrinsic/gi_int.h" @@ -395,14 +396,14 @@ template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t C); + DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace); }; template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t) { + DType src_dtype, size_t A, size_t B, size_t, _megdnn_workspace) { size_t a = 0; for (; a < A; a++) { Reducer reducer0(src_dtype, B); @@ -426,7 +427,7 @@ template struct Exec { static void do_reduce( const typename Reducer::ctype* src, typename Reducer::ctype* dst, - DType src_dtype, size_t A, size_t B, size_t C) { + DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace) { for (size_t a = 0; a < A; a++) { size_t c = 0; for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) { @@ -448,10 +449,276 @@ struct Exec { } }; +// function kern_4x15xT() +// 1. Loop the calculation with SIMD_WIDTH x 15 x T as a set of data +// 2. T affects accuracy, i.e. SIMD_ Width x 15 x T data accumulated into SIMD_ Width +// data 3.D0-d14 is used for reading, then bisection and addition, the addition result +// is stored in D15, and D15 is written once in T cycles + +// function kern_4xXXx1() +// Enter this function when the remaining number is less than 60 +// 1. 
The first switch is to gather the redundant numbers at the end into a vector, +// which can be processed in vector units in subsequent processes +// 2. The second switch loads multiple vectors +// 3. The third switch, binary calculation, results in a vector +#define ImplementC1LargeB(rd_type, coef, case_load, load, for_shift, cal_final_res) \ + template <> \ + struct Exec, true> { \ + using rd_type##Reducer_ = rd_type##Reducer; \ + static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ + static constexpr int VREG_NUM = 16; \ + static void kern_4x15xT( \ + const float* read_ptr, size_t& read_idx, float* write_ptr, \ + size_t& write_idx, size_t remain_size, size_t T) { \ + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, \ + d14, d15; \ + constexpr size_t STEP = SIMD_WIDTH * (VREG_NUM - 1); \ + while (read_idx + STEP <= remain_size) { \ + d15 = GiBroadcastFloat32(0.0); \ + size_t i = 0; \ + for (; read_idx + STEP <= remain_size && i < T; \ + read_idx += STEP, i++) { \ + const float* _read_ptr = read_ptr + read_idx; \ + UNROLL_CALL_RAW(15, load, _read_ptr, read_ptr, write_ptr) \ + d0 = GiAddFloat32(d0, d1); \ + d2 = GiAddFloat32(d2, d3); \ + d4 = GiAddFloat32(d4, d5); \ + d6 = GiAddFloat32(d6, d7); \ + d8 = GiAddFloat32(d8, d9); \ + d10 = GiAddFloat32(d10, d11); \ + d12 = GiAddFloat32(d12, d13); \ + d0 = GiAddFloat32(d0, d2); \ + d4 = GiAddFloat32(d4, d6); \ + d8 = GiAddFloat32(d8, d10); \ + d12 = GiAddFloat32(d12, d14); \ + d0 = GiAddFloat32(d0, d4); \ + d8 = GiAddFloat32(d8, d12); \ + d0 = GiAddFloat32(d0, d8); \ + d15 = GiAddFloat32(d0, d15); \ + } \ + GiStoreFloat32(write_ptr + write_idx, d15); \ + write_idx += SIMD_WIDTH; \ + } \ + } \ + static void kern_4xXXx1( \ + const float* read_ptr, size_t& read_idx, float* write_ptr, \ + size_t& write_idx, size_t remain_size) { \ + size_t block_num = remain_size / SIMD_WIDTH; \ + size_t tail_num = remain_size % SIMD_WIDTH; \ + if (block_num == 0) { \ + for_shift(read_ptr, read_idx, write_ptr, write_idx, tail_num); \ + write_idx += tail_num; \ + } else { \ + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, \ + d13, d14, d15; \ + float buf[4]; \ + switch (tail_num) { \ + case 3: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = read_ptr[read_idx + remain_size - 2]; \ + buf[2] = read_ptr[read_idx + remain_size - 3]; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + case 2: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = read_ptr[read_idx + remain_size - 2]; \ + buf[2] = 0; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + case 1: \ + buf[0] = read_ptr[read_idx + remain_size - 1]; \ + buf[1] = 0; \ + buf[2] = 0; \ + buf[3] = 0; \ + load(0, buf, read_ptr, write_ptr); \ + break; \ + default: \ + d0 = GiBroadcastFloat32(0.0); \ + break; \ + } \ + d15 = d0; \ + remain_size -= tail_num; \ + const float* _read_ptr = read_ptr + read_idx; \ + switch (block_num) { \ + case_load(15, _read_ptr, 14, read_ptr, write_ptr); \ + case_load(14, _read_ptr, 13, read_ptr, write_ptr); \ + case_load(13, _read_ptr, 12, read_ptr, write_ptr); \ + case_load(12, _read_ptr, 11, read_ptr, write_ptr); \ + case_load(11, _read_ptr, 10, read_ptr, write_ptr); \ + case_load(10, _read_ptr, 9, read_ptr, write_ptr); \ + case_load(9, _read_ptr, 8, read_ptr, write_ptr); \ + case_load(8, _read_ptr, 7, read_ptr, write_ptr); \ + case_load(7, _read_ptr, 6, read_ptr, write_ptr); \ + case_load(6, _read_ptr, 5, read_ptr, write_ptr); \ + case_load(5, _read_ptr, 4, read_ptr, 
write_ptr); \ + case_load(4, _read_ptr, 3, read_ptr, write_ptr); \ + case_load(3, _read_ptr, 2, read_ptr, write_ptr); \ + case_load(2, _read_ptr, 1, read_ptr, write_ptr); \ + case_load(1, _read_ptr, 0, read_ptr, write_ptr); \ + default: \ + break; \ + } \ + d0 = GiAddFloat32(d0, d15); \ + while (block_num > 1) { \ + switch (block_num) { \ + case 15: \ + case 14: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + d5 = GiAddFloat32(d10, d11); \ + d6 = GiAddFloat32(d12, d13); \ + if (block_num & 1) \ + d7 = d14; \ + break; \ + case 13: \ + case 12: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + d5 = GiAddFloat32(d10, d11); \ + if (block_num & 1) \ + d6 = d12; \ + break; \ + case 11: \ + case 10: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + d4 = GiAddFloat32(d8, d9); \ + if (block_num & 1) \ + d5 = d10; \ + break; \ + case 9: \ + case 8: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + d3 = GiAddFloat32(d6, d7); \ + if (block_num & 1) \ + d4 = d8; \ + break; \ + case 7: \ + case 6: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + d2 = GiAddFloat32(d4, d5); \ + if (block_num & 1) \ + d3 = d6; \ + break; \ + case 5: \ + case 4: \ + d0 = GiAddFloat32(d0, d1); \ + d1 = GiAddFloat32(d2, d3); \ + if (block_num & 1) \ + d2 = d4; \ + break; \ + case 3: \ + case 2: \ + d0 = GiAddFloat32(d0, d1); \ + if (block_num & 1) \ + d1 = d2; \ + default: \ + break; \ + } \ + block_num = (block_num + 1) / 2; \ + } \ + GiStoreFloat32(write_ptr + write_idx, d0); \ + write_idx += SIMD_WIDTH; \ + } \ + } \ + static void do_reduce( \ + const float* src, float* dst, DType src_dtype, size_t A, size_t B, \ + size_t, _megdnn_workspace workspace) { \ + MEGDNN_MARK_USED_VAR(src_dtype); \ + float* workspace_ptr = workspace.raw_ptr->as(); \ + constexpr size_t T = 3; \ + for (size_t a = 0; a < A; a++) { \ + size_t remain_size = B; \ + const float* read_ptr = src + a * B; \ + float* write_ptr = workspace_ptr; \ + while (remain_size > SIMD_WIDTH) { \ + size_t read_idx = 0; \ + size_t write_idx = 0; \ + kern_4x15xT( \ + read_ptr, read_idx, write_ptr, write_idx, remain_size, T); \ + kern_4xXXx1( \ + read_ptr, read_idx, write_ptr, write_idx, \ + remain_size - read_idx); \ + remain_size = write_idx; \ + read_ptr = workspace_ptr; \ + } \ + cal_final_res(remain_size, read_ptr, write_ptr, dst, coef); \ + dst++; \ + } \ + } \ + }; + +#define GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \ + d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); +#define GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \ + d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); \ + if (RD_PTR != WR_PTR) \ + d##SHIFT = GiMultiplyFloat32(d##SHIFT, d##SHIFT); + +#define CASE_GI_LOAD(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \ + case NUM: \ + GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \ + MEGDNN_FALLTHRU +#define CASE_GI_LOAD_THEN_MULT(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \ + case NUM: \ + GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \ + MEGDNN_FALLTHRU + +#define FOR_MEAN_AND_SUM(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \ + for (size_t i = 0; i < tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i]; +#define FOR_SUM_SQUARE(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \ + if (rd_ptr != wr_ptr) \ + for (size_t i = 0; i < 
tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i] * rd_ptr[rd_idx + i]; \ + else \ + for (size_t i = 0; i < tail_num; i++) \ + wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i]; + +#define CAL_FINAL_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \ + float val = 0; \ + if (write_ptr != read_ptr) \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + read_ptr[i]; \ + else \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + write_ptr[i]; \ + *dst_ptr = val * coef; +#define CAL_FINAL_SQUARE_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \ + float val = 0; \ + if (write_ptr != read_ptr) \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + read_ptr[i] * read_ptr[i]; \ + else \ + for (size_t i = 0; i < remain_size; i++) \ + val = val + write_ptr[i]; \ + *dst_ptr = val * coef; + +ImplementC1LargeB( + Mean, 1 / B, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT); +ImplementC1LargeB(Sum, 1, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT); +ImplementC1LargeB( + SumSqr, 1, CASE_GI_LOAD_THEN_MULT, GI_LOAD_THEN_MULT, FOR_SUM_SQUARE, + CAL_FINAL_SQUARE_RESULT); + template struct ExecC1SmallB { static void do_reduce( - const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C); + const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C, + _megdnn_workspace); }; #define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \ @@ -459,7 +726,7 @@ struct ExecC1SmallB { struct ExecC1SmallB { \ static void do_reduce( \ const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \ - size_t) { \ + size_t, _megdnn_workspace) { \ size_t a = 0; \ for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \ Reducer reducer(src_dtype, B); \ diff --git a/dnn/test/fallback/reduce.cpp b/dnn/test/fallback/reduce.cpp index 02aeb9640..d452e5dea 100644 --- a/dnn/test/fallback/reduce.cpp +++ b/dnn/test/fallback/reduce.cpp @@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) { }; run(); } + +TEST_F(FALLBACK, BENCHMARK_REDUCE) { + auto run = [&]() { + Benchmarker benchmarker_reduce(handle()); + benchmarker_reduce.set_display(false); + using Mode = param::Reduce::Mode; + + constexpr size_t RUNS = 100; + benchmarker_reduce.set_times(RUNS); + + TensorShape small{3 * 224 * 224}; + TensorShape large{3 * 224 * 224 * 100}; + param::Reduce param; + param.axis = 0; + + for (auto i = 224; i < 224 * 2; i++) { + for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) { + param.mode = mode; + benchmarker_reduce.set_param(param); + auto reduce = benchmarker_reduce.execs({{3 * 224 * i}, {}}) / RUNS; + } + } + param.mode = param::Reduce::Mode::SUM; + benchmarker_reduce.set_param(param); + printf("SUM\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 1: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 1: reduce use time %fms\n", reduce); + } + + param.mode = param::Reduce::Mode::MEAN; + benchmarker_reduce.set_param(param); + printf("MEAN\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 2: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 2: reduce use time %fms\n", reduce); + } + + param.mode = param::Reduce::Mode::SUM_SQR; + 
benchmarker_reduce.set_param(param); + printf("SUM_SQR\n"); + { + TensorLayout src(small, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 3: reduce use time %fms\n", reduce); + } + { + TensorLayout src(large, dtype::Float32()); + auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS; + + printf("case 3: reduce use time %fms\n", reduce); + } + }; + run(); +} #endif // vim: syntax=cpp.doxygen -- GitLab
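Why the error shows up for large B: on the C == 1 path the column was previously fed through what amounts to a few running float32 accumulators, so once the running sum grows much larger than the individual addends each addition loses low-order bits. The new kern_4x15xT / kern_4xXXx1 path instead collapses the input pass by pass into partial sums of similar magnitude. The following standalone scalar sketch (plain C++, not the GI kernels; the block size 180 mirrors SIMD_WIDTH * 15 * T with SIMD_WIDTH = 4 and T = 3, and the 0.1f fill value is only for illustration) shows the difference on the large shape used by the new benchmark, 3 * 224 * 224 * 100:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t B = 3 * 224 * 224 * 100;  // "large" shape from the new benchmark
    std::vector<float> src(B, 0.1f);

    // Single running float32 accumulator: once it is large, each 0.1f addend is
    // rounded to a multiple of the accumulator's ulp and the total drifts badly.
    float naive = 0.f;
    for (float v : src)
        naive += v;

    // Pass-by-pass blocked reduction, as do_reduce does with the workspace buffer:
    // every addition combines values of similar magnitude.
    const size_t BLOCK = 60 * 3;  // mirrors SIMD_WIDTH * 15 * T
    std::vector<float> cur;
    const float* read = src.data();
    size_t n = src.size();
    do {
        std::vector<float> next;
        next.reserve(n / BLOCK + 1);
        for (size_t i = 0; i < n; i += BLOCK) {
            float s = 0.f;
            for (size_t j = i; j < n && j < i + BLOCK; ++j)
                s += read[j];
            next.push_back(s);
        }
        cur.swap(next);
        read = cur.data();
        n = cur.size();
    } while (n > 1);

    std::printf("reference: %.1f\n", 0.1 * static_cast<double>(B));
    std::printf("naive    : %.1f\n", static_cast<double>(naive));   // large error
    std::printf("blocked  : %.1f\n", static_cast<double>(cur[0]));  // close to reference
    return 0;
}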
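The workspace returned by the new ReduceImpl::get_workspace_in_bytes only has to hold the partial sums written by the first pass; later passes re-reduce the buffer in place and always shrink it. Under the assumption that SIMD_WIDTH == 4 (GI_SIMD_LEN_BYTE == 16) and T == 3, kern_4x15xT writes one 4-float vector per group of at most 60 * T = 180 inputs plus one vector for a final partial group (the _60xX_in_4 term), and kern_4xXXx1 writes at most one more vector for the sub-60 tail (the _XXxT_in_4 term), which is where B / 45 + 4 + 4 floats comes from. A small standalone check of that bound, including the B = 247 example mentioned in the comment (names here are illustrative, not MegDNN code):

#include <cstddef>
#include <cstdio>

int main() {
    constexpr size_t SIMD_WIDTH = 4;          // assumes GI_SIMD_LEN_BYTE == 16
    constexpr size_t T = 3;
    constexpr size_t STEP = SIMD_WIDTH * 15;  // 60 inputs per kern_4x15xT iteration

    for (size_t B = SIMD_WIDTH + 1; B <= 20000000; ++B) {
        // kern_4x15xT: one SIMD_WIDTH-float partial sum per group of up to STEP * T inputs.
        size_t steps = B / STEP;
        size_t groups = (steps + T - 1) / T;
        // kern_4xXXx1: at most one more vector (or a copied sub-vector tail).
        size_t tail = B - steps * STEP;
        size_t tail_out = tail >= SIMD_WIDTH ? SIMD_WIDTH : tail;
        size_t first_pass_out = groups * SIMD_WIDTH + tail_out;

        // Bound used by get_workspace_in_bytes: B / 45 + 4 + 4 floats.
        size_t bound = B / ((STEP * T) / SIMD_WIDTH) + SIMD_WIDTH + SIMD_WIDTH;
        if (first_pass_out > bound) {
            std::printf("bound too small for B=%zu: need %zu, have %zu\n", B,
                        first_pass_out, bound);
            return 1;
        }
        if (B == 247)
            std::printf("B=247: first pass writes %zu floats, bound allows %zu\n",
                        first_pass_out, bound);
    }
    std::printf("bound covers the first pass for every tested B\n");
    return 0;
}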
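The comment above kern_4xXXx1 describes three switch statements: gather the ragged end (size % 4 elements) into one zero-padded vector, load the remaining full vectors, then halve the vector count by pairwise addition until a single vector is left. A simplified plain-array model of that flow for the SUM/MEAN case (Lane4, add and reduce_tail are illustrative names; the squaring that the SUM_SQR variant applies on load is omitted):

#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>

using Lane4 = std::array<float, 4>;  // stands in for one GI_FLOAT32_t register

static Lane4 add(const Lane4& a, const Lane4& b) {
    return {{a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]}};
}

// Reduces n (< 60) floats to one 4-lane partial sum, following kern_4xXXx1's steps.
static Lane4 reduce_tail(const float* src, size_t n) {
    size_t blocks = n / 4, tail = n % 4;
    Lane4 padded = {{0.f, 0.f, 0.f, 0.f}};
    for (size_t i = 0; i < tail; ++i)      // switch 1: gather the ragged end, zero-padded
        padded[i] = src[n - 1 - i];
    std::vector<Lane4> regs;
    for (size_t b = 0; b < blocks; ++b) {  // switch 2: load the full 4-lane blocks
        Lane4 r = {{src[4 * b], src[4 * b + 1], src[4 * b + 2], src[4 * b + 3]}};
        regs.push_back(r);
    }
    regs.push_back(padded);
    while (regs.size() > 1) {              // switch 3: pairwise addition down to one register
        std::vector<Lane4> next;
        for (size_t i = 0; i + 1 < regs.size(); i += 2)
            next.push_back(add(regs[i], regs[i + 1]));
        if (regs.size() & 1)
            next.push_back(regs.back());
        regs.swap(next);
    }
    return regs[0];
}

int main() {
    float data[59];
    for (int i = 0; i < 59; ++i)
        data[i] = 1.f;
    Lane4 r = reduce_tail(data, 59);
    std::printf("lanes: %g %g %g %g, total %g (expect 59)\n", r[0], r[1], r[2], r[3],
                r[0] + r[1] + r[2] + r[3]);
    return 0;
}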