提交 93c7e451 编写于 作者: M Megvii Engine Team

feat(arm): delete the redundant implementation

GitOrigin-RevId: ff32a3dc8b33c264956c64ccf730d363057ac501
上级 e34a642b
......@@ -39,33 +39,6 @@ typedef __fp16 fp16_fix_t;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct MeanReducer;
template <>
// Mean reducer for quantized int8, C1 layout (reduce to a single scalar):
// accumulates an int32 running sum of the int8 elements and multiplies by
// 1/cnt in post().
struct MeanReducer<dt_qint8, int8_t, int32_t, true> {
using ctype = int8_t;
// 16 int8 elements consumed per feed() call.
static constexpr int SIMD_WIDTH = 16;
int32_t res;
float coef;
// cnt = number of elements being averaged; coef = 1/cnt.
MeanReducer(DType, size_t cnt) : res(0), coef(1.0 / cnt) {}
MeanReducer() = default;
// Widening-add 16 int8 lanes into the scalar int32 accumulator.
void feed(const int8_t* val) {
#if MEGDNN_AARCH64
// vaddlvq_s8: across-vector widening add, AArch64 only.
res += vaddlvq_s8(vld1q_s8(val));
#elif MEGDNN_ARMV7
// ARMv7 has no across-vector add: pairwise-widen s8->s16->s32,
// then sum the four remaining lanes manually.
auto sum = vpaddlq_s16(vpaddlq_s8(vld1q_s8(val)));
res += (vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) +
vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3));
#else
#error "unsupport android arch"
#endif
}
// Scalar tail: fewer than SIMD_WIDTH elements remaining.
void feed_remain(const int8_t* val) { res += *val; }
void post(int8_t* dst) {
float sum = res * coef;
// NOTE(review): the rounded mean is stored without clamping to
// [-128, 127]; presumably a mean of int8 values always fits -- confirm.
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_quint8, uint8_t, int32_t, true> {
using ctype = uint8_t;
......@@ -97,33 +70,6 @@ struct MeanReducer<dt_quint8, uint8_t, int32_t, true> {
}
};
template <>
// Mean reducer for float32, C1 layout: vector-accumulates 4 floats per
// feed(), adds tail elements into a scalar, and scales by 1/cnt in post().
struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
res = vdupq_n_f32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); }
void feed_remain(const float* val) { result += *val; }
void post(float* dst) {
#if MEGDNN_AARCH64
// vaddvq_f32: horizontal add of all 4 lanes (AArch64 only).
result += vaddvq_f32(res);
#elif MEGDNN_ARMV7
// ARMv7: pairwise-add low/high halves, then sum the two lanes.
auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1));
#else
#error "unsupport android arch"
#endif
*dst = result * coef;
}
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
struct MeanReducer<__fp16, __fp16, __fp16, true> {
......@@ -170,73 +116,6 @@ struct MeanReducer<__fp16, __fp16, __fp16, false> {
};
#endif
template <>
// Mean reducer for float32, non-C1 layout: keeps per-lane partial sums so
// post() can emit a whole vector of means at once; scalar tail is handled
// by feed_remain()/post_remain().
struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
res = vdupq_n_f32(0.0f);
}
MeanReducer() = default;
void feed(const float* val) { res = vaddq_f32(vld1q_f32(val), res); }
void feed_remain(const float* val) { remain += *val; }
// Scale every lane by 1/cnt and store 4 means in one instruction.
void post(float* dst) {
res = vmulq_n_f32(res, coef);
vst1q_f32(dst, res);
}
void post_remain(float* dst) { *dst = remain * coef; }
};
template <>
// Mean reducer for quantized int8, non-C1 layout: widens each int8x16 load
// into four int32x4 accumulators so per-lane sums cannot overflow, then
// converts back to int8 (via QConverter) in post().
struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = 16;
int32x4_t res[4];
int32_t remain;
// NOTE(review): cnt is stored but never read after the constructor --
// presumably kept for symmetry with other reducers; confirm before use.
int32_t cnt;
float coef;
float32x4_t vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = vdupq_n_f32(coef);
}
MeanReducer() = default;
// Widen int8x16 -> 2x int16x8 -> 4x int32x4 and accumulate per lane.
void feed(const int8_t* val) {
const int8x16_t vval = vld1q_s8(val);
const int16x8_t vval_low = vmovl_s8(vget_low_s8(vval));
const int16x8_t vval_high = vmovl_s8(vget_high_s8(vval));
const int32x4_t vval_low_low = vmovl_s16(vget_low_s16(vval_low));
const int32x4_t vval_low_high = vmovl_s16(vget_high_s16(vval_low));
const int32x4_t vval_high_low = vmovl_s16(vget_low_s16(vval_high));
const int32x4_t vval_high_high = vmovl_s16(vget_high_s16(vval_high));
res[0] = vaddq_s32(res[0], vval_low_low);
res[1] = vaddq_s32(res[1], vval_low_high);
res[2] = vaddq_s32(res[2], vval_high_low);
res[3] = vaddq_s32(res[3], vval_high_high);
}
void feed_remain(const int8_t* val) { remain += *val; }
// Convert two int32x4 accumulators per iteration: scale to float, then
// narrow 8 results at a time to int8 through QConverter.
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
float32x4_t vitem0 = vmulq_f32(vcvtq_f32_s32(res[i]), vcoef);
float32x4_t vitem1 = vmulq_f32(vcvtq_f32_s32(res[i + 1]), vcoef);
vst1_s8(dst,
(QConverter::convert<int8x8_t, float32x4x2_t>({{vitem0, vitem1}})));
dst += 8;
}
}
void post_remain(int8_t* dst) {
float sum = remain * coef;
// NOTE(review): unlike post(), the tail bypasses QConverter and is not
// saturated to the int8 range -- confirm the mean cannot overflow.
*dst = std::round(sum);
}
};
template <>
struct MeanReducer<dt_quint8, uint8_t, int32_t, false> {
using ctype = uint8_t;
......@@ -335,8 +214,6 @@ struct minReducer;
} \
}
REDUCER_MAX_MIN_C1(max, dt_qint8, int8_t, int8_t, s, int, -128);
REDUCER_MAX_MIN_C1(min, dt_qint8, int8_t, int8_t, s, int, 127);
REDUCER_MAX_MIN_C1(max, dt_quint8, uint8_t, uint8_t, u, uint, 0);
REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
#undef REDUCER_MAX_MIN_C1
......@@ -364,45 +241,10 @@ REDUCER_MAX_MIN_C1(min, dt_quint8, uint8_t, uint8_t, u, uint, 255);
void post_remain(ctype* dst) { vst1q_lane_##_stype(dst, remain, 0); } \
}
REDUCER_MAX_MIN_C(max, dt_qint8, int8_t, int8_t, s8, int, -128);
REDUCER_MAX_MIN_C(min, dt_qint8, int8_t, int8_t, s8, int, 127);
REDUCER_MAX_MIN_C(max, dt_quint8, uint8_t, uint8_t, u8, uint, 0);
REDUCER_MAX_MIN_C(min, dt_quint8, uint8_t, uint8_t, u8, uint, 255);
#undef REDUCER_MAX_MIN_C
// Generates a max/min reducer specialization for the C1 (reduce-to-scalar)
// case. Parameters:
//   _mode      max | min (names the struct and selects the NEON v{max,min} op)
//   _dtype     megdnn dtype tag; _ctype: element C type
//   _comp_type comparison/compute type
//   _stype     NEON intrinsic suffix (e.g. f32); __stype: the vector type
//   _num       SIMD width in elements; _init: identity value for the mode
// post() pairwise-reduces the vector halves, then std::max/std::min picks
// the winner of the last two lanes.
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
_mode##Reducer(DType, size_t) { res = vdupq_n_##_stype(_init); } \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
__stype vval = vdupq_n_##_stype(*val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void post(ctype* dst) { \
auto val = v##_mode##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
using namespace std; \
*dst = _mode({vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1)}); \
} \
}
// float32 instantiations: the identity is lowest() for max, max() for min.
REDUCER_MAX_MIN_C1(
max, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(
min, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
......@@ -440,38 +282,6 @@ REDUCER_MAX_MIN_C1(
#undef REDUCER_MAX_MIN_C1
#endif
// Generates a max/min reducer specialization for the non-C1 (vector-output)
// case: res holds per-lane running max/min and post() stores the whole
// vector; the scalar tail is tracked in `remain`. Parameters are the same
// as REDUCER_MAX_MIN_C1 above (_mode, dtype/ctype, NEON suffix, width,
// identity value).
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, false> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_mode##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
remain = _mode(*val, remain); \
} \
void post(ctype* dst) { vst1q_##_stype(dst, res); } \
void post_remain(ctype* dst) { *dst = remain; } \
}
// float32 instantiations: identity is lowest() for max, max() for min.
REDUCER_MAX_MIN_C(
max, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(
min, dt_float32, float, float, f32, float32x4_t, 4,
std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define REDUCER_MAX_MIN_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init) \
......@@ -513,45 +323,6 @@ struct SumReducer;
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct ProductReducer;
// Generates a Sum/Product reducer specialization for the C1 case.
// Extra parameters beyond the max/min macros:
//   _act  NEON op suffix (add | mul) used for the vector accumulate
//   _op   std functor template (plus | multiplies) used for scalars
// post() pairwise-combines the vector halves, folds the last two lanes,
// and finally folds in the scalar `remain` tail.
#define REDUCER_SUM_PRODUCT_C1( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
struct _mode##Reducer<_dtype, _ctype, _comp_type, true> { \
using ctype = _ctype; \
static constexpr int SIMD_WIDTH = _num; \
__stype res; \
ctype remain; \
_mode##Reducer(DType, size_t) { \
res = vdupq_n_##_stype(_init); \
remain = _init; \
} \
_mode##Reducer() = default; \
void feed(const ctype* val) { \
__stype vval = vld1q_##_stype(val); \
res = v##_act##q_##_stype(vval, res); \
} \
void feed_remain(const ctype* val) { \
using namespace std; \
auto op = _op<ctype>(); \
remain = op(remain, *val); \
} \
void post(ctype* dst) { \
using namespace std; \
auto val = v##_act##_##_stype( \
vget_low_##_stype(res), vget_high_##_stype(res)); \
auto op = _op<ctype>(); \
*dst = \
op(remain, \
op(vget_lane_##_stype(val, 0), vget_lane_##_stype(val, 1))); \
} \
}
// float32 instantiations: identity 0 for Sum, 1.0f for Product.
REDUCER_SUM_PRODUCT_C1(
Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus);
REDUCER_SUM_PRODUCT_C1(
Product, dt_float32, float, float, f32, float32x4_t, 4, 1.0f, mul, multiplies);
#undef REDUCER_SUM_PRODUCT_C1
#define REDUCER_SUM_PRODUCT_C( \
_mode, _dtype, _ctype, _comp_type, _stype, __stype, _num, _init, _act, _op) \
template <> \
......@@ -578,9 +349,6 @@ REDUCER_SUM_PRODUCT_C1(
void post_remain(ctype* dst) { *dst = remain; } \
}
REDUCER_SUM_PRODUCT_C(Sum, dt_float32, float, float, f32, float32x4_t, 4, 0, add, plus);
REDUCER_SUM_PRODUCT_C(
Product, dt_float32, float, float, f32, float32x4_t, 4, 1, mul, multiplies);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
REDUCER_SUM_PRODUCT_C(Sum, __fp16, __fp16, __fp16, f16, float16x8_t, 8, 0, add, plus);
REDUCER_SUM_PRODUCT_C(
......@@ -633,59 +401,6 @@ REDUCER_SUM_PRODUCT_C1(
template <typename dtype, typename ctype, typename comp_type, bool C1>
struct SumSqrReducer;
template <>
// Sum-of-squares reducer for float32, C1 layout: accumulates val*val into a
// vector, squares tail elements into a scalar, and horizontally sums both
// in post().
struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = vdupq_n_f32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
float32x4_t vval = vld1q_f32(val);
res = vaddq_f32(vmulq_f32(vval, vval), res);
}
void feed_remain(const float* val) {
float vval = *val;
result += vval * vval;
}
void post(float* dst) {
#if MEGDNN_AARCH64
// vaddvq_f32: horizontal add of all 4 lanes (AArch64 only).
result += vaddvq_f32(res);
#elif MEGDNN_ARMV7
// ARMv7: pairwise-add low/high halves, then sum the two lanes.
auto sum_temp = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
result += (vget_lane_f32(sum_temp, 0) + vget_lane_f32(sum_temp, 1));
#else
#error "unsupport android arch"
#endif
*dst = result;
}
};
template <>
// Sum-of-squares reducer for float32, non-C1 layout: keeps per-lane partial
// sums of squares and stores the whole vector in post(); scalar tail goes
// through feed_remain()/post_remain().
struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = 4;
float32x4_t res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
res = vdupq_n_f32(0.0f);
}
SumSqrReducer() = default;
void feed(const float* val) {
float32x4_t vval = vld1q_f32(val);
res = vaddq_f32(vmulq_f32(vval, vval), res);
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
void post(float* dst) { vst1q_f32(dst, res); }
void post_remain(float* dst) { *dst = remain; }
};
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
struct SumSqrReducer<__fp16, __fp16, __fp16, true> {
......@@ -873,14 +588,12 @@ void ReduceImpl::exec(
default: \
break; \
}
if (src.layout.is_contiguous() &&
src.layout.dtype.category() == DTypeCategory::QUANTIZED &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
DISPATCH_MODE_QUANTIZED(dt_qint8, int8_t, int32_t)
}
if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
DISPATCH_MODE_QUANTIZED(dt_quint8, uint8_t, int32_t)
}
......@@ -889,9 +602,7 @@ void ReduceImpl::exec(
src.layout.dtype.category() == DTypeCategory::FLOAT &&
param().data_type == param::Reduce::DataType::DEFAULT) {
DType src_type = src.layout.dtype;
if (src.layout.dtype.enumv() == DTypeEnum::Float32) {
DISPATCH_MODE_FLOAT(dt_float32, float, float)
}
MEGDNN_MARK_USED_VAR(src_type);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
if (src.layout.dtype.enumv() == DTypeEnum::Float16) {
DNN_INC_FLOAT16(DISPATCH_MODE_FLOAT(__fp16, __fp16, __fp16));
......
......@@ -20,13 +20,19 @@
#else
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#define GI_TARGET_ARM
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#endif
#endif
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define GI_TARGET_X86
#endif
#if defined(__arm__) || defined(__aarch64__)
#define GI_TARGET_ARM
#endif
#ifdef _WIN32
......
......@@ -454,22 +454,22 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FORCEINLINE
GI_FLOAT32
GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
#else
//! _mm_max_ps does not follow the IEEE standard when input is NAN, so
//! implement by C code
#define MAX_NAN(a, b) (std::isnan(a) || (a) > (b)) ? (a) : (b);
GI_FLOAT32 max;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
max[i] = MAX_NAN(Vector1[i], Vector2[i]);
}
return max;
#else
return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
#endif
}
......@@ -478,18 +478,14 @@ GI_FLOAT32
GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
#else
//! _mm_min_ps does not follow the IEEE standard when input is NAN, so
//! implement by C code
#define MIN_NAN(a, b) (std::isnan(a) || (a) < (b)) ? (a) : (b);
GI_FLOAT32 min;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
min[i] = MIN_NAN(Vector1[i], Vector2[i]);
}
return min;
#else
return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
#endif
}
......@@ -563,11 +559,6 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
VectorLow = vpmax_f32(VectorLow, VectorHigh);
VectorLow = vpmax_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(GI_VSX_INTRINSICS)
Vector = GiMaximumFloat32(
Vector, GI_FLOAT32(vec_splat((__vector long long)Vector, 1)));
Vector = GiMaximumFloat32(Vector, vec_splat(Vector, 1));
return Vector[0];
#elif defined(GI_SSE2_INTRINSICS)
Vector = GiMaximumFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
......@@ -577,7 +568,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
#else
float ret = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret = Max(ret, Vector[i]);
ret = MAX_NAN(ret, Vector[i]);
}
return ret;
#endif
......@@ -602,7 +593,7 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#else
float ret = Vector[0];
for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret = Min(ret, Vector[i]);
ret = MIN_NAN(ret, Vector[i]);
}
return ret;
#endif
......
......@@ -416,7 +416,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
int16_t GiReduceAddInt8(GI_INT8 Vector) {
int32_t GiReduceAddInt8(GI_INT8 Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -467,8 +467,10 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
#elif defined(GI_NEON32_INTRINSICS)
int8x8_t VectorLow = vget_low_s8(Vector);
int8x8_t VectorHigh = vget_high_s8(Vector);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmax_s8(VectorLow, VectorHigh);
VectorLow = vpmax_s8(VectorLow, VectorLow);
VectorLow = vpmax_s8(VectorLow, VectorLow);
VectorLow = vpmax_s8(VectorLow, VectorLow);
return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
__m128i v0 = _mm_cvtepi8_epi16(Vector);
......@@ -514,7 +516,9 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
int8x8_t VectorLow = vget_low_s8(Vector);
int8x8_t VectorHigh = vget_high_s8(Vector);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorHigh);
VectorLow = vpmin_s8(VectorLow, VectorLow);
VectorLow = vpmin_s8(VectorLow, VectorLow);
VectorLow = vpmin_s8(VectorLow, VectorLow);
return vget_lane_s8(VectorLow, 0);
#elif defined(GI_SSE42_INTRINSICS)
__m128i v0 = _mm_cvtepi8_epi16(Vector);
......
......@@ -145,7 +145,7 @@ struct minReducer;
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##imumFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
......@@ -172,7 +172,7 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##imumFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
using namespace std; \
......@@ -200,7 +200,7 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(vval, res); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
}
......@@ -223,7 +223,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void feed_remain(const int8_t* val) { \
using namespace std; \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册