fix(fallback): reman general intrinsic type and add more intrinsic

GitOrigin-RevId: 37409bae9a9a0e8c324f546e2ed985566cca18c7

fix(fallback): reman general intrinsic type and add more intrinsic
GitOrigin-RevId: 37409bae9a9a0e8c324f546e2ed985566cca18c7
a017bed3 · Megvii Engine Team · fd6f8e58 · a017bed3 · a017bed3 · a017bed3
4 changed file
--- a/dnn/src/fallback/general_intrinsic/gi_common.h
+++ b/dnn/src/fallback/general_intrinsic/gi_common.h
@@ -82,29 +82,33 @@

 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
-typedef __m256 GI_FLOAT32;
-typedef __m256i GI_UINT8;
-typedef __m256i GI_INT8;
-typedef __m256i GI_INT16;
-typedef __m256i GI_INT32;
+typedef __m256 GI_FLOAT32_t;
+typedef __m256i GI_UINT8_t;
+typedef __m256i GI_INT8_t;
+typedef __m256i GI_INT16_t;
+typedef __m256i GI_INT32_t;
+typedef __m256i GI_UINT32_t;
 #elif defined(GI_NEON_INTRINSICS)
-typedef float32x4_t GI_FLOAT32;
-typedef uint8x16_t GI_UINT8;
-typedef int8x16_t GI_INT8;
-typedef int16x8_t GI_INT16;
-typedef int32x4_t GI_INT32;
+typedef float32x4_t GI_FLOAT32_t;
+typedef uint8x16_t GI_UINT8_t;
+typedef int8x16_t GI_INT8_t;
+typedef int16x8_t GI_INT16_t;
+typedef int32x4_t GI_INT32_t;
+typedef uint32x4_t GI_UINT32_t;
 #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
-typedef __m128 GI_FLOAT32;
-typedef __m128i GI_UINT8;
-typedef __m128i GI_INT8;
-typedef __m128i GI_INT16;
-typedef __m128i GI_INT32;
+typedef __m128 GI_FLOAT32_t;
+typedef __m128i GI_UINT8_t;
+typedef __m128i GI_INT8_t;
+typedef __m128i GI_INT16_t;
+typedef __m128i GI_INT32_t;
+typedef __m128i GI_UINT32_t;
 #else
-typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int8_t GI_INT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT16 __attribute__((vector_size(16)));
-typedef int32_t GI_INT32 __attribute__((vector_size(16)));
+typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
+typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
+typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
+typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
+typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
 #endif

 //! general intrinsic support dynamic length simd, if avx or avx2 the simd
@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #define Min(a, b) (a) < (b) ? (a) : (b)

 typedef struct {
-    GI_INT32 val[2];
-} GI_INT32_V2;
+    GI_INT32_t val[2];
+} GI_INT32_V2_t;

 typedef struct {
-    GI_INT32 val[4];
-} GI_INT32_V4;
+    GI_INT32_t val[4];
+} GI_INT32_V4_t;

 typedef struct {
-    GI_FLOAT32 val[2];
-} GI_FLOAT32_V2;
+    GI_FLOAT32_t val[2];
+} GI_FLOAT32_V2_t;

 typedef struct {
-    GI_FLOAT32 val[4];
-} GI_FLOAT32_V4;
+    GI_FLOAT32_t val[4];
+} GI_FLOAT32_V4_t;
+
+typedef struct {
+    GI_INT16_t val[2];
+} GI_INT16_V2_t;
+
+typedef struct {
+    GI_INT8_t val[2];
+} GI_INT8_V2_t;

 GI_FORCEINLINE
-GI_INT32
-GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }

 GI_FORCEINLINE
-GI_INT32
-GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }

 GI_FORCEINLINE
-GI_INT32
-GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
+GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
 }

 GI_FORCEINLINE
-GI_INT32
-GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)

--- a/dnn/src/fallback/general_intrinsic/gi_float.h
+++ b/dnn/src/fallback/general_intrinsic/gi_float.h
--- a/dnn/src/fallback/general_intrinsic/gi_int.h
+++ b/dnn/src/fallback/general_intrinsic/gi_int.h
--- a/dnn/src/fallback/reduce/reducer.h
+++ b/dnn/src/fallback/reduce/reducer.h
@@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
    using ctype = int8_t;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);

-    GI_INT32 res[4];
+    GI_INT32_t res[4];
    int32_t remain;
    int32_t cnt;
    float coef;
-    GI_FLOAT32 vcoef;
+    GI_FLOAT32_t vcoef;
    MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
        memset(res, 0, sizeof(res));
        vcoef = GiBroadcastFloat32(coef);
    }
    MeanReducer() = default;
    void feed(const int8_t* val) {
-        const GI_INT8 vval = GiLoadInt8(val);
-        const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
-        const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
+        const GI_INT8_t vval = GiLoadInt8(val);
+        const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
+        const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);

-        const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
-        const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
-        const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
-        const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
+        const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low);
+        const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low);
+        const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high);
+        const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high);

        res[0] = GiAddInt32(res[0], vval_low_low);
        res[1] = GiAddInt32(res[1], vval_low_high);
@@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
    void feed_remain(const int8_t* val) { remain += *val; }
    void post(int8_t* dst) {
        for (int i = 0; i < 4; i += 2) {
-            GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
-            GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
+            GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
+            GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
            GiStoreLowInt8(
-                    dst,
-                    (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
+                    dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(
+                                 {{vitem0, vitem1}})));
            dst += 8;
        }
    }
@@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
    float result;
    float coef;
    MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
@@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
    float remain;
    float coef;
    MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
@@ -140,30 +140,33 @@ struct minReducer;
    struct _mode##Reducer<dt_float32, float, float, true> {                 \
        using ctype = float;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
        _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); }  \
        _mode##Reducer() = default;                                         \
        void feed(const float* val) {                                       \
            auto vval = GiLoadFloat32(val);                                 \
-            res = Gi##_Mode##imumFloat32(res, vval);                        \
+            res = Gi##_Mode##NanFloat32(res, vval);                         \
        }                                                                   \
        void feed_remain(const float* val) {                                \
            auto vval = GiBroadcastFloat32(*val);                           \
-            res = Gi##_Mode##imumFloat32(vval, res);                        \
+            res = Gi##_Mode##NanFloat32(vval, res);                         \
        }                                                                   \
-        void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
+        void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); }  \
    }

 REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
 REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
 #undef REDUCER_MAX_MIN_C1

+#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
+#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
+
 #define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                              \
    template <>                                                             \
    struct _mode##Reducer<dt_float32, float, float, false> {                \
        using ctype = float;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
        float remain;                                                       \
        _mode##Reducer(DType, size_t) {                                     \
            res = GiBroadcastFloat32(_init);                                \
@@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
        }                                                                   \
        _mode##Reducer() = default;                                         \
        void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
-            res = Gi##_Mode##imumFloat32(res, vval);                        \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
+            res = Gi##_Mode##NanFloat32(res, vval);                         \
        }                                                                   \
        void feed_remain(const float* val) {                                \
            using namespace std;                                            \
-            remain = _mode(*val, remain);                                   \
+            remain = _Mode##_NAN(*val, remain);                             \
        }                                                                   \
        void post(float* dst) { GiStoreFloat32(dst, res); }                 \
        void post_remain(float* dst) { *dst = remain; }                     \
@@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
 REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
 REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
 #undef REDUCER_MAX_MIN_C
+#undef Max_NAN
+#undef Min_NAN

 #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init)                              \
    template <>                                                              \
    struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> {                  \
        using ctype = int8_t;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
-        GI_INT8 res;                                                         \
+        GI_INT8_t res;                                                       \
        _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); }      \
        _mode##Reducer() = default;                                          \
        void feed(const int8_t* val) {                                       \
-            GI_INT8 vval = GiLoadInt8(val);                                  \
+            GI_INT8_t vval = GiLoadInt8(val);                                \
            res = Gi##_Mode##imumInt8(vval, res);                            \
        }                                                                    \
        void feed_remain(const int8_t* val) {                                \
-            GI_INT8 vval = GiBroadcastInt8(*val);                            \
+            GI_INT8_t vval = GiBroadcastInt8(*val);                          \
            res = Gi##_Mode##imumInt8(res, vval);                            \
        }                                                                    \
        void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); }        \
@@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
    struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> {                 \
        using ctype = int8_t;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
-        GI_INT8 res;                                                         \
+        GI_INT8_t res;                                                       \
        int8_t remain;                                                       \
        _mode##Reducer(DType, size_t) {                                      \
            res = GiBroadcastInt8(_init);                                    \
@@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
        }                                                                    \
        _mode##Reducer() = default;                                          \
        void feed(const int8_t* val) {                                       \
-            GI_INT8 vval = GiLoadInt8(val);                                  \
+            GI_INT8_t vval = GiLoadInt8(val);                                \
            res = Gi##_Mode##imumInt8(res, vval);                            \
        }                                                                    \
        void feed_remain(const int8_t* val) {                                \
@@ -248,7 +253,7 @@ struct ProductReducer;
    struct _mode##Reducer<dt_float32, float, float, true> {                 \
        using ctype = float;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
        float remain;                                                       \
        _mode##Reducer(DType, size_t) {                                     \
            res = GiBroadcastFloat32(_init);                                \
@@ -256,7 +261,7 @@ struct ProductReducer;
        }                                                                   \
        _mode##Reducer() = default;                                         \
        void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
            res = Gi##_Mode##Float32(vval, res);                            \
        }                                                                   \
        void feed_remain(const float* val) {                                \
@@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
    struct _mode##Reducer<dt_float32, float, float, false> {                \
        using ctype = float;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
        float remain;                                                       \
        _mode##Reducer(DType, size_t) {                                     \
            res = GiBroadcastFloat32(_init);                                \
@@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
        }                                                                   \
        _mode##Reducer() = default;                                         \
        void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
            res = Gi##_Mode##Float32(vval, res);                            \
        }                                                                   \
        void feed_remain(const float* val) {                                \
@@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
    float result;
    SumSqrReducer(DType, size_t cnt) : result(0.0f) {
        MEGDNN_MARK_USED_VAR(cnt);
@@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
    }
    SumSqrReducer() = default;
    void feed(const float* val) {
-        GI_FLOAT32 vval = GiLoadFloat32(val);
+        GI_FLOAT32_t vval = GiLoadFloat32(val);
        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
    }
    void feed_remain(const float* val) {
@@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
    using ctype = float;
    static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
    float remain;
    SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
        MEGDNN_MARK_USED_VAR(cnt);
@@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
    }
    SumSqrReducer() = default;
    void feed(const float* val) {
-        GI_FLOAT32 vval = GiLoadFloat32(val);
+        GI_FLOAT32_t vval = GiLoadFloat32(val);
        res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
    }
    void feed_remain(const float* val) { remain += (*val) * (*val); }