diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 3c5a06c40b49b282736f855c98563eb22d0bdbcb..41dcd1d258d253d39bc556c8df35e0bfe8e000af 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -82,29 +82,33 @@ #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ defined(GI_FMA_INTRINSICS) -typedef __m256 GI_FLOAT32; -typedef __m256i GI_UINT8; -typedef __m256i GI_INT8; -typedef __m256i GI_INT16; -typedef __m256i GI_INT32; +typedef __m256 GI_FLOAT32_t; +typedef __m256i GI_UINT8_t; +typedef __m256i GI_INT8_t; +typedef __m256i GI_INT16_t; +typedef __m256i GI_INT32_t; +typedef __m256i GI_UINT32_t; #elif defined(GI_NEON_INTRINSICS) -typedef float32x4_t GI_FLOAT32; -typedef uint8x16_t GI_UINT8; -typedef int8x16_t GI_INT8; -typedef int16x8_t GI_INT16; -typedef int32x4_t GI_INT32; +typedef float32x4_t GI_FLOAT32_t; +typedef uint8x16_t GI_UINT8_t; +typedef int8x16_t GI_INT8_t; +typedef int16x8_t GI_INT16_t; +typedef int32x4_t GI_INT32_t; +typedef uint32x4_t GI_UINT32_t; #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS) -typedef __m128 GI_FLOAT32; -typedef __m128i GI_UINT8; -typedef __m128i GI_INT8; -typedef __m128i GI_INT16; -typedef __m128i GI_INT32; +typedef __m128 GI_FLOAT32_t; +typedef __m128i GI_UINT8_t; +typedef __m128i GI_INT8_t; +typedef __m128i GI_INT16_t; +typedef __m128i GI_INT32_t; +typedef __m128i GI_UINT32_t; #else -typedef float GI_FLOAT32 __attribute__((vector_size(16))); -typedef uint8_t GI_UINT8 __attribute__((vector_size(16))); -typedef int8_t GI_INT8 __attribute__((vector_size(16))); -typedef int16_t GI_INT16 __attribute__((vector_size(16))); -typedef int32_t GI_INT32 __attribute__((vector_size(16))); +typedef float GI_FLOAT32_t __attribute__((vector_size(16))); +typedef uint8_t GI_UINT8_t __attribute__((vector_size(16))); +typedef int8_t GI_INT8_t __attribute__((vector_size(16))); +typedef int16_t GI_INT16_t __attribute__((vector_size(16))); +typedef int32_t GI_INT32_t __attribute__((vector_size(16))); +typedef uint32_t GI_UINT32_t __attribute__((vector_size(16))); #endif //! general intrinsic support dynamic length simd, if avx or avx2 the simd @@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16))); #define Min(a, b) (a) < (b) ? 
(a) : (b) typedef struct { - GI_INT32 val[2]; -} GI_INT32_V2; + GI_INT32_t val[2]; +} GI_INT32_V2_t; typedef struct { - GI_INT32 val[4]; -} GI_INT32_V4; + GI_INT32_t val[4]; +} GI_INT32_V4_t; typedef struct { - GI_FLOAT32 val[2]; -} GI_FLOAT32_V2; + GI_FLOAT32_t val[2]; +} GI_FLOAT32_V2_t; typedef struct { - GI_FLOAT32 val[4]; -} GI_FLOAT32_V4; + GI_FLOAT32_t val[4]; +} GI_FLOAT32_V4_t; + +typedef struct { + GI_INT16_t val[2]; +} GI_INT16_V2_t; + +typedef struct { + GI_INT8_t val[2]; +} GI_INT8_V2_t; GI_FORCEINLINE -GI_INT32 -GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vandq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT32 -GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vorrq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT32 -GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) { +GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { #if defined(GI_NEON_INTRINSICS) return vandq_s32(vmvnq_s32(VectorNot), Vector); #elif defined(GI_SSE2_INTRINSICS) @@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) { } GI_FORCEINLINE -GI_INT32 -GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return veorq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index e306d6081e8ffe0d9d35433523fe22941534a93c..dc42fb1d75bc0c0e0a00db2c6799191d3c99c109 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -14,20 +14,51 @@ #include "gi_common.h" GI_FORCEINLINE -GI_INT32 -GiReinterpretAsInt32(GI_FLOAT32 In) { +GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) { #if defined(GI_NEON_INTRINSICS) return vreinterpretq_s32_f32(In); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(In); #else - return GI_INT32(In); + return *(GI_INT32_t*)(&In); #endif } GI_FORCEINLINE -GI_INT32 -GiRoundAsInt32(GI_FLOAT32 Vector) { +GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_u32_f32(In); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_castps_si128(In); +#else + return *(GI_UINT32_t*)(&In); +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_f32_s32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_castsi128_ps(Vector); +#else + return *(GI_FLOAT32_t*)(&Vector); +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_f32_u32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_castsi128_ps(Vector); +#else + return *(GI_FLOAT32_t*)(&Vector); +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) { #if defined(GI_NEON_INTRINSICS) #if __ARM_ARCH >= 8 return vcvtaq_s32_f32(Vector); @@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) { return _mm_castps_si128( _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); #else - GI_INT32 ret; + GI_INT32_t ret; for 
(size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { ret[i] = (int32_t)round(Vector[i]); } @@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) { } GI_FORCEINLINE -GI_FLOAT32 -GiCastToFloat32(GI_INT32 Vector) { +GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) { #if defined(GI_NEON_INTRINSICS) - return vcvtq_f32_s32(Vector); + return vcvtq_s32_f32(Vector); #elif defined(GI_SSE2_INTRINSICS) - return _mm_cvtepi32_ps(Vector); + return _mm_cvttps_epi32(Vector); #else - GI_FLOAT32 ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { - ret[i] = float(Vector[i]); + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = (int32_t)(Vector[i]); } return ret; #endif } GI_FORCEINLINE -GI_FLOAT32 -GiReinterpretAsFloat32(GI_INT32 Vector) { +GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) { #if defined(GI_NEON_INTRINSICS) - return vreinterpretq_f32_s32(Vector); + return vcvtq_f32_s32(Vector); #elif defined(GI_SSE2_INTRINSICS) - return _mm_castsi128_ps(Vector); + return _mm_cvtepi32_ps(Vector); #else - return GI_FLOAT32(Vector); + GI_FLOAT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = float(Vector[i]); + } + return ret; #endif } GI_FORCEINLINE -GI_FLOAT32 -GiBroadcastFloat32(float Value) { +GI_FLOAT32_t GiBroadcastFloat32(float Value) { #if defined(GI_NEON_INTRINSICS) return vdupq_n_f32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_set1_ps(Value); #else - GI_FLOAT32 ret; + GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { ret[i] = Value; } @@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) { } GI_FORCEINLINE -GI_FLOAT32 -GiBroadcastFloat32(const float* Value) { +GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) { #if defined(GI_NEON_INTRINSICS) return vld1q_dup_f32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_load_ps1(Value); #else - GI_FLOAT32 ret; + GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { ret[i] = *Value; } @@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) { } GI_FORCEINLINE -GI_FLOAT32 -GiZeroFloat32(void) { +GI_FLOAT32_t GiZeroFloat32(void) { #if defined(GI_NEON_INTRINSICS) return vdupq_n_f32(0.0f); #elif defined(GI_SSE2_INTRINSICS) @@ -128,14 +158,13 @@ GiZeroFloat32(void) { } GI_FORCEINLINE -GI_FLOAT32 -GiLoadFloat32(const float* Buffer) { +GI_FLOAT32_t GiLoadFloat32(const float* Buffer) { #if defined(GI_NEON_INTRINSICS) return vld1q_f32(Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_ps(Buffer); #else - GI_FLOAT32 ret; + GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { ret[i] = Buffer[i]; } @@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) { } GI_FORCEINLINE -void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) { +void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1q_f32(Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) @@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) { #endif } -GI_FORCEINLINE -void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) { -#if defined(GI_NEON_INTRINSICS) - vst1q_f32(Buffer, Vector); -#elif defined(GI_SSE2_INTRINSICS) - _mm_store_ps(Buffer, Vector); -#else - GiStoreFloat32(Buffer, Vector); -#endif -} - #if defined(GI_NEON_INTRINSICS) -#define GISTORELANEFLOAT32(i) \ - GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ - vst1q_lane_f32(Buffer, Vector, i); \ +#define GISTORELANEFLOAT32(i) \ + GI_FORCEINLINE 
void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ + vst1q_lane_f32(Buffer, Vector, i); \ } #elif defined(GI_SSE2_INTRINSICS) #define GISTORELANEFLOAT32(i) \ - GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ + GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ } #else -#define GISTORELANEFLOAT32(i) \ - GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ - *Buffer = Vector[i]; \ +#define GISTORELANEFLOAT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ + *Buffer = Vector[i]; \ } #endif @@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3) #undef GISTORELANEFLOAT32 #if defined(GI_NEON_INTRINSICS) -#define GIEXTRACTLANEFLOAT32(i) \ - GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ - return vgetq_lane_f32(Vector, i); \ +#define GIEXTRACTLANEFLOAT32(i) \ + GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ + return vgetq_lane_f32(Vector, i); \ } #elif defined(GI_SSE2_INTRINSICS) #define GIEXTRACTLANEFLOAT32(i) \ - GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ + GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ } #else -#define GIEXTRACTLANEFLOAT32(i) \ - GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ - return Vector[i]; \ +#define GIEXTRACTLANEFLOAT32(i) \ + GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ + return Vector[i]; \ } #endif @@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3) #undef GIEXTRACTLANEFLOAT32 GI_FORCEINLINE -GI_FLOAT32 -GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip1q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) @@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #elif defined(GI_SSE2_INTRINSICS) return _mm_unpacklo_ps(Vector1, Vector2); #else - GI_FLOAT32 ret; + GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { ret[2 * i] = Vector1[i]; ret[2 * i + 1] = Vector2[i]; @@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON64_INTRINSICS) return vzip2q_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) @@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { #elif defined(GI_SSE2_INTRINSICS) return _mm_unpackhi_ps(Vector1, Vector2); #else - GI_FLOAT32 ret; + GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i]; ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i]; @@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vaddq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } 
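//! A minimal usage sketch of the float<->int32 conversion helpers introduced above
//! (illustrative only; it assumes the definitions from this header as modified here):
//! GiReinterpretAsInt32 reinterprets the bits, GiCastToInt32 truncates toward zero,
//! and GiRoundAsInt32 rounds to nearest.
//!   GI_FLOAT32_t v = GiBroadcastFloat32(1.75f);
//!   GI_INT32_t bits = GiReinterpretAsInt32(v);  //! each lane holds 0x3fe00000, the bit pattern of 1.75f
//!   GI_INT32_t trunc = GiCastToInt32(v);        //! each lane holds 1
//!   GI_INT32_t nearest = GiRoundAsInt32(v);     //! each lane holds 2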
GI_FORCEINLINE -GI_FLOAT32 -GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vsubq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmulq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) { +GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) { #if defined(GI_NEON_INTRINSICS) return vmulq_n_f32(Vector1, Scaler); #elif defined(GI_SSE2_INTRINSICS) - GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler); + GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler); return _mm_mul_ps(Vector1, Vector2); #else return Vector1 * Scaler; @@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) { } GI_FORCEINLINE -GI_FLOAT32 -GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiMultiplyAddFloat32( + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) +#if defined(__ARM_FEATURE_FMA) + return vfmaq_f32(VectorSum, Vector1, Vector2); +#else return vmlaq_f32(VectorSum, Vector1, Vector2); +#endif #elif defined(GI_FMA3_INTRINSICS) return _mm_fmadd_ps(Vector1, Vector2, VectorSum); #elif defined(GI_SSE2_INTRINSICS) @@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec } GI_FORCEINLINE -GI_FLOAT32 -GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) { +GI_FLOAT32_t GiMultiplySubFloat32( + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) - return vmlaq_n_f32(VectorSum, Vector, Scalar); + return vmlsq_f32(VectorSum, Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2)); +#else + return VectorSum - Vector1 * Vector2; +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiMultiplyAddScalarFloat32( + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) { +#if defined(GI_NEON_INTRINSICS) +#if defined(__ARM_FEATURE_FMA) + return vfmaq_n_f32(VectorSum, Vector, Scalar); +#else + return vmlaq_n_f32(VectorSum, Vector, Scalar); +#endif #elif defined(GI_SSE2_INTRINSICS) - return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); + return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); #else return VectorSum + Vector * Scalar; #endif } #if defined(GI_NEON_INTRINSICS) -#define GIMULTIPLYADDLANFLOAT32(i) \ - GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ - GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ - return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ +#if defined(__ARM_FEATURE_FMA) +#define GIMULTIPLYADDLANFLOAT32(i) \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ + return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ } GIMULTIPLYADDLANFLOAT32(0) GIMULTIPLYADDLANFLOAT32(1) #undef GIMULTIPLYADDLANFLOAT32 #define GIMULTIPLYADDLANFLOAT32(i)
\ - GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ - GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ + return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \ + } +GIMULTIPLYADDLANFLOAT32(2) +GIMULTIPLYADDLANFLOAT32(3) +#else +#define GIMULTIPLYADDLANFLOAT32(i) \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ + return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ + } +GIMULTIPLYADDLANFLOAT32(0) +GIMULTIPLYADDLANFLOAT32(1) +#undef GIMULTIPLYADDLANFLOAT32 +#define GIMULTIPLYADDLANFLOAT32(i) \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \ } GIMULTIPLYADDLANFLOAT32(2) GIMULTIPLYADDLANFLOAT32(3) +#endif #undef GIMULTIPLYADDLANFLOAT32 #elif defined(GI_SSE2_INTRINSICS) -#define GIMULTIPLYADDLANFLOAT32(i) \ - GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ - GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ - return GiMultiplyAddScalarFloat32( \ - VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \ +#define GIMULTIPLYADDLANFLOAT32(i) \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ + return GiMultiplyAddScalarFloat32( \ + VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \ } GIMULTIPLYADDLANFLOAT32(0) GIMULTIPLYADDLANFLOAT32(1) @@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2) GIMULTIPLYADDLANFLOAT32(3) #undef GIMULTIPLYADDLANFLOAT32 #else -#define GIMULTIPLYADDLANFLOAT32(i) \ - GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ - GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ - return VectorSum + Vector1 * Vector2[i]; \ +#define GIMULTIPLYADDLANFLOAT32(i) \ + GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ + GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ + return VectorSum + Vector1 * Vector2[i]; \ } GIMULTIPLYADDLANFLOAT32(0) GIMULTIPLYADDLANFLOAT32(1) @@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3) #endif GI_FORCEINLINE -GI_FLOAT32 -GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON64_INTRINSICS) return vdivq_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) @@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON64_INTRINSICS) + return vrecpsq_f32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + GI_FLOAT32_t two = _mm_set1_ps(2.0f); + return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2)); +#else + return (2.0f - Vector1 * Vector2); +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) { +#if defined(GI_NEON32_INTRINSICS) + return vrecpeq_f32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + GI_FLOAT32_t ones = _mm_set1_ps(1.0f); + return _mm_div_ps(ones, Vector); +#else + return 1 / Vector; +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) { +#if defined(GI_NEON32_INTRINSICS) + return 
vnegq_f32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + GI_FLOAT32_t zero = _mm_set1_ps(0.0f); + return _mm_sub_ps(zero, Vector); +#else + return -Vector; +#endif +} + +GI_FORCEINLINE +GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vcgtq_f32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2)); +#else + GI_UINT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); + return vcleq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) - return _mm_cmpgt_ps(Vector1, Vector2); + return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2)); #else - return Vector1 > Vector2; + GI_UINT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0; + } + return ret; #endif } GI_FORCEINLINE -GI_FLOAT32 -GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vcltq_f32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2)); +#else + GI_UINT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_SSE2_INTRINSICS) return _mm_and_ps(Vector1, Vector2); #else - return GiReinterpretAsFloat32( + return GiReintInt32ToFloat32( GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); #endif } GI_FORCEINLINE -GI_FLOAT32 -GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_SSE2_INTRINSICS) return _mm_or_ps(Vector1, Vector2); #else - return GiReinterpretAsFloat32( + return GiReintInt32ToFloat32( GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); #endif } GI_FORCEINLINE -GI_FLOAT32 -GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) { +GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) { #if defined(GI_SSE2_INTRINSICS) return _mm_andnot_ps(VectorNot, Vector); #else - return GiReinterpretAsFloat32(GiAndNotInt32( + return GiReintInt32ToFloat32(GiAndNotInt32( GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector))); #endif } GI_FORCEINLINE -GI_FLOAT32 -GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_SSE2_INTRINSICS) return _mm_xor_ps(Vector1, Vector2); #else - return GiReinterpretAsFloat32( + return GiReintInt32ToFloat32( GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); #endif } GI_FORCEINLINE -GI_FLOAT32 -GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) { +GI_FLOAT32_t GiBlendFloat32( + GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) { return GiOrFloat32( GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1)); } @@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) { 
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); GI_FORCEINLINE -GI_FLOAT32 -GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiBSLFloat32( + GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vbslq_f32(Selection, Vector1, Vector2); +#else + return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection)); +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vmaxq_f32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_max_ps(Vector1, Vector2); +#else + GI_FLOAT32_t max; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + max[i] = Max(Vector1[i], Vector2[i]); + } + return max; +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vminq_f32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_min_ps(Vector1, Vector2); +#else + GI_FLOAT32_t min; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + min[i] = Min(Vector1[i], Vector2[i]); + } + return min; +#endif +} + +GI_FORCEINLINE +GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); #else //! _mm_max_ps does not follow the IEEE standard when input is NAN, so //! implement by C code - GI_FLOAT32 max; +#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); + GI_FLOAT32_t max; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { max[i] = MAX_NAN(Vector1[i], Vector2[i]); } @@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { +GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_f32(Vector1, Vector2); #else //! _mm_min_ps does not follow the IEEE standard when input is NAN, so //! implement by C code - GI_FLOAT32 min; +#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); + GI_FLOAT32_t min; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { min[i] = MIN_NAN(Vector1[i], Vector2[i]); } @@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { } GI_FORCEINLINE -GI_FLOAT32 -GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) { +GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) { Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value); Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value); return Value; } GI_FORCEINLINE -float GiReduceAddFloat32(GI_FLOAT32 Vector) { +float GiReduceAddFloat32(GI_FLOAT32_t Vector) { #if defined(GI_NEON64_INTRINSICS) Vector = vpaddq_f32(Vector, Vector); Vector = vpaddq_f32(Vector, Vector); @@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) { } GI_FORCEINLINE -float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) { +float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) { #if defined(GI_NEON64_INTRINSICS) float32x2_t low = vget_low_f32(Vector); float32x2_t high = vget_high_f32(Vector); @@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) { #define Min(a, b) (a) < (b) ?
(a) : (b) GI_FORCEINLINE -float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { +float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) { #if defined(GI_NEON64_INTRINSICS) return vmaxvq_f32(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { VectorLow = vpmax_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); #elif defined(GI_SSE2_INTRINSICS) - Vector = GiMaximumFloat32( + Vector = GiMaxNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); - Vector = GiMaximumFloat32( + Vector = GiMaxNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); #else @@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { } GI_FORCEINLINE -float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { +float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) { #if defined(GI_NEON64_INTRINSICS) return vminvq_f32(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { VectorLow = vpmin_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); #elif defined(GI_SSE2_INTRINSICS) - Vector = GiMinimumFloat32( + Vector = GiMinNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); - Vector = GiMinimumFloat32( + Vector = GiMinNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); #else @@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { #endif } +GI_FORCEINLINE +GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) { +#if defined(GI_NEON64_INTRINSICS) + return vabsq_f32(Vector1); +#elif defined(GI_SSE2_INTRINSICS) + union { + unsigned int int_val; + float float_val; + } value; + value.int_val = 0x7fffffff; + return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val)); +#else + GI_FLOAT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = Vector1[i] > 0 ? 
Vector1[i] : -Vector1[i]; + } + return ret; +#endif +} + // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index 8749c5dbfb22fe5f34a28b40f3c5572c7e46d76f..2da8283a9e7af31869db7f635080fa90d727dcd5 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -14,14 +14,13 @@ #include "gi_common.h" GI_FORCEINLINE -GI_INT32 -GiBroadcastInt32(int32_t Value) { +GI_INT32_t GiBroadcastInt32(int32_t Value) { #if defined(GI_NEON_INTRINSICS) return vdupq_n_s32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_set1_epi32(Value); #else - GI_INT32 ret; + GI_INT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { ret[i] = Value; } @@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) { } GI_FORCEINLINE -GI_INT8 -GiBroadcastInt8(int8_t Value) { +GI_UINT32_t GiBroadcastUint32(int32_t Value) { +#if defined(GI_NEON_INTRINSICS) + return vdupq_n_u32(Value); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_set1_epi32(Value); +#else + GI_UINT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Value; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiBroadcastInt8(int8_t Value) { #if defined(GI_NEON_INTRINSICS) return vdupq_n_s8(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_set1_epi8(Value); #else - GI_INT8 ret; + GI_INT8_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { ret[i] = Value; } @@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) { } GI_FORCEINLINE -GI_INT32 -GiLoadInt32(const int32_t* Buffer) { +GI_INT32_t GiLoadInt32(const int32_t* Buffer) { #if defined(GI_NEON_INTRINSICS) return vld1q_s32(Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_si128((const __m128i*)Buffer); #else - GI_INT32 ret; + GI_INT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { ret[i] = Buffer[i]; } @@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) { } GI_FORCEINLINE -GI_INT8 -GiLoadInt8(const int8_t* Buffer) { +GI_INT8_t GiLoadInt8(const int8_t* Buffer) { #if defined(GI_NEON_INTRINSICS) return vld1q_s8(Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_si128((const __m128i*)Buffer); #else - GI_INT8 ret; + GI_INT8_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { ret[i] = Buffer[i]; } @@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) { } GI_FORCEINLINE -void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) { +void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1q_s32(Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) @@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) { #endif } +#if defined(GI_NEON_INTRINSICS) +#define GISTORELANEINT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ + vst1q_lane_s32(Buffer, Vector, i); \ + } + +#elif defined(GI_SSE2_INTRINSICS) + +#define GISTORELANEINT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ + GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector); \ + _mm_store_ss( \ + (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \ + } +#else +#define GISTORELANEINT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ + *Buffer = Vector[i]; \ + } +#endif + +GISTORELANEINT32(0) +GISTORELANEINT32(1) +GISTORELANEINT32(2) +GISTORELANEINT32(3) + +#undef GISTORELANEFLOAT32 + +GI_FORCEINLINE +GI_INT8_t 
GiReinterInt32ToInt8(GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s8_s32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + return Vector; +#else + return *(GI_INT8_t*)&Vector; +#endif +} + +GI_FORCEINLINE +void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) { +#if defined(GI_NEON_INTRINSICS) + vst1q_s16(Buffer, Vector); +#elif defined(GI_SSE2_INTRINSICS) + _mm_storeu_si128((__m128i*)Buffer, Vector); +#else + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + Buffer[i] = Vector[i]; + } +#endif +} + GI_FORCEINLINE -void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { +void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1q_s8(Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) @@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { } GI_FORCEINLINE -void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) { +void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1_s8(Buffer, vget_low_s8(Vector)); #elif defined(GI_SSE2_INTRINSICS) @@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) { } GI_FORCEINLINE -void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) { +void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1_s8(Buffer, vget_high_s8(Vector)); #elif defined(GI_SSE2_INTRINSICS) @@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) { } GI_FORCEINLINE -GI_INT32 -GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiNegInt32(GI_INT32_t Vector) { +#if defined(GI_NEON32_INTRINSICS) + return vnegq_s32(Vector); +#elif defined(GI_SSE2_INTRINSICS) + GI_INT32_t zero = _mm_set1_epi32(0); + return _mm_sub_epi32(zero, Vector); +#else + return -Vector; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiNegInt8(GI_INT8_t Vector) { +#if defined(GI_NEON32_INTRINSICS) + return vnegq_s8(Vector); +#elif defined(GI_SSE2_INTRINSICS) + GI_INT8_t zero = _mm_set1_epi8(0); + return _mm_sub_epi8(zero, Vector); +#else + return -Vector; +#endif +} + +GI_FORCEINLINE +GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vtstq_u32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + //! invert the zero compare so lanes with nonzero (Vector1 & Vector2) become + //! all ones, matching vtstq_u32 + GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2); + tmp = _mm_cmpeq_epi32(tmp, _mm_setzero_si128()); + return _mm_xor_si128(tmp, _mm_set1_epi32(-1)); +#else + GI_UINT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Vector1[i] & Vector2[i] ?
0xFFFFFFFF : 0; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vaddq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT32 -GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vaddq_u32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_add_epi32(Vector1, Vector2); +#else + return Vector1 + Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vaddq_s16(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_add_epi16(Vector1, Vector2); +#else + return Vector1 + Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vaddq_s8(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_add_epi8(Vector1, Vector2); +#else + return Vector1 + Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vsubq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT32 -GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vsubq_u32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_sub_epi32(Vector1, Vector2); +#else + return Vector1 - Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vsubq_s8(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_sub_epi8(Vector1, Vector2); +#else + return Vector1 - Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmulq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) - return _mm_mul_epi32(Vector1, Vector2); + GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1); + GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2); + return _mm_cvttps_epi32(_mm_mul_ps(v0, v1)); +#else + return Vector1 * Vector2; +#endif +} +//! 
in x86, there is no int multiply, so implement it naive +GI_FORCEINLINE +GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vmulq_s8(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + int8_t v1[16], v2[16], res[16]; + _mm_storeu_si128((__m128i*)v1, Vector1); + _mm_storeu_si128((__m128i*)v2, Vector2); + for (size_t id = 0; id < 16; id++) { + res[id] = v1[id] * v2[id]; + } + return _mm_loadu_si128((__m128i*)res); #else return Vector1 * Vector2; #endif } GI_FORCEINLINE -GI_INT8 -GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { +GI_INT32_t GiMultiplyAddInt32( + GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) { +#if defined(GI_NEON_INTRINSICS) + return vmlaq_s32(Vector1, Vector2, Vector3); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); +#else + return Vector1 + Vector2 * Vector3; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) { +#if defined(GI_NEON_INTRINSICS) + return vmlaq_s8(Vector1, Vector2, Vector3); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); +#else + return Vector1 + Vector2 * Vector3; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vandq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { } GI_FORCEINLINE -GI_INT8 -GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { +GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return veorq_u32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_xor_si128(Vector1, Vector2); +#else + return Vector1 ^ Vector2; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vorrq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { } GI_FORCEINLINE -GI_INT8 -GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) { +GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) return vandq_s8(vmvnq_s8(VectorNot), Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_andnot_si128(VectorNot, Vector); #else - GI_INT8 Not = ~VectorNot; + GI_INT8_t Not = ~VectorNot; return (Not & Vector); #endif } GI_FORCEINLINE -GI_INT8 -GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { +GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { #if defined(GI_NEON_INTRINSICS) return veorq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) @@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { #endif } +GI_FORCEINLINE +GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) { #if defined(GI_NEON_INTRINSICS) -#define GISHIFTLEFTINT32(i) \ - GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ - return vshlq_n_s32(Vector, i); \ - } - + return vshlq_n_s32(Vector, 23); #elif defined(GI_SSE2_INTRINSICS) - -#define GISHIFTLEFTINT32(i) \ - GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ - return _mm_slli_epi32(Vector, i); \ - } + return _mm_slli_epi32(Vector, 23); #else -#define GISHIFTLEFTINT32(i) \ - GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ - return Vector << i; \ - } + return Vector << 23; #endif +} -GISHIFTLEFTINT32(0) -GISHIFTLEFTINT32(1) -GISHIFTLEFTINT32(2) -GISHIFTLEFTINT32(3) - -#undef 
GISHIFTLEFTINT32 +GI_FORCEINLINE +GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vshrq_n_s32(Vector, 23); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_srai_epi32(Vector, 23); +#else + return Vector >> 23; +#endif +} GI_FORCEINLINE -GI_INT32 -GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) { +GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) { return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1)); } GI_FORCEINLINE -GI_INT8 -GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { +GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); } GI_FORCEINLINE -GI_INT32 -GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiAbsInt32(GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vabsq_s32(Vector); +#elif defined(GI_SSE42_INTRINSICS) + return _mm_abs_epi32(Vector); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiAbsInt16(GI_INT16_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vabsq_s16(Vector); +#elif defined(GI_SSE42_INTRINSICS) + return _mm_abs_epi16(Vector); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiAbsInt8(GI_INT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vabsq_s8(Vector); +#elif defined(GI_SSE42_INTRINSICS) + return _mm_abs_epi8(Vector); +#else + GI_INT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { + ret[i] = Vector[i] > 0 ? 
Vector[i] : -Vector[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_s32(Vector1, Vector2); #elif defined(GI_SSE42_INTRINSICS) @@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT32 -GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { +GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_s32(Vector1, Vector2); #elif defined(GI_SSE42_INTRINSICS) @@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { } GI_FORCEINLINE -GI_INT8 -GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { +GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); } GI_FORCEINLINE -GI_INT8 -GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { +GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_s8(Vector1, Vector2); #elif defined(GI_SSE42_INTRINSICS) @@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { } GI_FORCEINLINE -GI_INT8 -GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { +GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_s8(Vector1, Vector2); #elif defined(GI_SSE42_INTRINSICS) @@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { } GI_FORCEINLINE -GI_INT16 -GiMoveHighLongInt8(GI_INT8 Vector) { +GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) return vmovl_s8(vget_high_s8(Vector)); #elif defined(GI_SSE42_INTRINSICS) @@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { } return _mm_loadu_si128((__m128i*)data); #else - GI_INT16 ret; + GI_INT16_t ret; int8_t* data = (int8_t*)&Vector; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); for (size_t i = 0; i < half_length; i++) { @@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { } GI_FORCEINLINE -GI_INT16 -GiMoveLowLongInt8(GI_INT8 Vector) { +GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) return vmovl_s8(vget_low_s8(Vector)); #elif defined(GI_SSE42_INTRINSICS) @@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { } return _mm_loadu_si128((__m128i*)data); #else - GI_INT16 ret; + GI_INT16_t ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; @@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { } GI_FORCEINLINE -GI_INT32 -GiMoveHighLongInt16(GI_INT16 Vector) { +GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { #if defined(GI_NEON_INTRINSICS) return vmovl_s16(vget_high_s16(Vector)); #elif defined(GI_SSE42_INTRINSICS) @@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { } return _mm_loadu_si128((__m128i*)data); #else - GI_INT32 ret; + GI_INT32_t ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[half_length + i]; @@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { } GI_FORCEINLINE -GI_INT32 -GiMoveLowLongInt16(GI_INT16 Vector) { +GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { #if defined(GI_NEON_INTRINSICS) return vmovl_s16(vget_low_s16(Vector)); #elif defined(GI_SSE42_INTRINSICS) @@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { } return _mm_loadu_si128((__m128i*)data); #else - GI_INT32 ret; + GI_INT32_t 
ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); for (size_t i = 0; i < half_length; i++) { ret[i] = Vector[i]; @@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { } GI_FORCEINLINE -int32_t GiReduceAddInt8(GI_INT8 Vector) { +int32_t GiReduceAddInt8(GI_INT8_t Vector) { #if defined(GI_NEON64_INTRINSICS) return vaddlvq_s8(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) { } GI_FORCEINLINE -int8_t GiReduceMaxInt8(GI_INT8 Vector) { +int8_t GiReduceMaxInt8(GI_INT8_t Vector) { #if defined(GI_NEON64_INTRINSICS) return vmaxvq_s8(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { } GI_FORCEINLINE -int8_t GiReduceMinInt8(GI_INT8 Vector) { +int8_t GiReduceMinInt8(GI_INT8_t Vector) { #if defined(GI_NEON64_INTRINSICS) return vminvq_s8(Vector); #elif defined(GI_NEON32_INTRINSICS) @@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { //! convert to the short type with the lower bit fill the real data, the high bite //! will repeat the lower bit GI_FORCEINLINE -GI_INT8 -GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { +GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { #if defined(GI_NEON_INTRINSICS) #if __ARM_ARCH >= 8 int32x4_t vres0 = vcvtaq_s32_f32(src); @@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); return vepi8; #else - GI_INT8 ret; + GI_INT8_t ret; int length = GI_SIMD_LEN_BYTE / sizeof(float); for (int i = 0; i < length; i++) { int8_t data = Saturate(round(src[i]), -128, 127); @@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { } GI_FORCEINLINE -GI_INT8 -GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { +GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { #if defined(GI_NEON_INTRINSICS) #if __ARM_ARCH >= 8 int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); @@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); return vepi8; #else - GI_INT8 ret; + GI_INT8_t ret; int length = GI_SIMD_LEN_BYTE / sizeof(float); for (int i = 0; i < 2 * length; i++) { ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); @@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { } GI_FORCEINLINE -GI_INT8 -GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { +GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { #if defined(GI_NEON_INTRINSICS) #if __ARM_ARCH >= 8 int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); @@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); return vepi8; #else - GI_INT8 ret; + GI_INT8_t ret; int length = GI_SIMD_LEN_BYTE / sizeof(float); for (int i = 0; i < 4 * length; i++) { ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); diff --git a/dnn/src/fallback/reduce/reducer.h b/dnn/src/fallback/reduce/reducer.h index 66126b93848edf351a533db29bde88850bb04f22..38f024ba48728a6611c7ac0f9a353a125b0e2df3 100644 --- a/dnn/src/fallback/reduce/reducer.h +++ b/dnn/src/fallback/reduce/reducer.h @@ -46,25 +46,25 @@ struct MeanReducer { using ctype = int8_t; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); - GI_INT32 res[4]; + GI_INT32_t res[4]; int32_t remain; int32_t cnt; float coef; - GI_FLOAT32 vcoef; + GI_FLOAT32_t vcoef; MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { memset(res, 0, sizeof(res)); vcoef = GiBroadcastFloat32(coef); } MeanReducer() = default; void feed(const int8_t* val) { - 
const GI_INT8 vval = GiLoadInt8(val); - const GI_INT16 vval_low = GiMoveLowLongInt8(vval); - const GI_INT16 vval_high = GiMoveHighLongInt8(vval); + const GI_INT8_t vval = GiLoadInt8(val); + const GI_INT16_t vval_low = GiMoveLowLongInt8(vval); + const GI_INT16_t vval_high = GiMoveHighLongInt8(vval); - const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); - const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); - const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); - const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); + const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low); + const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low); + const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); + const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); res[0] = GiAddInt32(res[0], vval_low_low); res[1] = GiAddInt32(res[1], vval_low_high); @@ -74,11 +74,11 @@ struct MeanReducer { void feed_remain(const int8_t* val) { remain += *val; } void post(int8_t* dst) { for (int i = 0; i < 4; i += 2) { - GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); - GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); + GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); + GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); GiStoreLowInt8( - dst, - (QConverter::convert({{vitem0, vitem1}}))); + dst, (QConverter::convert( + {{vitem0, vitem1}}))); dst += 8; } } @@ -93,7 +93,7 @@ struct MeanReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32 res; + GI_FLOAT32_t res; float result; float coef; MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { @@ -113,7 +113,7 @@ struct MeanReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32 res; + GI_FLOAT32_t res; float remain; float coef; MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { @@ -140,30 +140,33 @@ struct minReducer; struct _mode##Reducer { \ using ctype = float; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32 res; \ + GI_FLOAT32_t res; \ _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ _mode##Reducer() = default; \ void feed(const float* val) { \ auto vval = GiLoadFloat32(val); \ - res = Gi##_Mode##imumFloat32(res, vval); \ + res = Gi##_Mode##NanFloat32(res, vval); \ } \ void feed_remain(const float* val) { \ auto vval = GiBroadcastFloat32(*val); \ - res = Gi##_Mode##imumFloat32(vval, res); \ + res = Gi##_Mode##NanFloat32(vval, res); \ } \ - void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ + void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ } REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits::lowest()); REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); #undef REDUCER_MAX_MIN_C1 +#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); +#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? 
(a) : (b); + #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ template <> \ struct _mode##Reducer { \ using ctype = float; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32 res; \ + GI_FLOAT32_t res; \ float remain; \ _mode##Reducer(DType, size_t) { \ res = GiBroadcastFloat32(_init); \ @@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); } \ _mode##Reducer() = default; \ void feed(const float* val) { \ - GI_FLOAT32 vval = GiLoadFloat32(val); \ - res = Gi##_Mode##imumFloat32(res, vval); \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ + res = Gi##_Mode##NanFloat32(res, vval); \ } \ void feed_remain(const float* val) { \ using namespace std; \ - remain = _mode(*val, remain); \ + remain = _Mode##_NAN(*val, remain); \ } \ void post(float* dst) { GiStoreFloat32(dst, res); } \ void post_remain(float* dst) { *dst = remain; } \ @@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits::max()); REDUCER_MAX_MIN_C(max, Max, std::numeric_limits::lowest()); REDUCER_MAX_MIN_C(min, Min, std::numeric_limits::max()); #undef REDUCER_MAX_MIN_C +#undef Max_NAN +#undef Min_NAN #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ template <> \ struct _mode##Reducer { \ using ctype = int8_t; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ - GI_INT8 res; \ + GI_INT8_t res; \ _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ _mode##Reducer() = default; \ void feed(const int8_t* val) { \ - GI_INT8 vval = GiLoadInt8(val); \ + GI_INT8_t vval = GiLoadInt8(val); \ res = Gi##_Mode##imumInt8(vval, res); \ } \ void feed_remain(const int8_t* val) { \ - GI_INT8 vval = GiBroadcastInt8(*val); \ + GI_INT8_t vval = GiBroadcastInt8(*val); \ res = Gi##_Mode##imumInt8(res, vval); \ } \ void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ @@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); struct _mode##Reducer { \ using ctype = int8_t; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ - GI_INT8 res; \ + GI_INT8_t res; \ int8_t remain; \ _mode##Reducer(DType, size_t) { \ res = GiBroadcastInt8(_init); \ @@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); } \ _mode##Reducer() = default; \ void feed(const int8_t* val) { \ - GI_INT8 vval = GiLoadInt8(val); \ + GI_INT8_t vval = GiLoadInt8(val); \ res = Gi##_Mode##imumInt8(res, vval); \ } \ void feed_remain(const int8_t* val) { \ @@ -248,7 +253,7 @@ struct ProductReducer; struct _mode##Reducer { \ using ctype = float; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32 res; \ + GI_FLOAT32_t res; \ float remain; \ _mode##Reducer(DType, size_t) { \ res = GiBroadcastFloat32(_init); \ @@ -256,7 +261,7 @@ struct ProductReducer; } \ _mode##Reducer() = default; \ void feed(const float* val) { \ - GI_FLOAT32 vval = GiLoadFloat32(val); \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ res = Gi##_Mode##Float32(vval, res); \ } \ void feed_remain(const float* val) { \ @@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); struct _mode##Reducer { \ using ctype = float; \ static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ - GI_FLOAT32 res; \ + GI_FLOAT32_t res; \ float remain; \ _mode##Reducer(DType, size_t) { \ res = GiBroadcastFloat32(_init); \ @@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); } \ _mode##Reducer() = default; \ void feed(const float* val) { \ - GI_FLOAT32 vval = GiLoadFloat32(val); \ + GI_FLOAT32_t vval = GiLoadFloat32(val); \ res = 
Gi##_Mode##Float32(vval, res); \ } \ void feed_remain(const float* val) { \ @@ -313,7 +318,7 @@ struct SumSqrReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32 res; + GI_FLOAT32_t res; float result; SumSqrReducer(DType, size_t cnt) : result(0.0f) { MEGDNN_MARK_USED_VAR(cnt); @@ -321,7 +326,7 @@ struct SumSqrReducer { } SumSqrReducer() = default; void feed(const float* val) { - GI_FLOAT32 vval = GiLoadFloat32(val); + GI_FLOAT32_t vval = GiLoadFloat32(val); res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); } void feed_remain(const float* val) { @@ -338,7 +343,7 @@ struct SumSqrReducer { using ctype = float; static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); - GI_FLOAT32 res; + GI_FLOAT32_t res; float remain; SumSqrReducer(DType, size_t cnt) : remain(0.0f) { MEGDNN_MARK_USED_VAR(cnt); @@ -346,7 +351,7 @@ struct SumSqrReducer { } SumSqrReducer() = default; void feed(const float* val) { - GI_FLOAT32 vval = GiLoadFloat32(val); + GI_FLOAT32_t vval = GiLoadFloat32(val); res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); } void feed_remain(const float* val) { remain += (*val) * (*val); }
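
A minimal sketch of how the renamed NaN-aware float primitives from this patch fit together, mirroring the maxReducer above. It is illustrative only and assumes gi_float.h as modified in this diff (GI_FLOAT32_t, GiBroadcastFloat32, GiLoadFloat32, GiMaxNanFloat32, GiReduceMaxNanFloat32, GI_SIMD_LEN_BYTE) plus <math.h> for INFINITY and isnan.

#include <math.h>
#include <stddef.h>
#include "gi_float.h"

/* NaN-propagating max over a float buffer: vector accumulation with
 * GiMaxNanFloat32, horizontal reduction with GiReduceMaxNanFloat32, and a
 * scalar tail that applies the same "NaN wins" rule as MAX_NAN. */
static float reduce_max_nan(const float* src, size_t len) {
    const size_t width = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32_t acc = GiBroadcastFloat32(-INFINITY);
    size_t i = 0;
    for (; i + width <= len; i += width) {
        acc = GiMaxNanFloat32(acc, GiLoadFloat32(src + i));
    }
    float result = GiReduceMaxNanFloat32(acc);
    for (; i < len; i++) {
        result = (isnan(src[i]) || src[i] > result) ? src[i] : result;
    }
    return result;
}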