Commit a017bed3 authored by Megvii Engine Team

fix(fallback): rename general intrinsic types and add more intrinsics

GitOrigin-RevId: 37409bae9a9a0e8c324f546e2ed985566cca18c7
Parent fd6f8e58
......@@ -82,29 +82,33 @@
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
defined(GI_FMA_INTRINSICS)
typedef __m256 GI_FLOAT32;
typedef __m256i GI_UINT8;
typedef __m256i GI_INT8;
typedef __m256i GI_INT16;
typedef __m256i GI_INT32;
typedef __m256 GI_FLOAT32_t;
typedef __m256i GI_UINT8_t;
typedef __m256i GI_INT8_t;
typedef __m256i GI_INT16_t;
typedef __m256i GI_INT32_t;
typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
typedef float32x4_t GI_FLOAT32;
typedef uint8x16_t GI_UINT8;
typedef int8x16_t GI_INT8;
typedef int16x8_t GI_INT16;
typedef int32x4_t GI_INT32;
typedef float32x4_t GI_FLOAT32_t;
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
typedef int32x4_t GI_INT32_t;
typedef uint32x4_t GI_UINT32_t;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
typedef __m128 GI_FLOAT32;
typedef __m128i GI_UINT8;
typedef __m128i GI_INT8;
typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
typedef __m128 GI_FLOAT32_t;
typedef __m128i GI_UINT8_t;
typedef __m128i GI_INT8_t;
typedef __m128i GI_INT16_t;
typedef __m128i GI_INT32_t;
typedef __m128i GI_UINT32_t;
#else
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
#endif
//! general intrinsic support dynamic length simd, if avx or avx2 the simd
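For context, a minimal sketch of how these types are consumed (not part of this patch; the helper name add_arrays_f32 is hypothetical, and it assumes the GiLoadFloat32/GiAddFloat32/GiStoreFloat32 helpers defined further down):

GI_FORCEINLINE void add_arrays_f32(
        const float* a, const float* b, float* out, size_t n) {
    //! step = floats per vector on this backend (NEON/SSE/AVX or plain C)
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(float);
    size_t i = 0;
    for (; i + step <= n; i += step) {
        GI_FLOAT32_t va = GiLoadFloat32(a + i);
        GI_FLOAT32_t vb = GiLoadFloat32(b + i);
        GiStoreFloat32(out + i, GiAddFloat32(va, vb));
    }
    for (; i < n; i++) {
        out[i] = a[i] + b[i];  //! scalar tail
    }
}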
......@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#define Min(a, b) (a) < (b) ? (a) : (b)
typedef struct {
GI_INT32 val[2];
} GI_INT32_V2;
GI_INT32_t val[2];
} GI_INT32_V2_t;
typedef struct {
GI_INT32 val[4];
} GI_INT32_V4;
GI_INT32_t val[4];
} GI_INT32_V4_t;
typedef struct {
GI_FLOAT32 val[2];
} GI_FLOAT32_V2;
GI_FLOAT32_t val[2];
} GI_FLOAT32_V2_t;
typedef struct {
GI_FLOAT32 val[4];
} GI_FLOAT32_V4;
GI_FLOAT32_t val[4];
} GI_FLOAT32_V4_t;
typedef struct {
GI_INT16_t val[2];
} GI_INT16_V2_t;
typedef struct {
GI_INT8_t val[2];
} GI_INT8_V2_t;
GI_FORCEINLINE
GI_INT32
GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......
......@@ -14,20 +14,51 @@
#include "gi_common.h"
GI_FORCEINLINE
GI_INT32
GiReinterpretAsInt32(GI_FLOAT32 In) {
GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castps_si128(In);
#else
return GI_INT32(In);
return *(GI_INT32_t*)(&In);
#endif
}
GI_FORCEINLINE
GI_INT32
GiRoundAsInt32(GI_FLOAT32 Vector) {
GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_u32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castps_si128(In);
#else
return *(GI_UINT32_t*)(&In);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castsi128_ps(Vector);
#else
return *(GI_FLOAT32_t*)(&Vector);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_f32_u32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castsi128_ps(Vector);
#else
return *(GI_FLOAT32_t*)(&Vector);
#endif
}
GI_FORCEINLINE
GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
return vcvtaq_s32_f32(Vector);
......@@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
return _mm_castps_si128(
_mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
#else
GI_INT32 ret;
GI_INT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = (int32_t)round(Vector[i]);
}
......@@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
GI_FLOAT32
GiCastToFloat32(GI_INT32 Vector) {
GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vcvtq_f32_s32(Vector);
return vcvtq_s32_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_cvtepi32_ps(Vector);
return _mm_cvttps_epi32(Vector);
#else
GI_FLOAT32 ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = float(Vector[i]);
GI_INT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = (int32_t)(Vector[i]);
}
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiReinterpretAsFloat32(GI_INT32 Vector) {
GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_f32_s32(Vector);
return vcvtq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castsi128_ps(Vector);
return _mm_cvtepi32_ps(Vector);
#else
return GI_FLOAT32(Vector);
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = float(Vector[i]);
}
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiBroadcastFloat32(float Value) {
GI_FLOAT32_t GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_set1_ps(Value);
#else
GI_FLOAT32 ret;
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Value;
}
......@@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) {
}
GI_FORCEINLINE
GI_FLOAT32
GiBroadcastFloat32(const float* Value) {
GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_load_ps1(Value);
#else
GI_FLOAT32 ret;
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = *Value;
}
......@@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) {
}
GI_FORCEINLINE
GI_FLOAT32
GiZeroFloat32(void) {
GI_FLOAT32_t GiZeroFloat32(void) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -128,14 +158,13 @@ GiZeroFloat32(void) {
}
GI_FORCEINLINE
GI_FLOAT32
GiLoadFloat32(const float* Buffer) {
GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_loadu_ps(Buffer);
#else
GI_FLOAT32 ret;
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Buffer[i];
}
......@@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) {
}
GI_FORCEINLINE
void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
#endif
}
GI_FORCEINLINE
void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
_mm_store_ps(Buffer, Vector);
#else
GiStoreFloat32(Buffer, Vector);
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
vst1q_lane_f32(Buffer, Vector, i); \
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
vst1q_lane_f32(Buffer, Vector, i); \
}
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
_mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
}
#else
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
*Buffer = Vector[i]; \
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
*Buffer = Vector[i]; \
}
#endif
......@@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3)
#undef GISTORELANEFLOAT32
#if defined(GI_NEON_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
return vgetq_lane_f32(Vector, i); \
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
return vgetq_lane_f32(Vector, i); \
}
#elif defined(GI_SSE2_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
}
#else
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
return Vector[i]; \
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
return Vector[i]; \
}
#endif
......@@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32
GI_FORCEINLINE
GI_FLOAT32
GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#elif defined(GI_SSE2_INTRINSICS)
return _mm_unpacklo_ps(Vector1, Vector2);
#else
GI_FLOAT32 ret;
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
ret[2 * i] = Vector1[i];
ret[2 * i + 1] = Vector2[i];
......@@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#elif defined(GI_SSE2_INTRINSICS)
return _mm_unpackhi_ps(Vector1, Vector2);
#else
GI_FLOAT32 ret;
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i];
ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i];
......@@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
return _mm_mul_ps(Vector1, Vector2);
#else
return Vector1 * Scaler;
......@@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMultiplyAddFloat32(
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
return vfmaq_f32(VectorSum, Vector1, Vector2);
#else
return vmlaq_f32(VectorSum, Vector1, Vector2);
#endif
#elif defined(GI_FMA3_INTRINSICS)
return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec
}
GI_FORCEINLINE
GI_FLOAT32
GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
GI_FLOAT32_t GiMultiplySubFloat32(
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmlaq_n_f32(VectorSum, Vector, Scalar);
return vmlsq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
#else
return VectorSum - Vector1 * Vector2;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMultiplyAddScalarFloat32(
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
return vfmaq_n_f32(VectorSum, Vector, Scalar);
#else
//! arm_neon.h has no vfmla_n_f32; use the plain multiply-accumulate
return vmlaq_n_f32(VectorSum, Vector, Scalar);
#endif
#elif defined(GI_SSE2_INTRINSICS)
return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
#else
return VectorSum + Vector * Scalar;
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
#if defined(__ARM_FEATURE_FMA)
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
}
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#else
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
}
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#endif
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return GiMultiplyAddScalarFloat32( \
VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return GiMultiplyAddScalarFloat32( \
VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
......@@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return VectorSum + Vector1 * Vector2[i]; \
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
return VectorSum + Vector1 * Vector2[i]; \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
......@@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3)
#endif
GI_FORCEINLINE
GI_FLOAT32
GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON64_INTRINSICS)
return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vrecpsq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_t two = _mm_set1_ps(2.0f);
return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
#else
return (2.0f - Vector1 * Vector2);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vrecpeq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
return _mm_div_ps(ones, Vector);
#else
return 1 / Vector;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vnegq_f32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
return _mm_sub_ps(zero, Vector);
#else
return -Vector;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vcgtq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
#else
GI_UINT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
}
return ret;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
return vcleq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_cmpgt_ps(Vector1, Vector2);
return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
#else
return Vector1 > Vector2;
GI_UINT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
}
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vcltq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
#else
GI_UINT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
}
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
return _mm_and_ps(Vector1, Vector2);
#else
return GiReinterpretAsFloat32(
return GiReintInt32ToFloat32(
GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
return _mm_or_ps(Vector1, Vector2);
#else
return GiReinterpretAsFloat32(
return GiReintInt32ToFloat32(
GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
#if defined(GI_SSE2_INTRINSICS)
return _mm_andnot_ps(VectorNot, Vector);
#else
return GiReinterpretAsFloat32(GiAndNotInt32(
return GiReintInt32ToFloat32(GiAndNotInt32(
GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_SSE2_INTRINSICS)
return _mm_xor_ps(Vector1, Vector2);
#else
return GiReinterpretAsFloat32(
return GiReintInt32ToFloat32(
GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32
GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
GI_FLOAT32_t GiBlendFloat32(
GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
return GiOrFloat32(
GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
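GiBlendFloat32 above is a pure bitwise select built from the Or/And/AndNot helpers: lanes where Selection is all ones take Vector2, the rest take Vector1. A per-lane scalar model (illustrative only; the helper name is hypothetical):

GI_FORCEINLINE uint32_t gi_blend_lane_model(uint32_t v1, uint32_t v2, uint32_t sel) {
    //! sel is expected to be all ones or all zeros per lane, as produced
    //! by the comparison helpers above
    return (v2 & sel) | (v1 & ~sel);
}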
......@@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FORCEINLINE
GI_FLOAT32
GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiBSLFloat32(
GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vbslq_f32(Selection, Vector1, Vector2);
#else
return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_max_ps(Vector1, Vector2);
#else
GI_FLOAT32_t max;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
max[i] = Max(Vector1[i], Vector2[i]);
}
return max;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_min_ps(Vector1, Vector2);
#else
GI_FLOAT32_t min;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
min[i] = Min(Vector1[i], Vector2[i]);
}
return min;
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f32(Vector1, Vector2);
#else
//! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
//! this path is implemented in plain C
GI_FLOAT32 max;
#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
GI_FLOAT32_t max;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
max[i] = MAX_NAN(Vector1[i], Vector2[i]);
}
......@@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f32(Vector1, Vector2);
#else
//! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
//! this path is implemented in plain C
GI_FLOAT32 min;
#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
GI_FLOAT32_t min;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
min[i] = MIN_NAN(Vector1[i], Vector2[i]);
}
......@@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
}
GI_FORCEINLINE
GI_FLOAT32
GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
return Value;
}
GI_FORCEINLINE
float GiReduceAddFloat32(GI_FLOAT32 Vector) {
float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
Vector = vpaddq_f32(Vector, Vector);
Vector = vpaddq_f32(Vector, Vector);
......@@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
float32x2_t low = vget_low_f32(Vector);
float32x2_t high = vget_high_f32(Vector);
......@@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
#define Min(a, b) (a) < (b) ? (a) : (b)
GI_FORCEINLINE
float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
VectorLow = vpmax_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
Vector = GiMaximumFloat32(
Vector = GiMaxNanFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
Vector = GiMaximumFloat32(
Vector = GiMaxNanFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
return GiExtractLane0Float32(Vector);
#else
......@@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
}
GI_FORCEINLINE
float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
VectorLow = vpmin_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
Vector = GiMinimumFloat32(
Vector = GiMinNanFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
Vector = GiMinimumFloat32(
Vector = GiMinNanFloat32(
Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
return GiExtractLane0Float32(Vector);
#else
......@@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#endif
}
GI_FORCEINLINE
GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
#if defined(GI_NEON_INTRINSICS)
return vabsq_f32(Vector1);
#elif defined(GI_SSE2_INTRINSICS)
union {
unsigned int int_val;
float float_val;
} value;
value.int_val = 0x7fffffff;
return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
#else
GI_FLOAT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
}
return ret;
#endif
}
// vim: syntax=cpp.doxygen
......@@ -14,14 +14,13 @@
#include "gi_common.h"
GI_FORCEINLINE
GI_INT32
GiBroadcastInt32(int32_t Value) {
GI_INT32_t GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_set1_epi32(Value);
#else
GI_INT32 ret;
GI_INT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = Value;
}
......@@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) {
}
GI_FORCEINLINE
GI_INT8
GiBroadcastInt8(int8_t Value) {
GI_UINT32_t GiBroadcastUint32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_u32(Value);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_set1_epi32(Value);
#else
GI_UINT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = Value;
}
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiBroadcastInt8(int8_t Value) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_s8(Value);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_set1_epi8(Value);
#else
GI_INT8 ret;
GI_INT8_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
ret[i] = Value;
}
......@@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) {
}
GI_FORCEINLINE
GI_INT32
GiLoadInt32(const int32_t* Buffer) {
GI_INT32_t GiLoadInt32(const int32_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_s32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_loadu_si128((const __m128i*)Buffer);
#else
GI_INT32 ret;
GI_INT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = Buffer[i];
}
......@@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) {
}
GI_FORCEINLINE
GI_INT8
GiLoadInt8(const int8_t* Buffer) {
GI_INT8_t GiLoadInt8(const int8_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_s8(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_loadu_si128((const __m128i*)Buffer);
#else
GI_INT8 ret;
GI_INT8_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
ret[i] = Buffer[i];
}
......@@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) {
}
GI_FORCEINLINE
void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_s32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEINT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
vst1q_lane_s32(Buffer, Vector, i); \
}
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEINT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector); \
_mm_store_ss( \
(float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
}
#else
#define GISTORELANEINT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
*Buffer = Vector[i]; \
}
#endif
GISTORELANEINT32(0)
GISTORELANEINT32(1)
GISTORELANEINT32(2)
GISTORELANEINT32(3)
#undef GISTORELANEINT32
GI_FORCEINLINE
GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vreinterpretq_s8_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
return Vector;
#else
return *(GI_INT8_t*)&Vector;
#endif
}
GI_FORCEINLINE
void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_s16(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
_mm_storeu_si128((__m128i*)Buffer, Vector);
#else
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
Buffer[i] = Vector[i];
}
#endif
}
GI_FORCEINLINE
void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_s8(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1_s8(Buffer, vget_low_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
......@@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1_s8(Buffer, vget_high_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
......@@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiNegInt32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vnegq_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
GI_INT32_t zero = _mm_set1_epi32(0);
return _mm_sub_epi32(zero, Vector);
#else
return -Vector;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiNegInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vnegq_s8(Vector);
#elif defined(GI_SSE2_INTRINSICS)
GI_INT8_t zero = _mm_set1_epi8(0);
return _mm_sub_epi8(zero, Vector);
#else
return -Vector;
#endif
}
GI_FORCEINLINE
GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vtstq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
//! _mm_cmpeq_epi32 yields all ones where (a & b) == 0, the opposite of
//! vtstq_u32, so invert the mask
GI_UINT32_t eq_zero = _mm_cmpeq_epi32(tmp, _mm_setzero_si128());
return _mm_xor_si128(eq_zero, _mm_set1_epi32(-1));
#else
GI_UINT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
}
return ret;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_add_epi32(Vector1, Vector2);
#else
return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_s16(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_add_epi16(Vector1, Vector2);
#else
return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_add_epi8(Vector1, Vector2);
#else
return Vector1 + Vector2;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_sub_epi32(Vector1, Vector2);
#else
return Vector1 - Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_sub_epi8(Vector1, Vector2);
#else
return Vector1 - Vector2;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_mul_epi32(Vector1, Vector2);
GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
#else
return Vector1 * Vector2;
#endif
}
//! x86 SSE2 has no packed multiply for these integer widths, so the SSE2
//! paths are implemented naively
GI_FORCEINLINE
GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
int8_t v1[16], v2[16], res[16];
_mm_storeu_si128((__m128i*)v1, Vector1);
_mm_storeu_si128((__m128i*)v2, Vector2);
for (size_t id = 0; id < 16; id++) {
res[id] = v1[id] * v2[id];
}
return _mm_loadu_si128((__m128i*)res);
#else
return Vector1 * Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8
GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT32_t GiMultiplyAddInt32(
GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
return vmlaq_s32(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3));
#else
return Vector1 + Vector2 * Vector3;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) {
#if defined(GI_NEON_INTRINSICS)
return vmlaq_s8(Vector1, Vector2, Vector3);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3));
#else
return Vector1 + Vector2 * Vector3;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return veorq_u32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_xor_si128(Vector1, Vector2);
#else
return Vector1 ^ Vector2;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vorrq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vandq_s8(vmvnq_s8(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_andnot_si128(VectorNot, Vector);
#else
GI_INT8 Not = ~VectorNot;
GI_INT8_t Not = ~VectorNot;
return (Not & Vector);
#endif
}
GI_FORCEINLINE
GI_INT8
GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return veorq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
......@@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#endif
}
GI_FORCEINLINE
GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
#define GISHIFTLEFTINT32(i) \
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
return vshlq_n_s32(Vector, i); \
}
return vshlq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
#define GISHIFTLEFTINT32(i) \
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
return _mm_slli_epi32(Vector, i); \
}
return _mm_slli_epi32(Vector, 23);
#else
#define GISHIFTLEFTINT32(i) \
GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
return Vector << i; \
}
return Vector << 23;
#endif
}
GISHIFTLEFTINT32(0)
GISHIFTLEFTINT32(1)
GISHIFTLEFTINT32(2)
GISHIFTLEFTINT32(3)
#undef GISHIFTLEFTINT32
GI_FORCEINLINE
GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vshrq_n_s32(Vector, 23);
#elif defined(GI_SSE2_INTRINSICS)
return _mm_srai_epi32(Vector, 23);
#else
return Vector >> 23;
#endif
}
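The fixed shift count of 23 targets the exponent field of an IEEE-754 float (23 mantissa bits sit below it), which is what vectorized exp/pow style kernels need. A sketch of that use (hypothetical helper; assumes n + 127 stays in the normal-exponent range [1, 254] and that the gi_float helpers are visible):

GI_FORCEINLINE GI_FLOAT32_t gi_exp2_int(GI_INT32_t n) {
    //! place the biased exponent (n + 127) into bits [30:23], giving 2^n
    GI_INT32_t biased = GiAddInt32(n, GiBroadcastInt32(127));
    return GiReintInt32ToFloat32(GiShiftLeft23Int32(biased));
}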
GI_FORCEINLINE
GI_INT32
GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) {
GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) {
return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT8
GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT32
GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiAbsInt32(GI_INT32_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vabsq_s32(Vector);
#elif defined(GI_SSE42_INTRINSICS)
return _mm_abs_epi32(Vector);
#else
GI_INT32_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
}
return ret;
#endif
}
GI_FORCEINLINE
GI_INT16_t GiAbsInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vabsq_s16(Vector);
#elif defined(GI_SSE42_INTRINSICS)
return _mm_abs_epi16(Vector);
#else
GI_INT16_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
}
return ret;
#endif
}
GI_FORCEINLINE
GI_INT8_t GiAbsInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vabsq_s8(Vector);
#elif defined(GI_SSE42_INTRINSICS)
return _mm_abs_epi8(Vector);
#else
GI_INT8_t ret;
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
}
return ret;
#endif
}
GI_FORCEINLINE
GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
......@@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT32
GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
......@@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}
GI_FORCEINLINE
GI_INT8
GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
......@@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
......@@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT16
GiMoveHighLongInt8(GI_INT8 Vector) {
GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vmovl_s8(vget_high_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
......@@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
}
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT16 ret;
GI_INT16_t ret;
int8_t* data = (int8_t*)&Vector;
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
for (size_t i = 0; i < half_length; i++) {
......@@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT16
GiMoveLowLongInt8(GI_INT8 Vector) {
GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vmovl_s8(vget_low_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
......@@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
}
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT16 ret;
GI_INT16_t ret;
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[i];
......@@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiMoveHighLongInt16(GI_INT16 Vector) {
GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vmovl_s16(vget_high_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
......@@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
}
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT32 ret;
GI_INT32_t ret;
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[half_length + i];
......@@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
GI_INT32
GiMoveLowLongInt16(GI_INT16 Vector) {
GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
return vmovl_s16(vget_low_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
......@@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
return _mm_loadu_si128((__m128i*)data);
#else
GI_INT32 ret;
GI_INT32_t ret;
size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
for (size_t i = 0; i < half_length; i++) {
ret[i] = Vector[i];
......@@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
}
GI_FORCEINLINE
int32_t GiReduceAddInt8(GI_INT8 Vector) {
int32_t GiReduceAddInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
int8_t GiReduceMaxInt8(GI_INT8 Vector) {
int8_t GiReduceMaxInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vmaxvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
}
GI_FORCEINLINE
int8_t GiReduceMinInt8(GI_INT8 Vector) {
int8_t GiReduceMinInt8(GI_INT8_t Vector) {
#if defined(GI_NEON64_INTRINSICS)
return vminvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
......@@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
//! convert to the int8 type: the low lanes hold the real data and the
//! high lanes repeat the low lanes
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
int32x4_t vres0 = vcvtaq_s32_f32(src);
......@@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
__m128i vepi8 = _mm_packs_epi16(vepi16, vepi16);
return vepi8;
#else
GI_INT8 ret;
GI_INT8_t ret;
int length = GI_SIMD_LEN_BYTE / sizeof(float);
for (int i = 0; i < length; i++) {
int8_t data = Saturate(round(src[i]), -128, 127);
......@@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
}
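A usage sketch for the converter above (hypothetical helper; assumes the 16-byte SIMD configuration): only the low half of the result carries distinct data, so GiStoreLowInt8 is enough to write it out.

GI_FORCEINLINE void gi_quantize4_model(const float* src, int8_t* dst) {
    GI_INT8_t v = GiCvtFromFloat32ToInt8(GiLoadFloat32(src));
    //! writes 8 bytes: the 4 converted values followed by their repeat
    GiStoreLowInt8(dst, v);
}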
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
......@@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
return vepi8;
#else
GI_INT8 ret;
GI_INT8_t ret;
int length = GI_SIMD_LEN_BYTE / sizeof(float);
for (int i = 0; i < 2 * length; i++) {
ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
......@@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
}
GI_FORCEINLINE
GI_INT8
GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
......@@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
__m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
return vepi8;
#else
GI_INT8 ret;
GI_INT8_t ret;
int length = GI_SIMD_LEN_BYTE / sizeof(float);
for (int i = 0; i < 4 * length; i++) {
ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
......
......@@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
using ctype = int8_t;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);
GI_INT32 res[4];
GI_INT32_t res[4];
int32_t remain;
int32_t cnt;
float coef;
GI_FLOAT32 vcoef;
GI_FLOAT32_t vcoef;
MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
memset(res, 0, sizeof(res));
vcoef = GiBroadcastFloat32(coef);
}
MeanReducer() = default;
void feed(const int8_t* val) {
const GI_INT8 vval = GiLoadInt8(val);
const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
const GI_INT8_t vval = GiLoadInt8(val);
const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);
const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low);
const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low);
const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high);
const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high);
res[0] = GiAddInt32(res[0], vval_low_low);
res[1] = GiAddInt32(res[1], vval_low_high);
......@@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> {
void feed_remain(const int8_t* val) { remain += *val; }
void post(int8_t* dst) {
for (int i = 0; i < 4; i += 2) {
GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
GiStoreLowInt8(
dst,
(QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(
{{vitem0, vitem1}})));
dst += 8;
}
}
......@@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float result;
float coef;
MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
......@@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float remain;
float coef;
MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
......@@ -140,30 +140,33 @@ struct minReducer;
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \
_mode##Reducer() = default; \
void feed(const float* val) { \
auto vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(res, vval); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
auto vval = GiBroadcastFloat32(*val); \
res = Gi##_Mode##imumFloat32(vval, res); \
res = Gi##_Mode##NanFloat32(vval, res); \
} \
void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \
}
REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C1
#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
#define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
res = Gi##_Mode##imumFloat32(res, vval); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##NanFloat32(res, vval); \
} \
void feed_remain(const float* val) { \
using namespace std; \
remain = _mode(*val, remain); \
remain = _Mode##_NAN(*val, remain); \
} \
void post(float* dst) { GiStoreFloat32(dst, res); } \
void post_remain(float* dst) { *dst = remain; } \
......@@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
#undef REDUCER_MAX_MIN_C
#undef Max_NAN
#undef Min_NAN
#define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \
template <> \
struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
GI_INT8_t res; \
_mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(vval, res); \
} \
void feed_remain(const int8_t* val) { \
GI_INT8 vval = GiBroadcastInt8(*val); \
GI_INT8_t vval = GiBroadcastInt8(*val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \
......@@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \
using ctype = int8_t; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
GI_INT8 res; \
GI_INT8_t res; \
int8_t remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastInt8(_init); \
......@@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127);
} \
_mode##Reducer() = default; \
void feed(const int8_t* val) { \
GI_INT8 vval = GiLoadInt8(val); \
GI_INT8_t vval = GiLoadInt8(val); \
res = Gi##_Mode##imumInt8(res, vval); \
} \
void feed_remain(const int8_t* val) { \
......@@ -248,7 +253,7 @@ struct ProductReducer;
struct _mode##Reducer<dt_float32, float, float, true> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -256,7 +261,7 @@ struct ProductReducer;
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
......@@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
struct _mode##Reducer<dt_float32, float, float, false> { \
using ctype = float; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
GI_FLOAT32 res; \
GI_FLOAT32_t res; \
float remain; \
_mode##Reducer(DType, size_t) { \
res = GiBroadcastFloat32(_init); \
......@@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f);
} \
_mode##Reducer() = default; \
void feed(const float* val) { \
GI_FLOAT32 vval = GiLoadFloat32(val); \
GI_FLOAT32_t vval = GiLoadFloat32(val); \
res = Gi##_Mode##Float32(vval, res); \
} \
void feed_remain(const float* val) { \
......@@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float result;
SumSqrReducer(DType, size_t cnt) : result(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
......@@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> {
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) {
......@@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
using ctype = float;
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);
GI_FLOAT32 res;
GI_FLOAT32_t res;
float remain;
SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
MEGDNN_MARK_USED_VAR(cnt);
......@@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> {
}
SumSqrReducer() = default;
void feed(const float* val) {
GI_FLOAT32 vval = GiLoadFloat32(val);
GI_FLOAT32_t vval = GiLoadFloat32(val);
res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
}
void feed_remain(const float* val) { remain += (*val) * (*val); }
......