From 7d7cc3c8da8d1e84dc45c42b3980aa25e45f9d52 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 14 Jun 2022 18:09:59 +0800 Subject: [PATCH] feat(gi/riscv): add gi support with risc-v GitOrigin-RevId: a28fec3ce57c70d491845b8546c3b0ffbdf65770 --- .../fallback/general_intrinsic/gi_common.h | 446 +++++-- dnn/src/fallback/general_intrinsic/gi_float.h | 388 +++++- dnn/src/fallback/general_intrinsic/gi_int.h | 241 +++- dnn/test/fallback/gi.cpp | 1128 +++++++++++++++-- dnn/test/main.cpp | 14 + 5 files changed, 2020 insertions(+), 197 deletions(-) diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h index 944c702e..88ef6402 100644 --- a/dnn/src/fallback/general_intrinsic/gi_common.h +++ b/dnn/src/fallback/general_intrinsic/gi_common.h @@ -17,6 +17,10 @@ #endif #endif +#if defined(__riscv_vector) +#include +#endif + #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) #define GI_TARGET_X86 #endif @@ -47,7 +51,7 @@ #define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden"))) #endif -#if defined(GI_TARGET_ARM) +#if defined(GI_TARGET_ARM) && defined(__ARM_NEON) #define GI_NEON_INTRINSICS #if defined(__aarch64__) #define GI_NEON64_INTRINSICS @@ -72,6 +76,9 @@ #define GI_SSE2_INTRINSICS #endif #endif +#if defined(__riscv_vector) +#define GI_RVV_INTRINSICS +#endif #if defined(GI_TEST_NAIVE) #undef GI_NEON_INTRINSICS @@ -82,6 +89,7 @@ #undef GI_AVX_INTRINSICS #undef GI_SSE42_INTRINSICS #undef GI_SSE2_INTRINSICS +#undef GI_RVV_INTRINSICS #endif //! general intrinsic support dynamic length simd, if avx or avx2 the simd @@ -95,6 +103,10 @@ defined(GI_SSE42_INTRINSICS) #define GI_SIMD_LEN 128 #define GI_SIMD_LEN_BYTE 16 +#elif defined(GI_RVV_INTRINSICS) +//! TODO: make gi algo usable for other GI_SIMD_LEN/GI_SIMD_LEN_BYTE +#define GI_SIMD_LEN 128 +#define GI_SIMD_LEN_BYTE 16 #else //! if no simd hardware support, the simd is implemented by C, default set to //! 128 @@ -112,6 +124,7 @@ enum GiSimdType { GI_SSE42, GI_SSE2, GI_NEON, + GI_RVV, }; #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ @@ -246,17 +259,41 @@ typedef __m64_128 float32x2_t; return res64; #define _sse_vextq_s32(a, b, c) _MM_ALIGNR_EPI8(b, a, c * 4) #define _sse_vget_lane_f32(vec, lane) vec.m64_f32[lane] +#elif defined(GI_RVV_INTRINSICS) +#define __gi_simd_type GI_RVV +typedef vfloat32m1_t GI_FLOAT32_t; +typedef vuint8m1_t GI_UINT8_t; +typedef vint8m1_t GI_INT8_t; +typedef vint16m1_t GI_INT16_t; +typedef vint32m1_t GI_INT32_t; +typedef vuint32m1_t GI_UINT32_t; +//! FIXME: nezha D1 do not support vmv.x.s instruct +//! as a workaround, define GI_INT64_t to naive +typedef int64_t GI_INT64_RVV_WORKAROUND_t + __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef GI_INT64_RVV_WORKAROUND_t GI_INT64_t; +typedef vfloat32m1x2_t GI_FLOAT32_V2_t; +typedef vfloat32m1x3_t GI_FLOAT32_V3_t; +typedef vfloat32m1x4_t GI_FLOAT32_V4_t; +typedef vint32m1x2_t GI_INT32_V2_t; +typedef vint32m1x4_t GI_INT32_V4_t; +typedef vint16m1x2_t GI_INT16_V2_t; +typedef vint8m1x2_t GI_INT8_V2_t; +//! vfloat32mf2_t usable at RVV1.0, now we support 0.7, as +//! 
a workaround, we use vfloat32m1_t instead +typedef vfloat32m1_t float32x2_t; + #else #define __gi_simd_type GI_NAIVE -typedef float GI_FLOAT32_t __attribute__((vector_size(16))); -typedef uint8_t GI_UINT8_t __attribute__((vector_size(16))); -typedef int8_t GI_INT8_t __attribute__((vector_size(16))); -typedef int16_t GI_INT16_t __attribute__((vector_size(16))); -typedef int32_t GI_INT32_t __attribute__((vector_size(16))); -typedef uint32_t GI_UINT32_t __attribute__((vector_size(16))); -typedef int64_t GI_INT64_t __attribute__((vector_size(16))); -#if !defined(__arm__) && !defined(__aarch64__) -typedef float float32x2_t __attribute__((vector_size(8))); +typedef float GI_FLOAT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef uint8_t GI_UINT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int8_t GI_INT8_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int16_t GI_INT16_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int32_t GI_INT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef uint32_t GI_UINT32_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int64_t GI_INT64_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +#if !defined(__arm__) && !defined(__aarch64__) || !defined(__ARM_NEON) +typedef float float32x2_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2))); #endif typedef float float32_t; #endif @@ -265,14 +302,14 @@ typedef float float32_t; //! for example: GiAbsInt32 do not imp SSE2 case //! when *_t will define as _m128*(may be long long) //! vector index do not have same logic as naive vector -typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(16))); -typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(16))); -typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(16))); -typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(16))); -typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(16))); -typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(16))); -typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(16))); -typedef float float32x2_NAIVE_t __attribute__((vector_size(8))); +typedef float GI_FLOAT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef uint8_t GI_UINT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int8_t GI_INT8_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int16_t GI_INT16_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int32_t GI_INT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef uint32_t GI_UINT32_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef int64_t GI_INT64_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE))); +typedef float float32x2_NAIVE_t __attribute__((vector_size(GI_SIMD_LEN_BYTE / 2))); typedef struct { GI_INT32_NAIVE_t val[2]; } GI_INT32_V2_NAIVE_t; @@ -301,22 +338,7 @@ typedef struct { GI_INT8_NAIVE_t val[2]; } GI_INT8_V2_NAIVE_t; -#define Max(a, b) (a) > (b) ? (a) : (b) -#define Min(a, b) (a) < (b) ? 
(a) : (b) - -#if defined(GI_NEON_INTRINSICS) -#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS) -#define v_fma_ps_f32(c, b, a) vfmaq_f32((c), (b), (a)) -#define v_fma_n_f32(c, b, a) vfmaq_n_f32((c), (b), (a)) -#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane)) -#else -#define v_fma_ps_f32(c, b, a) vmlaq_f32((c), (b), (a)) -#define v_fma_n_f32(c, b, a) vmlaq_n_f32((c), (b), (a)) -#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane)) -#endif -#endif - -#if !defined(GI_NEON_INTRINSICS) +#if !defined(GI_NEON_INTRINSICS) && !defined(GI_RVV_INTRINSICS) typedef struct { GI_INT32_t val[2]; } GI_INT32_V2_t; @@ -344,61 +366,272 @@ typedef struct { typedef struct { GI_INT8_t val[2]; } GI_INT8_V2_t; -#endif -GI_FORCEINLINE -GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { -#if defined(GI_NEON_INTRINSICS) - return vandq_s32(Vector1, Vector2); -#elif defined(GI_SSE2_INTRINSICS) - return _mm_and_si128(Vector1, Vector2); +#endif +//! variable length type intrinsic can not be a member of c++ class +//! caused by can not do sizeof at build stage, for example RVV and SVE +//! so we define a type_CLASS to solve this case +//! some variable length type intrinsic can not do array subscript, for +//! example RVV, so we define a GiGetSubVector_xx function to solve this +//! case. when fix-len type in fact will do nothing +#if defined(GI_RVV_INTRINSICS) +typedef GI_FLOAT32_NAIVE_t GI_FLOAT32_FIXLEN_t; +typedef GI_FLOAT32_V2_NAIVE_t GI_FLOAT32_FIXLEN_V2_t; +typedef GI_UINT8_NAIVE_t GI_UINT8_FIXLEN_t; +typedef GI_INT8_NAIVE_t GI_INT8_FIXLEN_t; +typedef GI_INT16_NAIVE_t GI_INT16_FIXLEN_t; +typedef GI_INT32_NAIVE_t GI_INT32_FIXLEN_t; +typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t; + +//! get subvector +#define GiGetSubVectorFloat32V2(s, index) vget_f32m1x2_f32m1(s, index) +#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index) +#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index) + +#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index) +#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index) + +#define GiGetSubVectorInt16V2(s, index) vget_i16m1x2_i16m1(s, index) + +#define GiGetSubVectorInt8V2(s, index) vget_i8m1x2_i8m1(s, index) + +//! insert subvector +#define GiSetSubVectorFloat32V2(d, index, s) d = vset_f32m1x2(d, index, s) +#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s) +#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s) + +#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s) +#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s) + +#define GiSetSubVectorInt16V2(d, index, s) d = vset_i16m1x2(d, index, s) + +#define GiSetSubVectorInt8V2(d, index, s) d = vset_i8m1x2(d, index, s) + +//! 
convert +#define GiFloat32Type2FixLenType(s) \ + __extension__({ \ + GI_FLOAT32_FIXLEN_t d; \ + vse32_v_f32m1((float*)&d, s, GI_SIMD_LEN_BYTE / sizeof(float)); \ + d; \ + }) + +#define GiFixLenType2GiFloat32Type(s) \ + __extension__({ \ + GI_FLOAT32_t d; \ + d = vle32_v_f32m1((float*)&s, GI_SIMD_LEN_BYTE / sizeof(float)); \ + d; \ + }) + +#define GiFloat32Type2FixLenV2Type(s) \ + __extension__({ \ + GI_FLOAT32_FIXLEN_V2_t d; \ + d.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 0)); \ + d.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V2(s, 1)); \ + d; \ + }) + +#define GiFixLenType2GiFloat32V2Type(s) \ + __extension__({ \ + GI_FLOAT32_V2_t d; \ + GiSetSubVectorFloat32V2(d, 0, GiFixLenType2GiFloat32Type(s.val[0])); \ + GiSetSubVectorFloat32V2(d, 1, GiFixLenType2GiFloat32Type(s.val[1])); \ + d; \ + }) + +#define GiUint8Type2FixLenType(s) \ + __extension__({ \ + GI_UINT8_FIXLEN_t d; \ + vse8_v_u8m1((uint8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \ + d; \ + }) + +#define GiFixLenType2GiUint8Type(s) \ + __extension__({ \ + GI_UINT8_t d; \ + d = vle8_v_u8m1((uint8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); \ + d; \ + }) + +#define GiInt8Type2FixLenType(s) \ + __extension__({ \ + GI_INT8_FIXLEN_t d; \ + vse8_v_i8m1((int8_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + d; \ + }) + +#define GiFixLenType2GiInt8Type(s) \ + __extension__({ \ + GI_INT8_t d; \ + d = vle8_v_i8m1((int8_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + d; \ + }) + +#define GiInt16Type2FixLenType(s) \ + __extension__({ \ + GI_INT16_FIXLEN_t d; \ + vse16_v_i16m1((int16_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \ + d; \ + }) + +#define GiFixLenType2GiInt16Type(s) \ + __extension__({ \ + GI_INT16_t d; \ + d = vle16_v_i16m1((int16_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \ + d; \ + }) + +#define GiInt32Type2FixLenType(s) \ + __extension__({ \ + GI_INT32_FIXLEN_t d; \ + vse32_v_i32m1((int32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ + d; \ + }) + +#define GiFixLenType2GiInt32Type(s) \ + __extension__({ \ + GI_INT32_t d; \ + d = vle32_v_i32m1((int32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ + d; \ + }) + +#define GiUint32Type2FixLenType(s) \ + __extension__({ \ + GI_UINT32_FIXLEN_t d; \ + vse32_v_u32m1((uint32_t*)&d, s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \ + d; \ + }) + +#define GiFixLenType2GiUint32Type(s) \ + __extension__({ \ + GI_UINT32_t d; \ + d = vle32_v_u32m1((uint32_t*)&s, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); \ + d; \ + }) #else - return Vector1 & Vector2; +typedef GI_FLOAT32_t GI_FLOAT32_FIXLEN_t; +typedef GI_FLOAT32_V2_t GI_FLOAT32_FIXLEN_V2_t; +typedef GI_UINT8_t GI_UINT8_FIXLEN_t; +typedef GI_INT8_t GI_INT8_FIXLEN_t; +typedef GI_INT16_t GI_INT16_FIXLEN_t; +typedef GI_INT32_t GI_INT32_FIXLEN_t; +typedef GI_UINT32_t GI_UINT32_FIXLEN_t; +#define GiFloat32Type2FixLenType(s) (s) +#define GiFixLenType2GiFloat32Type(s) (s) + +#define GiFloat32Type2FixLenV2Type(s) (s) +#define GiFixLenType2GiFloat32V2Type(s) (s) + +#define GiUint8Type2FixLenType(s) (s) +#define GiFixLenType2GiUint8Type(s) (s) + +#define GiInt8Type2FixLenType(s) (s) +#define GiFixLenType2GiInt8Type(s) (s) + +#define GiInt16Type2FixLenType(s) (s) +#define GiFixLenType2GiInt16Type(s) (s) + +#define GiInt32Type2FixLenType(s) (s) +#define GiFixLenType2GiInt32Type(s) (s) + +#define GiUint32Type2FixLenType(s) (s) +#define GiFixLenType2GiUint32Type(s) (s) + +//! 
get subvector +#define GiGetSubVectorFloat32V2(s, index) s.val[index] +#define GiGetSubVectorFloat32V3(s, index) s.val[index] +#define GiGetSubVectorFloat32V4(s, index) s.val[index] + +#define GiGetSubVectorInt32V2(s, index) s.val[index] +#define GiGetSubVectorInt32V4(s, index) s.val[index] + +#define GiGetSubVectorInt16V2(s, index) s.val[index] + +#define GiGetSubVectorInt8V2(s, index) s.val[index] + +//! insert subvector +#define GiSetSubVectorFloat32V2(d, index, s) d.val[index] = s +#define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s +#define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s + +#define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s +#define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s + +#define GiSetSubVectorInt16V2(d, index, s) d.val[index] = s + +#define GiSetSubVectorInt8V2(d, index, s) d.val[index] = s #endif -} -GI_FORCEINLINE -GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { +#define Max(a, b) (a) > (b) ? (a) : (b) +#define Min(a, b) (a) < (b) ? (a) : (b) + #if defined(GI_NEON_INTRINSICS) - return vorrq_s32(Vector1, Vector2); -#elif defined(GI_SSE2_INTRINSICS) - return _mm_or_si128(Vector1, Vector2); +#if defined(__ARM_FEATURE_FMA) && defined(GI_NEON64_INTRINSICS) +#define v_fma_ps_f32(c, b, a) vfmaq_f32((c), (b), (a)) +#define v_fma_n_f32(c, b, a) vfmaq_n_f32((c), (b), (a)) +#define v_fma_lane_f32(c, b, a, lane) vfmaq_lane_f32((c), (b), (a), (lane)) #else - return Vector1 | Vector2; +#define v_fma_ps_f32(c, b, a) vmlaq_f32((c), (b), (a)) +#define v_fma_n_f32(c, b, a) vmlaq_n_f32((c), (b), (a)) +#define v_fma_lane_f32(c, b, a, lane) vmlaq_lane_f32((c), (b), (a), (lane)) +#endif #endif -} GI_FORCEINLINE -GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { -#if defined(GI_NEON_INTRINSICS) - return vandq_s32(vmvnq_s32(VectorNot), Vector); -#elif defined(GI_SSE2_INTRINSICS) - return _mm_andnot_si128(VectorNot, Vector); -#else - return (~VectorNot) & Vector; +enum GiSimdType GiGetSimdType() { + //! override by special macro to insure ci have test naive and sse2 + //! now we do not imp GI_AVX to now and x64 ci device will test GI_SSE42 + //! now arm ci device will test GI_NEON + //! insure test GI_SSE2 by command: + //! --copt -march=core2 --copt -mno-sse4.2 + //! --copt -mno-sse3 --copt -DGI_TEST_SSE2 + //! insure test GI_NAIVE by command: + //! --copt -DGI_TEST_SSE2 + //! DNN code at least need sse2 at x86 + //! so we can not test GI_NAIVE by + //! --copt -march=core2 --copt -mno-sse4.2 + //! --copt -mno-sse3 --copt -mno-sse2 + //! --copt -DGI_TEST_NAIVE + //! about CMake, can override build flags to CMAKE_CXX_FLAGS/CMAKE_C_FLAGS by + //! 
EXTRA_CMAKE_ARGS when use scripts/cmake-build/*.sh +#if defined(GI_TEST_NAIVE) +#undef __gi_simd_type +#define __gi_simd_type GI_NAIVE +#elif defined(GI_TEST_SSE2) +#undef __gi_simd_type +#define __gi_simd_type GI_SSE2 #endif + + return __gi_simd_type; } GI_FORCEINLINE -GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { +GI_FLOAT32_t GiBroadcastFloat32(float Value) { #if defined(GI_NEON_INTRINSICS) - return veorq_s32(Vector1, Vector2); + return vdupq_n_f32(Value); #elif defined(GI_SSE2_INTRINSICS) - return _mm_xor_si128(Vector1, Vector2); + return _mm_set1_ps(Value); +#elif defined(GI_RVV_INTRINSICS) + return vfmv_v_f_f32m1(Value, GI_SIMD_LEN_BYTE / sizeof(float)); #else - return Vector1 ^ Vector2; + GI_FLOAT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = Value; + } + return ret; #endif } GI_FORCEINLINE -GI_FLOAT32_t GiBroadcastFloat32(float Value) { +GI_INT8_t GiBroadcastInt8(int8_t Value) { #if defined(GI_NEON_INTRINSICS) - return vdupq_n_f32(Value); + return vdupq_n_s8(Value); #elif defined(GI_SSE2_INTRINSICS) - return _mm_set1_ps(Value); + return _mm_set1_epi8(Value); +#elif defined(GI_RVV_INTRINSICS) + return vmv_v_x_i8m1(Value, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else - GI_FLOAT32_t ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + GI_INT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { ret[i] = Value; } return ret; @@ -411,6 +644,8 @@ GI_INT32_t GiBroadcastInt32(int32_t Value) { return vdupq_n_s32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_set1_epi32(Value); +#elif defined(GI_RVV_INTRINSICS) + return vmv_v_x_i32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_INT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -421,54 +656,55 @@ GI_INT32_t GiBroadcastInt32(int32_t Value) { } GI_FORCEINLINE -GI_INT8_t GiBroadcastInt8(int8_t Value) { +GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) - return vdupq_n_s8(Value); + return vandq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) - return _mm_set1_epi8(Value); + return _mm_and_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vand_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else - GI_INT8_t ret; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { - ret[i] = Value; - } - return ret; + return Vector1 & Vector2; #endif } GI_FORCEINLINE -GiSimdType GiGetSimdType() { - //! override by special macro to insure ci have test naive and sse2 - //! now we do not imp GI_AVX to now and x64 ci device will test GI_SSE42 - //! now arm ci device will test GI_NEON - //! insure test GI_SSE2 by command: - //! --copt -march=core2 --copt -mno-sse4.2 - //! --copt -mno-sse3 --copt -DGI_TEST_SSE2 - //! insure test GI_NAIVE by command: - //! --copt -DGI_TEST_SSE2 - //! DNN code at least need sse2 at x86 - //! so we can not test GI_NAIVE by - //! --copt -march=core2 --copt -mno-sse4.2 - //! --copt -mno-sse3 --copt -mno-sse2 - //! --copt -DGI_TEST_NAIVE - //! about CMake, can override build flags to CMAKE_CXX_FLAGS/CMAKE_C_FLAGS by - //! 
EXTRA_CMAKE_ARGS when use scripts/cmake-build/*.sh -#if defined(GI_TEST_NAIVE) -#undef __gi_simd_type -#define __gi_simd_type GI_NAIVE -#elif defined(GI_TEST_SSE2) -#undef __gi_simd_type -#define __gi_simd_type GI_SSE2 +GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vorrq_s32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_or_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + return Vector1 | Vector2; #endif - - return __gi_simd_type; } -__attribute__((unused)) const GI_INT8_t vzero_int8 = GiBroadcastInt8(0); -__attribute__((unused)) const GI_INT32_t vzero = GiBroadcastInt32(0); -__attribute__((unused)) const GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); -__attribute__((unused)) const GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); -__attribute__((unused)) const GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); -__attribute__((unused)) const GI_FLOAT32_t vfmin_int8 = GiBroadcastFloat32(-128.0f); -__attribute__((unused)) const GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); +GI_FORCEINLINE +GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vandq_s32(vmvnq_s32(VectorNot), Vector); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_andnot_si128(VectorNot, Vector); +#elif defined(GI_RVV_INTRINSICS) + GI_INT32_t not_v = vnot_v_i32m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + return vand_vv_i32m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + return (~VectorNot) & Vector; +#endif +} +GI_FORCEINLINE +GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return veorq_s32(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_xor_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vxor_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + return Vector1 ^ Vector2; +#endif +} // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/general_intrinsic/gi_float.h b/dnn/src/fallback/general_intrinsic/gi_float.h index be5b2812..bbde866a 100644 --- a/dnn/src/fallback/general_intrinsic/gi_float.h +++ b/dnn/src/fallback/general_intrinsic/gi_float.h @@ -8,6 +8,8 @@ GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) { return vreinterpretq_s32_f32(In); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(In); +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_f32m1_i32m1(In); #else return (GI_INT32_t)In; #endif @@ -19,6 +21,8 @@ GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) { return vreinterpretq_u32_f32(In); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(In); +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_f32m1_u32m1(In); #else return (GI_UINT32_t)In; #endif @@ -30,6 +34,8 @@ GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) { return vreinterpretq_f32_s32(Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_castsi128_ps(Vector); +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_i32m1_f32m1(Vector); #else return (GI_FLOAT32_t)Vector; #endif @@ -41,6 +47,8 @@ GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) { return vreinterpretq_f32_u32(Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_castsi128_ps(Vector); +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_u32m1_f32m1(Vector); #else return (GI_FLOAT32_t)Vector; #endif @@ -52,12 +60,18 @@ GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) { #if 
__ARM_ARCH >= 8 return vcvtaq_s32_f32(Vector); #else - float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vfzero), vfhalf, vfneg_half); + float32x4_t vinc0 = vbslq_f32( + vcgeq_f32(Vector, GiBroadcastFloat32(0.0f)), GiBroadcastFloat32(0.5f), + GiBroadcastFloat32(-0.5f)); return vcvtq_s32_f32(vaddq_f32(Vector, vinc0)); #endif #elif defined(GI_SSE42_INTRINSICS) - __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero)); + __m128 vinc0 = _mm_blendv_ps( + GiBroadcastFloat32(-0.5f), GiBroadcastFloat32(0.5f), + _mm_cmpge_ps(Vector, GiBroadcastFloat32(0.0f))); return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0)); +#elif defined(GI_RVV_INTRINSICS) + return vfcvt_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_INT32_t ret; GI_INT32_NAIVE_t tmp_ret; @@ -77,6 +91,16 @@ GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) { return vcvtq_s32_f32(Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_cvttps_epi32(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! TODO: vfcvt_rtz_x_f_v_i32m1 is RVV 1.0 api, now xuantie D1 only support 0p7 + //! as a workaround, we imp this API by naive + //! return vfcvt_rtz_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); + GI_FLOAT32_FIXLEN_t src = GiFloat32Type2FixLenType(Vector); + GI_INT32_FIXLEN_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = (int32_t)(src[i]); + } + return GiFixLenType2GiInt32Type(ret); #else GI_INT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -92,6 +116,8 @@ GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) { return vcvtq_f32_s32(Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_cvtepi32_ps(Vector); +#elif defined(GI_RVV_INTRINSICS) + return vfcvt_f_x_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -107,6 +133,8 @@ GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) { return vld1q_dup_f32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_load_ps1(Value); +#elif defined(GI_RVV_INTRINSICS) + return GiBroadcastFloat32(*Value); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -136,6 +164,8 @@ GI_FLOAT32_t GiLoadFloat32(const float* Buffer) { return _mm_load_ps(Buffer); else return _mm_loadu_ps(Buffer); +#elif defined(GI_RVV_INTRINSICS) + return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -151,8 +181,9 @@ GI_FLOAT32_V2_t GiLoadFloat32V2(const float* Buffer) { return vld1q_f32_x2(Buffer); #else GI_FLOAT32_V2_t v; - v.val[0] = GiLoadFloat32(Buffer); - v.val[1] = GiLoadFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float)); + GiSetSubVectorFloat32V2(v, 0, GiLoadFloat32(Buffer)); + GiSetSubVectorFloat32V2( + v, 1, GiLoadFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float))); return v; #endif @@ -171,6 +202,8 @@ GI_FLOAT32_t GiLoadFloat32LowHalf(const float* Buffer) { high.m64_f32[1] = 0; __m128i res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high)); return _M128(res); +#elif defined(GI_RVV_INTRINSICS) + return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float) / 2); #else GI_FLOAT32_t ret; memset(&ret, 0, sizeof(GI_FLOAT32_t)); @@ -194,6 +227,8 @@ GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) { __m128 res; res = _mm_mul_ps(c, b); return _mm_add_ps(a, res); +#elif defined(GI_RVV_INTRINSICS) + return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; for 
(size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -211,6 +246,14 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiUzpqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) { v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); return v32x4; +#elif defined(GI_RVV_INTRINSICS) + //! may need optimize + float tmp[GI_SIMD_LEN_BYTE / sizeof(float) * 2] = {0}; + vse32_v_f32m1(tmp, a, GI_SIMD_LEN_BYTE / sizeof(float)); + vse32_v_f32m1( + tmp + GI_SIMD_LEN_BYTE / sizeof(float), b, + GI_SIMD_LEN_BYTE / sizeof(float)); + return vlseg2e32_v_f32m1x2(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_V2_t ret; ret.val[0][0] = a[0]; @@ -233,6 +276,8 @@ GI_FORCEINLINE float32x2_t GiDupFloat32(float a) { res.m64_f32[0] = a; res.m64_f32[1] = a; return res; +#elif defined(GI_RVV_INTRINSICS) + return GiBroadcastFloat32(a); #else float32x2_t res; res[0] = a; @@ -249,6 +294,8 @@ GI_FORCEINLINE float32x2_t GiLdFloat32(float const* ptr) { res.m64_f32[0] = *(ptr); res.m64_f32[1] = *(ptr + 1); return res; +#elif defined(GI_RVV_INTRINSICS) + return vle32_v_f32m1(ptr, 2); #else float32x2_t res; res[0] = *(ptr); @@ -266,6 +313,8 @@ GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) { res = _mm_add_ps(_pM128(a), _pM128(b)); // SSE, use only low 64 bits _M64f(res64, res); return res64; +#elif defined(GI_RVV_INTRINSICS) + return vfadd_vv_f32m1(a, b, 2); #else float32x2_t res; res[0] = a[0] + b[0]; @@ -280,6 +329,10 @@ GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) { GI_FORCEINLINE float __gi_vget_lane_f32(float32x2_t v, const int lane) { #if defined(GI_SSE2_INTRINSICS) return _sse_vget_lane_f32(v, lane); +#elif defined(GI_RVV_INTRINSICS) + float ret[2]; + vse32_v_f32m1(ret, v, 2); + return ret[lane]; #else return v[lane]; #endif @@ -297,6 +350,11 @@ __gi_vset_lane_f32(float32_t value, float32x2_t vec, int lane) { res = vec; res.m64_f32[lane] = value; return res; +#elif defined(GI_RVV_INTRINSICS) + float tmp[2]; + vse32_v_f32m1(tmp, vec, 2); + tmp[lane] = value; + return vle32_v_f32m1(tmp, 2); #else float32x2_t res; res = vec; @@ -314,6 +372,8 @@ GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) { *(ptr) = val.m64_f32[0]; *(ptr + 1) = val.m64_f32[1]; return; +#elif defined(GI_RVV_INTRINSICS) + return vse32_v_f32m1(ptr, val, 2); #else *(ptr) = val[0]; *(ptr + 1) = val[1]; @@ -330,6 +390,8 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) { v.val[1] = GiLoadFloat32((Buffer + 4)); v = GiUzpqFloat32(v.val[0], v.val[1]); return v; +#elif defined(GI_RVV_INTRINSICS) + return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_V2_t ret; ret.val[0][0] = Buffer[0]; @@ -351,6 +413,16 @@ GI_FORCEINLINE GI_FLOAT32_V2_t GiLd2qFloat32(const float* Buffer) { #else GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) { +#if defined(GI_RVV_INTRINSICS) + int t_count = GI_SIMD_LEN_BYTE / sizeof(float); + int a_count = t_count - n; + float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; + float tmp_a[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(tmp_a, a, GI_SIMD_LEN_BYTE / sizeof(float)); + memcpy(tmp, tmp_a + n, a_count * sizeof(float)); + vse32_v_f32m1(tmp + a_count, b, n); + return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); +#else GI_FLOAT32_t ret; int t_count = GI_SIMD_LEN_BYTE / sizeof(float); int a_count = t_count - n; @@ -361,6 +433,7 @@ __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) { ret[i + a_count] = 
b[i]; } return ret; +#endif } #define GiExtqFloat32(a, b, n) __naive_gi_vextq_f32(a, b, n) #endif @@ -372,6 +445,9 @@ GI_FLOAT32_t GiMultiplySubFloat32( return vmlsq_f32(VectorSum, Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + return vfnmsub_vv_f32m1( + Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -449,6 +525,12 @@ __gi_vld1q_lane_f32(const float* Buffer, GI_FLOAT32_t src, const int n) { GI_FLOAT32_t p; p = _mm_set1_ps(*(Buffer)); return _MM_INSERT_PS(src, p, _INSERTPS_NDX(0, n)); +#elif defined(GI_RVV_INTRINSICS) + //! mask will use more instruct + float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(tmp, src, GI_SIMD_LEN_BYTE / sizeof(float)); + tmp[n] = *Buffer; + return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; memcpy(&ret, &src, sizeof(GI_FLOAT32_t)); @@ -479,11 +561,20 @@ __gi_vsetq_lane_f32(float value, GI_FLOAT32_t vec, const int lane) { #else GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half( GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { +#if defined(GI_RVV_INTRINSICS) + float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float)); + + return vfmadd_vf_f32m1( + b, tmp[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2], a, + GI_SIMD_LEN_BYTE / sizeof(float)); +#else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { - ret[i] = a[i] + (b[i] * v[lane + 2]); + ret[i] = a[i] + (b[i] * v[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2]); } return ret; +#endif } #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \ __naive_gi_vmlaq_lane_f32_high_half(a, b, v, lane) @@ -498,11 +589,18 @@ GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half( #else GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_low_half( GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) { +#if defined(GI_RVV_INTRINSICS) + float tmp[GI_SIMD_LEN_BYTE / sizeof(float) / 2]; + vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float) / 2); + + return vfmadd_vf_f32m1(b, tmp[lane], a, GI_SIMD_LEN_BYTE / sizeof(float)); +#else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { ret[i] = a[i] + (b[i] * v[lane]); } return ret; +#endif } #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \ __naive_gi_vmlaq_lane_f32_low_half(a, b, v, lane) @@ -514,6 +612,8 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) { vst1q_f32(Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_ps(Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse32_v_f32m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); #else for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { Buffer[i] = Vector[i]; @@ -526,8 +626,10 @@ void GiStoreFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1q_f32_x2(Buffer, Vector); #else - GiStoreFloat32(Buffer, Vector.val[0]); - GiStoreFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float), Vector.val[1]); + GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(Vector, 0)); + GiStoreFloat32( + Buffer + GI_SIMD_LEN_BYTE / sizeof(float), + GiGetSubVectorFloat32V2(Vector, 1)); #endif } @@ -543,6 +645,14 @@ void GiStoreFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ _mm_store_ss(Buffer, 
_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ } +#elif defined(GI_RVV_INTRINSICS) + +#define GISTORELANEFLOAT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ + float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \ + vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \ + *Buffer = tmp[i]; \ + } #else #define GISTORELANEFLOAT32(i) \ GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ @@ -568,6 +678,14 @@ GISTORELANEFLOAT32(3) GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ } +#elif defined(GI_RVV_INTRINSICS) + +#define GIEXTRACTLANEFLOAT32(i) \ + GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ + float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \ + vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \ + return tmp[i]; \ + } #else #define GIEXTRACTLANEFLOAT32(i) \ GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ @@ -590,6 +708,26 @@ GI_FLOAT32_V2_t GiZipqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { f32x4.val[0] = _mm_unpacklo_ps(Vector1, Vector2); f32x4.val[1] = _mm_unpackhi_ps(Vector1, Vector2); return f32x4; +#elif defined(GI_RVV_INTRINSICS) + vfloat32m2_t d = vundefined_f32m2(); + d = vset_v_f32m1_f32m2(d, 0, Vector1); + d = vset_v_f32m1_f32m2(d, 1, Vector2); + vuint32m2_t index; +#if GI_SIMD_LEN_BYTE == 16 + uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + index = vle32_v_u32m2(index_128, 8); +#else + uint32_t* index_p = (uint32_t*)&index; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + index_p[2 * i] = i; + index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); + } +#endif + vfloat32m2_t g_d = + vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); + vfloat32m1_t v0 = vget_v_f32m2_f32m1(g_d, 0); + vfloat32m1_t v1 = vget_v_f32m2_f32m1(g_d, 1); + return vcreate_f32m1x2(v0, v1); #else GI_FLOAT32_V2_t ret; ret.val[0][0] = Vector1[0]; @@ -610,9 +748,11 @@ void GiStoreZipFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) { vst2q_f32(Buffer, Vector); #else GI_FLOAT32_V2_t tmp; - tmp = GiZipqFloat32(Vector.val[0], Vector.val[1]); - GiStoreFloat32(Buffer, tmp.val[0]); - GiStoreFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float), tmp.val[1]); + tmp = GiZipqFloat32( + GiGetSubVectorFloat32V2(Vector, 0), GiGetSubVectorFloat32V2(Vector, 1)); + GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(tmp, 0)); + GiStoreFloat32( + Buffer + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V2(tmp, 1)); #endif } @@ -625,6 +765,24 @@ GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) return zipped.val[0]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpacklo_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + vfloat32m2_t d = vundefined_f32m2(); + d = vset_v_f32m1_f32m2(d, 0, Vector1); + d = vset_v_f32m1_f32m2(d, 1, Vector2); + vuint32m2_t index; +#if GI_SIMD_LEN_BYTE == 16 + uint32_t index_128[4] = {0, 4, 1, 5}; + index = vle32_v_u32m2(index_128, 4); +#else + uint32_t* index_p = (uint32_t*)&index; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float) / 2; i++) { + index_p[2 * i] = i; + index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); + } +#endif + vfloat32m2_t g_d = + vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); + return vget_v_f32m2_f32m1(g_d, 0); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { @@ -644,6 
+802,24 @@ GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) return zipped.val[1]; #elif defined(GI_SSE2_INTRINSICS) return _mm_unpackhi_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + vfloat32m2_t d = vundefined_f32m2(); + d = vset_v_f32m1_f32m2(d, 0, Vector1); + d = vset_v_f32m1_f32m2(d, 1, Vector2); + vuint32m2_t index; +#if GI_SIMD_LEN_BYTE == 16 + uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + index = vle32_v_u32m2(index_128, 8); +#else + uint32_t* index_p = (uint32_t*)&index; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + index_p[2 * i] = i; + index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); + } +#endif + vfloat32m2_t g_d = + vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2); + return vget_v_f32m2_f32m1(g_d, 1); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { @@ -660,6 +836,8 @@ GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vaddq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfadd_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 + Vector2; #endif @@ -671,6 +849,8 @@ GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vsubq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_sub_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfsub_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 - Vector2; #endif @@ -682,6 +862,8 @@ GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vmulq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_mul_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfmul_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 * Vector2; #endif @@ -694,6 +876,8 @@ GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) { #elif defined(GI_SSE2_INTRINSICS) GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler); return _mm_mul_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfmul_vf_f32m1(Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 * Scaler; #endif @@ -708,6 +892,9 @@ GI_FLOAT32_t GiMultiplyAddFloat32( return _mm_fmadd_ps(Vector1, Vector2, VectorSum); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum); +#elif defined(GI_RVV_INTRINSICS) + return vfmadd_vv_f32m1( + Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 * Vector2 + VectorSum; #endif @@ -720,6 +907,8 @@ GI_FLOAT32_t GiMultiplyAddScalarFloat32( return v_fma_n_f32(VectorSum, Vector, Scalar); #elif defined(GI_SSE2_INTRINSICS) return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); +#elif defined(GI_RVV_INTRINSICS) + return vfmadd_vf_f32m1(Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float)); #else return VectorSum + Vector * Scalar; #endif @@ -767,6 +956,8 @@ GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vmulq_f32(Vector1, recp); #elif defined(GI_SSE2_INTRINSICS) return _mm_div_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else return Vector1 / Vector2; #endif @@ -779,6 +970,9 @@ GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #elif 
defined(GI_SSE2_INTRINSICS) GI_FLOAT32_t two = _mm_set1_ps(2.0f); return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + GI_FLOAT32_t two = GiBroadcastFloat32(2.0f); + return vfnmsub_vv_f32m1(Vector1, Vector2, two, GI_SIMD_LEN_BYTE / sizeof(float)); #else return (2.0f - Vector1 * Vector2); #endif @@ -791,6 +985,9 @@ GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) { #elif defined(GI_SSE2_INTRINSICS) GI_FLOAT32_t ones = _mm_set1_ps(1.0f); return _mm_div_ps(ones, Vector); +#elif defined(GI_RVV_INTRINSICS) + GI_FLOAT32_t ones = GiBroadcastFloat32(1.0f); + return vfdiv_vv_f32m1(ones, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); #else //! FIXME: neon or sse always have low accuracy than 1/x return 1 / Vector; @@ -804,6 +1001,8 @@ GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) { #elif defined(GI_SSE2_INTRINSICS) GI_FLOAT32_t zero = _mm_set1_ps(0.0f); return _mm_sub_ps(zero, Vector); +#elif defined(GI_RVV_INTRINSICS) + return vfneg_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float)); #else return -Vector; #endif @@ -815,6 +1014,12 @@ GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vcgtq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + vbool32_t b = + vmfgt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); + GI_UINT32_t ret; + memcpy(&ret, &b, GI_SIMD_LEN_BYTE); + return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_UINT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -830,6 +1035,12 @@ GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vcleq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + vbool32_t b = + vmfle_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); + GI_UINT32_t ret; + memcpy(&ret, &b, GI_SIMD_LEN_BYTE); + return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_UINT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -845,6 +1056,12 @@ GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vcltq_f32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + vbool32_t b = + vmflt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); + GI_UINT32_t ret; + memcpy(&ret, &b, GI_SIMD_LEN_BYTE); + return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_UINT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -920,6 +1137,8 @@ GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vmaxq_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) return _mm_max_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t max; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -935,6 +1154,8 @@ GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { return vminq_f32(Vector1, Vector2); #elif defined(GI_NEON32_INTRINSICS) return _mm_min_ps(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t min; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -948,6 +1169,15 @@ 
GI_FORCEINLINE GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vmaxq_f32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + //! vfmax_vv_f32m1 NAN logic is not same with NEON, imp with naive + GI_FLOAT32_FIXLEN_t a, b, ret; + a = GiFloat32Type2FixLenType(Vector1); + b = GiFloat32Type2FixLenType(Vector2); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = MAX_NAN(a[i], b[i]); + } + return GiFixLenType2GiFloat32Type(ret); #else //! _mm_max_ps does not fellow the IEEE standard when input is NAN, so //! implement by C code @@ -963,6 +1193,15 @@ GI_FORCEINLINE GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { #if defined(GI_NEON_INTRINSICS) return vminq_f32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + //! vfmin_vv_f32m1 NAN logic is not same with NEON, imp with naive + GI_FLOAT32_FIXLEN_t a, b, ret; + a = GiFloat32Type2FixLenType(Vector1); + b = GiFloat32Type2FixLenType(Vector2); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret[i] = MIN_NAN(a[i], b[i]); + } + return GiFixLenType2GiFloat32Type(ret); #else //! _mm_min_ps does not fellow the IEEE standard when input is NAN, so //! implement by C code @@ -999,6 +1238,12 @@ float GiReduceAddFloat32(GI_FLOAT32_t Vector) { Vector = GiAddFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); +#elif defined(GI_RVV_INTRINSICS) + vfloat32m1_t redsum = vundefined_f32m1(); + //! use Ordered sum, may Unordered sum more fast with vfredusum_vs_f32m1_f32m1 + redsum = vfredosum_vs_f32m1_f32m1( + redsum, Vector, GiBroadcastFloat32(0.0f), GI_SIMD_LEN_BYTE / sizeof(float)); + return GiExtractLane0Float32(redsum); #else float ret = 0; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -1021,6 +1266,14 @@ float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) { Vector = GiMultiplyFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! RVV do not have reduce mul, imp with naive + float ret = 1; + GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret *= v[i]; + } + return ret; #else float ret = 1; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -1049,6 +1302,14 @@ float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) { Vector = GiMaxNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! vfredmax_vs_f32m1_f32m1 can not handle NAN case, imp with naive + GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); + float ret = v[0]; + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret = MAX_NAN(ret, v[i]); + } + return ret; #else float ret = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -1074,6 +1335,14 @@ float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) { Vector = GiMinNanFloat32( Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); return GiExtractLane0Float32(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! 
vfredmin_vs_f32m1_f32m1 can not handle NAN case, imp with naive + GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector); + float ret = v[0]; + for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + ret = MIN_NAN(ret, v[i]); + } + return ret; #else float ret = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -1094,6 +1363,8 @@ GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) { } value; value.int_val = 0x7fffffff; return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val)); +#elif defined(GI_RVV_INTRINSICS) + return vfabs_v_f32m1(Vector1, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { @@ -1156,6 +1427,8 @@ GI_FORCEINLINE GI_FLOAT32_t GiReinterpretqS64ToFloat32(GI_INT64_t a) { return vreinterpretq_f32_s64(a); #elif defined(GI_SSE2_INTRINSICS) return _M128(a); +#elif defined(GI_RVV_INTRINSICS) + return vle32_v_f32m1((float*)&a, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t ret; memcpy(&ret, &a, sizeof(GI_FLOAT32_t)); @@ -1168,6 +1441,10 @@ GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) { return vreinterpretq_s64_f32(a); #elif defined(GI_SSE2_INTRINSICS) return _M128i(a); +#elif defined(GI_RVV_INTRINSICS) + GI_INT64_t ret; + vse32_v_f32m1((float*)&ret, a, GI_SIMD_LEN_BYTE / sizeof(float)); + return ret; #else GI_INT64_t ret; memcpy(&ret, &a, sizeof(GI_INT64_t)); @@ -1177,6 +1454,16 @@ GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) { #if defined(GI_NEON_INTRINSICS) #define GiSimdFmaLane(a, b, c, d) vfmaq_laneq_f32(a, b, c, d) +#elif defined(GI_RVV_INTRINSICS) +#define __rvv_fmaq_laneq_f32(__a, __b, __c, __lane) \ + __extension__({ \ + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \ + vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \ + GI_FLOAT32_t __ret = vfmadd_vf_f32m1( \ + __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \ + __ret; \ + }) +#define GiSimdFmaLane(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) #else GI_FORCEINLINE GI_FLOAT32_t ___gi_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) { @@ -1262,6 +1549,9 @@ ___gi_vfmaq_laneq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, int l) { __ret; \ }) +#elif defined(GI_RVV_INTRINSICS) +#define GiMlaqLowLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) +#define GiMlaqHighLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d) #else //! naive #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \ @@ -1303,6 +1593,16 @@ SSE_VFMSQ_LANEQ_F32(2) SSE_VFMSQ_LANEQ_F32(3) #undef SSE_VFMSQ_LANEQ_F32 #define GiFmsqLaneQFloat32(a, b, v, lane) sse_vfmsq_lane_##lane##_q_f32(a, b, v) +#elif defined(GI_RVV_INTRINSICS) +#define __rvv_fmsq_lane_float32(__a, __b, __c, __lane) \ + __extension__({ \ + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \ + vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \ + GI_FLOAT32_t __ret = vfnmsub_vf_f32m1( \ + __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \ + __ret; \ + }) +#define GiFmsqLaneQFloat32(a, b, c, d) __rvv_fmsq_lane_float32(a, b, c, d) #else //! 
naive GI_FORCEINLINE GI_FLOAT32_t __naive_GiFmsqLaneQFloat32( @@ -1324,6 +1624,11 @@ GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) { __m128i res; res = _mm_unpacklo_epi64(_pM128i(a), _pM128i(b)); return _M128(res); +#elif defined(GI_RVV_INTRINSICS) + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(t, a, 2); + vse32_v_f32m1(t + 2, b, 2); + return vle32_v_f32m1(t, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_t res; res[0] = a[0]; @@ -1337,6 +1642,8 @@ GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) { GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) { #if defined(GI_NEON_INTRINSICS) return vget_low_f32(a); +#elif defined(GI_RVV_INTRINSICS) + return vmv_v_v_f32m1(a, 2); #else return ___gi_vget_low_f32(a); #endif @@ -1345,6 +1652,12 @@ GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) { GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) { #if defined(GI_NEON_INTRINSICS) return vget_high_f32(a); +#elif defined(GI_RVV_INTRINSICS) + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(t, a, GI_SIMD_LEN_BYTE / sizeof(float)); + return vle32_v_f32m1( + t + GI_SIMD_LEN_BYTE / sizeof(float) / 2, + GI_SIMD_LEN_BYTE / sizeof(float) / 2); #else return ___gi_vget_high_f32(a); #endif @@ -1358,6 +1671,13 @@ GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) { res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1]; res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1]; return res; +#elif defined(GI_RVV_INTRINSICS) + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(t, a, 2); + vse32_v_f32m1(t + 2, b, 2); + t[0] = t[0] + t[1]; + t[1] = t[2] + t[3]; + return vle32_v_f32m1(t, 2); #else float32x2_t res; res[0] = a[0] + a[1]; @@ -1374,6 +1694,13 @@ GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) { res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]); res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]); return res; +#elif defined(GI_RVV_INTRINSICS) + float t[GI_SIMD_LEN_BYTE / sizeof(float)]; + vse32_v_f32m1(t, a, 2); + vse32_v_f32m1(t + 2, b, 2); + t[0] = MAX_NAN(t[0], t[1]); + t[1] = MAX_NAN(t[2], t[3]); + return vle32_v_f32m1(t, 2); #else float32x2_t res; res[0] = MAX_NAN(a[0], a[1]); @@ -1408,6 +1735,8 @@ GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) { v.val[1] = _mm_movehl_ps(tmp3, v.val[1]); v.val[2] = _mm_movehl_ps(tmp2, tmp0); return v; +#elif defined(GI_RVV_INTRINSICS) + return vlseg3e32_v_f32m1x3(ptr, GI_SIMD_LEN_BYTE / sizeof(float)); #else GI_FLOAT32_V3_t ret; for (size_t i = 0; i < 3; i++) { @@ -1440,6 +1769,35 @@ void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) { GiStoreFloat32(ptr, v.val[0]); GiStoreFloat32((ptr + 4), v.val[1]); GiStoreFloat32((ptr + 8), v.val[2]); +#elif defined(GI_RVV_INTRINSICS) + vfloat32m4_t d = vundefined_f32m4(); + d = vset_v_f32m1_f32m4(d, 0, GiGetSubVectorFloat32V3(val, 0)); + d = vset_v_f32m1_f32m4(d, 1, GiGetSubVectorFloat32V3(val, 1)); + d = vset_v_f32m1_f32m4(d, 2, GiGetSubVectorFloat32V3(val, 2)); + vuint32m4_t index; +#if GI_SIMD_LEN_BYTE == 16 + uint32_t index_128[16] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 0, 0, 0, 0}; + index = vle32_v_u32m4(index_128, 16); +#else + uint32_t* index_p = (uint32_t*)&index; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { + index_p[3 * i] = i; + index_p[3 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float); + index_p[3 * i + 2] = i + GI_SIMD_LEN_BYTE / sizeof(float) * 2; + } +#endif + vfloat32m4_t g_d = + vrgather_vv_f32m4(d, index, GI_SIMD_LEN_BYTE / sizeof(float) 
* 3); + vfloat32m1_t v0 = vget_v_f32m4_f32m1(g_d, 0); + vfloat32m1_t v1 = vget_v_f32m4_f32m1(g_d, 1); + vfloat32m1_t v2 = vget_v_f32m4_f32m1(g_d, 2); + GI_FLOAT32_V3_t tmp = vcreate_f32m1x3(v0, v1, v2); + GiStoreFloat32(ptr, GiGetSubVectorFloat32V3(tmp, 0)); + GiStoreFloat32( + ptr + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V3(tmp, 1)); + GiStoreFloat32( + ptr + GI_SIMD_LEN_BYTE / sizeof(float) * 2, + GiGetSubVectorFloat32V3(tmp, 2)); #else for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { *ptr++ = val.val[0][i]; @@ -1448,3 +1806,13 @@ void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) { } #endif } + +GI_FORCEINLINE +GI_FLOAT32_t GiDivFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { +#if defined(GI_RVV_INTRINSICS) + return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float)); +#else + //! neon, ssex and naive can auto call builtin function + return Vector1 / Vector2; +#endif +} diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index 7f9d61fd..5766bfa1 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -2,12 +2,27 @@ #include "gi_common.h" +GI_FORCEINLINE +GI_INT32_t GiReinterpretInt8AsInt32(GI_INT8_t In) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s32_s8(In); +#elif defined(GI_SSE2_INTRINSICS) + return (GI_INT32_t)In; +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_i8m1_i32m1(In); +#else + return (GI_INT32_t)In; +#endif +} + GI_FORCEINLINE GI_UINT32_t GiBroadcastUint32(int32_t Value) { #if defined(GI_NEON_INTRINSICS) return vdupq_n_u32(Value); #elif defined(GI_SSE2_INTRINSICS) return _mm_set1_epi32(Value); +#elif defined(GI_RVV_INTRINSICS) + return vmv_v_x_u32m1(Value, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_UINT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -23,6 +38,8 @@ GI_INT32_t GiLoadInt32(const void* Buffer) { return vld1q_s32((int32_t*)Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(GI_RVV_INTRINSICS) + return vle32_v_i32m1((int32_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_INT32_t ret; const int32_t* ptr = (int32_t*)Buffer; @@ -39,6 +56,8 @@ GI_INT16_t GiLoadInt16(const void* Buffer) { return vld1q_s16((int16_t*)Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(GI_RVV_INTRINSICS) + return vle16_v_i16m1((int16_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int16_t)); #else GI_INT16_t ret; const int16_t* ptr = (int16_t*)Buffer; @@ -55,6 +74,8 @@ GI_INT8_t GiLoadInt8(const void* Buffer) { return vld1q_s8((int8_t*)Buffer); #elif defined(GI_SSE2_INTRINSICS) return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(GI_RVV_INTRINSICS) + return vle8_v_i8m1((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t ret; const int8_t* ptr = (int8_t*)Buffer; @@ -71,6 +92,8 @@ void GiStoreInt32(void* Buffer, GI_INT32_t Vector) { vst1q_s32((int32_t*)Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse32_v_i32m1((int32_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else int32_t* ptr = (int32_t*)Buffer; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -93,6 +116,14 @@ void GiStoreInt32(void* Buffer, GI_INT32_t Vector) { _mm_store_ss( \ (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \ } +#elif 
defined(GI_RVV_INTRINSICS) + +#define GISTORELANEINT32(i) \ + GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \ + int32_t t[GI_SIMD_LEN_BYTE / sizeof(int32_t)]; \ + vse32_v_i32m1(t, Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); \ + *((int32_t*)Buffer) = t[i]; \ + } #else #define GISTORELANEINT32(i) \ GI_FORCEINLINE void GiStoreLane##i##Int32(void* Buffer, GI_INT32_t Vector) { \ @@ -113,6 +144,8 @@ GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) { return vreinterpretq_s8_s32(Vector); #elif defined(GI_SSE2_INTRINSICS) return Vector; +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_i32m1_i8m1(Vector); #else return *(GI_INT8_t*)&Vector; #endif @@ -124,6 +157,8 @@ void GiStoreInt16(void* Buffer, GI_INT16_t Vector) { vst1q_s16((int16_t*)Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse16_v_i16m1((int16_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); #else int16_t* ptr = (int16_t*)Buffer; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { @@ -138,6 +173,8 @@ void GiStoreInt8(void* Buffer, GI_INT8_t Vector) { vst1q_s8((int8_t*)Buffer, Vector); #elif defined(GI_SSE2_INTRINSICS) _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse8_v_i8m1((int8_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else int8_t* ptr = (int8_t*)Buffer; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { @@ -152,6 +189,8 @@ void GiStoreLowInt8(void* Buffer, GI_INT8_t Vector) { vst1_s8((int8_t*)Buffer, vget_low_s8(Vector)); #elif defined(GI_SSE2_INTRINSICS) _mm_storel_epi64((__m128i*)Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse8_v_i8m1((int8_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2); #else int8_t* ptr = (int8_t*)Buffer; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { @@ -166,6 +205,21 @@ void GiStoreHihgInt8(void* Buffer, GI_INT8_t Vector) { vst1_s8((int8_t*)Buffer, vget_high_s8(Vector)); #elif defined(GI_SSE2_INTRINSICS) _mm_storel_epi64((__m128i*)Buffer, _mm_unpackhi_epi64(Vector, Vector)); +#elif defined(GI_RVV_INTRINSICS) + vuint8m1_t index; +#if GI_SIMD_LEN_BYTE == 16 + uint8_t index_128[16] = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}; + index = vle8_v_u8m1(index_128, 16); +#else + uint8_t* index_p = (uint8_t*)&index; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2; i++) { + index_p[i] = offset + i; + index_p[offset + i] = i; + } +#endif + vint8m1_t g_d = vrgather_vv_i8m1(Vector, index, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vse8_v_i8m1((int8_t*)Buffer, g_d, GI_SIMD_LEN_BYTE / sizeof(int8_t) / 2); #else int8_t* ptr = (int8_t*)Buffer; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) { @@ -181,6 +235,8 @@ GI_INT32_t GiNegInt32(GI_INT32_t Vector) { #elif defined(GI_SSE2_INTRINSICS) GI_INT32_t zero = _mm_set1_epi32(0); return _mm_sub_epi32(zero, Vector); +#elif defined(GI_RVV_INTRINSICS) + return vneg_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return -Vector; #endif @@ -193,6 +249,8 @@ GI_INT8_t GiNegInt8(GI_INT8_t Vector) { #elif defined(GI_SSE2_INTRINSICS) GI_INT32_t zero = _mm_set1_epi8(0); return _mm_sub_epi8(zero, Vector); +#elif defined(GI_RVV_INTRINSICS) + return vneg_v_i8m1(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return -Vector; #endif @@ -209,6 +267,15 @@ GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { res = 
_mm_and_si128(Vector1, Vector2); res = _mm_cmpeq_epi32(res, zero); return _mm_xor_si128(res, one); +#elif defined(GI_RVV_INTRINSICS) + //! rvv uint32_t mask only use bit 0 and 1, imp with naive + GI_UINT32_FIXLEN_t a = GiUint32Type2FixLenType(Vector1); + GI_UINT32_FIXLEN_t b = GiUint32Type2FixLenType(Vector2); + GI_UINT32_FIXLEN_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = a[i] & b[i] ? 0xFFFFFFFF : 0; + } + return GiFixLenType2GiUint32Type(ret); #else GI_UINT32_t ret; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -224,6 +291,8 @@ GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { return vaddq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vadd_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector1 + Vector2; #endif @@ -235,6 +304,8 @@ GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { return vaddq_u32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vadd_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); #else return Vector1 + Vector2; #endif @@ -246,6 +317,8 @@ GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { return vaddq_s16(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi16(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vadd_vv_i16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); #else return Vector1 + Vector2; #endif @@ -257,6 +330,8 @@ GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return vaddq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi8(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vadd_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 + Vector2; #endif @@ -268,6 +343,8 @@ GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { return vsubq_s32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_sub_epi32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vsub_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector1 - Vector2; #endif @@ -279,6 +356,8 @@ GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { return vsubq_u32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_sub_epi32(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vsub_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); #else return Vector1 - Vector2; #endif @@ -290,6 +369,8 @@ GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return vsubq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_sub_epi8(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vsub_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 - Vector2; #endif @@ -303,6 +384,8 @@ GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1); GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2); return _mm_cvttps_epi32(_mm_mul_ps(v0, v1)); +#elif defined(GI_RVV_INTRINSICS) + return vmul_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector1 * Vector2; #endif @@ -320,6 +403,8 @@ GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { res[id] = v1[id] * v2[id]; } return _mm_loadu_si128((__m128i*)res); +#elif 
defined(GI_RVV_INTRINSICS) + return vmul_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 * Vector2; #endif @@ -332,6 +417,9 @@ GI_INT32_t GiMultiplyAddInt32( return vmlaq_s32(Vector1, Vector2, Vector3); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); +#elif defined(GI_RVV_INTRINSICS) + return vmadd_vv_i32m1( + Vector2, Vector3, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector1 + Vector2 * Vector3; #endif @@ -343,6 +431,8 @@ GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vect return vmlaq_s8(Vector1, Vector2, Vector3); #elif defined(GI_SSE2_INTRINSICS) return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); +#elif defined(GI_RVV_INTRINSICS) + return vmadd_vv_i8m1(Vector2, Vector3, Vector1, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 + Vector2 * Vector3; #endif @@ -354,6 +444,8 @@ GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return vandq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_and_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vand_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 & Vector2; #endif @@ -365,6 +457,8 @@ GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { return veorq_u32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_xor_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vxor_vv_u32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); #else return Vector1 ^ Vector2; #endif @@ -376,6 +470,8 @@ GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return vorrq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_or_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vor_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 | Vector2; #endif @@ -387,6 +483,9 @@ GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) { return vandq_s8(vmvnq_s8(VectorNot), Vector); #elif defined(GI_SSE2_INTRINSICS) return _mm_andnot_si128(VectorNot, Vector); +#elif defined(GI_RVV_INTRINSICS) + GI_INT8_t not_v = vnot_v_i8m1(VectorNot, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + return vand_vv_i8m1(not_v, Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t Not = ~VectorNot; return (Not & Vector); @@ -399,6 +498,8 @@ GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return veorq_s8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return _mm_xor_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vxor_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else return Vector1 ^ Vector2; #endif @@ -410,6 +511,8 @@ GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) { return vshlq_n_s32(Vector, 23); #elif defined(GI_SSE2_INTRINSICS) return _mm_slli_epi32(Vector, 23); +#elif defined(GI_RVV_INTRINSICS) + return vsll_vx_i32m1(Vector, 23, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector << 23; #endif @@ -421,6 +524,8 @@ GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) { return vshrq_n_s32(Vector, 23); #elif defined(GI_SSE2_INTRINSICS) return _mm_srai_epi32(Vector, 23); +#elif defined(GI_RVV_INTRINSICS) + return vsra_vx_i32m1(Vector, 23, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else return Vector >> 23; #endif @@ -442,6 +547,11 @@ GI_INT32_t GiAbsInt32(GI_INT32_t Vector) { return vabsq_s32(Vector); #elif defined(GI_SSE42_INTRINSICS) return _mm_abs_epi32(Vector); +#elif defined(GI_RVV_INTRINSICS) + 
//! rvv do not have int abs now + GI_INT32_t shift = vsra_vx_i32m1(Vector, 31, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + GI_INT32_t t_add = vadd_vv_i32m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + return vxor_vv_i32m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_INT32_t ret; GI_INT32_NAIVE_t tmp_ret; @@ -463,6 +573,11 @@ GI_INT16_t GiAbsInt16(GI_INT16_t Vector) { return vabsq_s16(Vector); #elif defined(GI_SSE42_INTRINSICS) return _mm_abs_epi16(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! rvv do not have int abs now + GI_INT16_t shift = vsra_vx_i16m1(Vector, 15, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + GI_INT16_t t_add = vadd_vv_i16m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vxor_vv_i16m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int16_t)); #else GI_INT16_t ret; GI_INT16_NAIVE_t tmp_ret; @@ -483,6 +598,11 @@ GI_INT8_t GiAbsInt8(GI_INT8_t Vector) { return vabsq_s8(Vector); #elif defined(GI_SSE42_INTRINSICS) return _mm_abs_epi8(Vector); +#elif defined(GI_RVV_INTRINSICS) + //! rvv do not have int abs now + GI_INT8_t shift = vsra_vx_i8m1(Vector, 7, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + GI_INT8_t t_add = vadd_vv_i8m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + return vxor_vv_i8m1(t_add, shift, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t ret; GI_INT8_NAIVE_t tmp_ret; @@ -505,6 +625,8 @@ GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { return _mm_max_epi32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + return vmax_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_INT32_t tmp; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -522,6 +644,8 @@ GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { return _mm_min_epi32(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1)); +#elif defined(GI_RVV_INTRINSICS) + return vmin_vv_i32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int32_t)); #else GI_INT32_t tmp; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { @@ -544,6 +668,8 @@ GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return _mm_max_epi8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector1, Vector2)); +#elif defined(GI_RVV_INTRINSICS) + return vmax_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t tmp; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { @@ -561,6 +687,8 @@ GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { return _mm_min_epi8(Vector1, Vector2); #elif defined(GI_SSE2_INTRINSICS) return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector2, Vector1)); +#elif defined(GI_RVV_INTRINSICS) + return vmin_vv_i8m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t tmp; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { @@ -584,6 +712,9 @@ GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { data[i] = o_data[8 + i]; } return _mm_loadu_si128((__m128i*)data); +#elif defined(GI_RVV_INTRINSICS) + vint16m2_t two = vwcvt_x_x_v_i16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + return vget_v_i16m2_i16m1(two, 1); #else GI_INT16_t ret; int8_t* data = (int8_t*)&Vector; @@ -609,6 +740,9 @@ GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { data[i] = o_data[i]; } return _mm_loadu_si128((__m128i*)data); +#elif 
defined(GI_RVV_INTRINSICS) + vint16m2_t two = vwcvt_x_x_v_i16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + return vget_v_i16m2_i16m1(two, 0); #else GI_INT16_t ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); @@ -633,6 +767,9 @@ GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { data[i] = o_data[4 + i]; } return _mm_loadu_si128((__m128i*)data); +#elif defined(GI_RVV_INTRINSICS) + vint32m2_t two = vwcvt_x_x_v_i32m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vget_v_i32m2_i32m1(two, 1); #else GI_INT32_t ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); @@ -657,6 +794,9 @@ GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { data[i] = o_data[i]; } return _mm_loadu_si128((__m128i*)data); +#elif defined(GI_RVV_INTRINSICS) + vint32m2_t two = vwcvt_x_x_v_i32m2(Vector, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vget_v_i32m2_i32m1(two, 0); #else GI_INT32_t ret; size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); @@ -703,8 +843,16 @@ int32_t GiReduceAddInt8(GI_INT8_t Vector) { float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2))); float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3))); return (int16_t)(ret0 + ret1 + ret2 + ret3); +#elif defined(GI_RVV_INTRINSICS) + vint16m1_t redsum = vundefined_i16m1(); + vint16m1_t zero = vmv_v_x_i16m1(0, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + redsum = vwredsum_vs_i8m1_i16m1( + redsum, Vector, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + int16_t ret = 0; + vse16_v_i16m1(&ret, redsum, 1); + return ret; #else - int32_t sum = 0; + int16_t sum = 0; for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { sum += Vector[i]; } @@ -751,6 +899,13 @@ int8_t GiReduceMaxInt8(GI_INT8_t Vector) { float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 2, 2, 2))); float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3))); +#elif defined(GI_RVV_INTRINSICS) + vint8m1_t max = vundefined_i8m1(); + vint8m1_t zero = vmv_v_x_i8m1(0, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + max = vredmax_vs_i8m1_i8m1(max, Vector, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + int8_t ret = 0; + vse8_v_i8m1(&ret, max, 1); + return ret; #else int8_t max = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { @@ -799,6 +954,13 @@ int8_t GiReduceMinInt8(GI_INT8_t Vector) { float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 2, 2, 2))); float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 3, 3, 3))); return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3))); +#elif defined(GI_RVV_INTRINSICS) + vint8m1_t min = vundefined_i8m1(); + vint8m1_t zero = vmv_v_x_i8m1(0, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + min = vredmin_vs_i8m1_i8m1(min, Vector, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + int8_t ret = 0; + vse8_v_i8m1(&ret, min, 1); + return ret; #else int8_t min = Vector[0]; for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { @@ -821,21 +983,40 @@ GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #else - float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vfzero), vfhalf, vfneg_half); + float32x4_t vinc0 = vbslq_f32( + vcgeq_f32(src, GiBroadcastFloat32(0.0f)), GiBroadcastFloat32(0.5f), + GiBroadcastFloat32(-0.5f)); int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0)); int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0)); 
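    //! note: the +/-0.5 bias chosen above, followed by the truncating
    //! float->int conversion in vcvtq_s32_f32, rounds half away from zero;
    //! the vqmovn_s32 narrowing above and the vqmovn_s16 narrowing in the
    //! return below then saturate the value into int16 and finally int8.
    //! a scalar sketch of the same rounding, for illustration only:
    //!   int32_t r = (int32_t)(x + (x >= 0.f ? 0.5f : -0.5f));
    //!   int8_t  q = r > 127 ? 127 : (r < -128 ? -128 : (int8_t)r);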
return vcombine_s8(vqmovn_s16(mid_s16), vqmovn_s16(mid_s16)); #endif #elif defined(GI_SSE42_INTRINSICS) - __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(src, vfzero)); + __m128 vinc0 = _mm_blendv_ps( + GiBroadcastFloat32(-0.5f), GiBroadcastFloat32(0.5f), + _mm_cmpge_ps(src, GiBroadcastFloat32(0.0f))); __m128 vres0 = _mm_add_ps(src, vinc0); vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8); + vres0 = _mm_min_ps( + _mm_max_ps(vres0, GiBroadcastFloat32(-128.0f)), GiBroadcastFloat32(127.0f)); __m128i vepi32_0 = _mm_cvtps_epi32(vres0); __m128i vepi16 = _mm_packs_epi32(vepi32_0, vepi32_0); __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); return vepi8; +#elif defined(GI_RVV_INTRINSICS) + //! TODO: vfcvt_rtz_x_f_v_i32m1 is RVV 1.0 api, now xuantie D1 only support 0p7 + //! as a workaround, we imp this API by naive + GI_INT8_NAIVE_t tmp_ret; + GI_FLOAT32_FIXLEN_t s0 = GiFloat32Type2FixLenType(src); + int length = GI_SIMD_LEN_BYTE / sizeof(float); + for (int i = 0; i < length; i++) { + int8_t data = Saturate(round(s0[i]), -128, 127); + tmp_ret[i] = data; + tmp_ret[length + i] = data; + tmp_ret[2 * length + i] = data; + tmp_ret[3 * length + i] = data; + } + return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t ret; GI_INT8_NAIVE_t tmp_ret; @@ -863,16 +1044,25 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); return vcombine_s8(mid1, mid1); #else - float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vfzero), vfhalf, vfneg_half); - float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vfzero), vfhalf, vfneg_half); + GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); + GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); + float32x4_t vinc0 = vbslq_f32( + vcgeq_f32(vsrc.val[0], GiBroadcastFloat32(0.0f)), vfhalf, vfneg_half); + float32x4_t vinc1 = vbslq_f32( + vcgeq_f32(vsrc.val[1], GiBroadcastFloat32(0.0f)), vfhalf, vfneg_half); int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0)); int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1)); int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1))); return vcombine_s8(mid1, mid1); #endif #elif defined(GI_SSE42_INTRINSICS) - __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero)); - __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero)); + GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); + GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); + GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); + __m128 vinc0 = _mm_blendv_ps( + vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], GiBroadcastFloat32(0.0f))); + __m128 vinc1 = _mm_blendv_ps( + vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], GiBroadcastFloat32(0.0f))); __m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0); __m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1); @@ -880,14 +1070,26 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8); - vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8); + vres0 = _mm_min_ps(_mm_max_ps(vres0, GiBroadcastFloat32(-128.0f)), vfmax_int8); + vres1 = _mm_min_ps(_mm_max_ps(vres1, GiBroadcastFloat32(-128.0f)), vfmax_int8); __m128i vepi32_0 = 
_mm_cvtps_epi32(vres0); __m128i vepi32_1 = _mm_cvtps_epi32(vres1); __m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1); __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); return vepi8; +#elif defined(GI_RVV_INTRINSICS) + //! TODO: vfcvt_rtz_x_f_v_i32m1 is RVV 1.0 api, now xuantie D1 only support 0p7 + //! as a workaround, we imp this API by naive + GI_INT8_NAIVE_t tmp_ret; + GI_FLOAT32_FIXLEN_V2_t s0 = GiFloat32Type2FixLenV2Type(vsrc); + int length = GI_SIMD_LEN_BYTE / sizeof(float); + for (int i = 0; i < 2 * length; i++) { + int8_t data = Saturate(round(s0.val[i / length][i % length]), -128, 127); + tmp_ret[i] = data; + tmp_ret[i + length * 2] = data; + } + return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t ret; GI_INT8_NAIVE_t tmp_ret; @@ -932,6 +1134,11 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { return vcombine_s8(mid1, mid2); #endif #elif defined(GI_SSE42_INTRINSICS) + GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); + GI_FLOAT32_t vfhalf = GiBroadcastFloat32(0.5f); + GI_FLOAT32_t vfneg_half = GiBroadcastFloat32(-0.5f); + GI_FLOAT32_t vfmin_int8 = GiBroadcastFloat32(-128.0f); + GI_FLOAT32_t vfmax_int8 = GiBroadcastFloat32(127.0f); __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero)); __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero)); __m128 vinc2 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[2], vfzero)); @@ -960,6 +1167,20 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { __m128i vepi16_1 = _mm_packs_epi32(vepi32_2, vepi32_3); __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); return vepi8; +#elif defined(GI_RVV_INTRINSICS) + //! TODO: vfcvt_rtz_x_f_v_i32m1 is RVV 1.0 api, now xuantie D1 only support 0p7 + //! as a workaround, we imp this API by naive + GI_INT8_NAIVE_t tmp_ret; + GI_FLOAT32_V4_NAIVE_t s0; + s0.val[0] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 0)); + s0.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 1)); + s0.val[2] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 2)); + s0.val[3] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 3)); + int length = GI_SIMD_LEN_BYTE / sizeof(float); + for (int i = 0; i < 4 * length; i++) { + tmp_ret[i] = Saturate(round(s0.val[i / length][i % length]), -128, 127); + } + return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else GI_INT8_t ret; GI_INT8_NAIVE_t tmp_ret; diff --git a/dnn/test/fallback/gi.cpp b/dnn/test/fallback/gi.cpp index 3dd1ed25..d5b629f7 100644 --- a/dnn/test/fallback/gi.cpp +++ b/dnn/test/fallback/gi.cpp @@ -1,5 +1,11 @@ +#include #include +#if defined(ONLY_BUILD_GI_API) +#include +class FALLBACK : public ::testing::Test {}; +#else #include "test/fallback/fixture.h" +#endif #include "src/fallback/general_intrinsic/gi_float.h" #include "src/fallback/general_intrinsic/gi_int.h" @@ -44,6 +50,10 @@ static void assert_lt( } } +static void force_memset_ret(void* dst, const size_t len) { + memset(dst, 'f', len); +} + TEST_F(FALLBACK, GiGetSimdType) { auto t = GiGetSimdType(); auto should_type = GI_UNKNOWN; @@ -63,6 +73,9 @@ TEST_F(FALLBACK, GiGetSimdType) { #error "code issue happened!!" 
#endif +#elif defined(GI_RVV_INTRINSICS) + should_type = GI_RVV; + #else should_type = GI_NAIVE; #endif @@ -72,6 +85,801 @@ TEST_F(FALLBACK, GiGetSimdType) { ASSERT_EQ(t, should_type); } +TEST_F(FALLBACK, GiReinterpretInt8AsInt32) { + GI_INT32_t ret; + GI_INT8_t src0; + std::vector s0{9, 2, -128, 127, 2, 45, 3, 0, + 11, 2, -128, 127, 2, 55, 3, -1}; + s0.resize(SIMD_LEN_8); + init((int8_t*)&src0, s0, SIMD_LEN_8); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiReinterpretInt8AsInt32(src0); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s0[i], sizeof(int8_t)); + naive.push_back(tmp); + } + + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiGetSubVectorFloat32V2) { + GI_FLOAT32_V2_t src0; + GI_FLOAT32_t ret1, ret2; + std::vector s0{ + -1.0f, 2.2f, -3.4f, 4.5f, 111.0f, 12.2f, -13.4f, -44.5f, + }; + s0.resize(SIMD_LEN * 2); + init((float*)&src0, s0, SIMD_LEN * 2); + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorFloat32V2(src0, 0); + ret2 = GiGetSubVectorFloat32V2(src0, 1); + + std::vector naive1, naive2; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN], sizeof(float)); + naive2.push_back(tmp); + } + + assert_eq((float*)&ret1, naive1, SIMD_LEN); + assert_eq((float*)&ret2, naive2, SIMD_LEN); +} + +TEST_F(FALLBACK, GiSetSubVectorFloat32V2) { + GI_FLOAT32_V2_t ret; + GI_FLOAT32_t src0, src1; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f}; + std::vector s1{111.0f, 12.2f, -13.4f, -44.5f}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + init((float*)&src0, s0, SIMD_LEN); + init((float*)&src1, s1, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + GiSetSubVectorFloat32V2(ret, 0, src0); + GiSetSubVectorFloat32V2(ret, 1, src1); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s1[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN * 2); +} + +TEST_F(FALLBACK, GiFloat32Type2FixLenType) { + GI_FLOAT32_FIXLEN_t ret; + GI_FLOAT32_t src; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f}; + s0.resize(SIMD_LEN); + init((float*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFloat32Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiFixLenType2GiFloat32Type) { + GI_FLOAT32_t ret; + GI_FLOAT32_FIXLEN_t src; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f}; + s0.resize(SIMD_LEN); + init((float*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiFloat32Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiFloat32Type2FixLenV2Type) { + GI_FLOAT32_FIXLEN_V2_t ret; + GI_FLOAT32_V2_t src; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f, 55.1f, 99.0f, -1.9f, -5.3f}; + s0.resize(SIMD_LEN * 2); + init((float*)&src, s0, SIMD_LEN * 2); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); + ret = 
GiFloat32Type2FixLenV2Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN * 2; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN * 2); +} + +TEST_F(FALLBACK, GiFixLenType2GiFloat32V2Type) { + GI_FLOAT32_V2_t ret; + GI_FLOAT32_FIXLEN_V2_t src; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f, 111.0f, 12.2f, -13.4f, -44.5f}; + s0.resize(SIMD_LEN * 2); + init((float*)&src, s0, SIMD_LEN * 2); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); + ret = GiFixLenType2GiFloat32V2Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN * 2; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN * 2); +} + +TEST_F(FALLBACK, GiGetSubVectorFloat32V3) { + GI_FLOAT32_V3_t src0; + GI_FLOAT32_t ret1, ret2, ret3; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f, 111.0f, 12.2f, + -13.4f, -44.5f, 22.4f, 55.0f, -12.0f, 678.9f}; + s0.resize(SIMD_LEN * 3); + //! rvv compiler crash when use init on type_x3, use rvv load api as a workaround +#if defined(GI_RVV_INTRINSICS) + vfloat32m1_t t00, t10, t20; + t00 = vle32_v_f32m1(s0.data(), SIMD_LEN); + t10 = vle32_v_f32m1(s0.data() + SIMD_LEN, 4); + t20 = vle32_v_f32m1(s0.data() + SIMD_LEN * 2, 4); + src0 = vcreate_f32m1x3(t00, t10, t20); +#else + init((float*)&src0, s0, SIMD_LEN * 3); +#endif + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret3, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorFloat32V3(src0, 0); + ret2 = GiGetSubVectorFloat32V3(src0, 1); + ret3 = GiGetSubVectorFloat32V3(src0, 2); + + std::vector naive1, naive2, naive3; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN], sizeof(float)); + naive2.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN * 2], sizeof(float)); + naive3.push_back(tmp); + } + + assert_eq((float*)&ret1, naive1, SIMD_LEN); + assert_eq((float*)&ret2, naive2, SIMD_LEN); + assert_eq((float*)&ret3, naive3, SIMD_LEN); +} + +TEST_F(FALLBACK, GiSetSubVectorFloat32V3) { + GI_FLOAT32_V3_t ret; + GI_FLOAT32_t src0, src1, src2; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f}; + std::vector s1{111.0f, 12.2f, -13.4f, -44.5f}; + std::vector s2{22.4f, 55.0f, -12.0f, 678.9f}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + s2.resize(SIMD_LEN); + init((float*)&src0, s0, SIMD_LEN); + init((float*)&src1, s1, SIMD_LEN); + init((float*)&src2, s2, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 3); + GiSetSubVectorFloat32V3(ret, 0, src0); + GiSetSubVectorFloat32V3(ret, 1, src1); + GiSetSubVectorFloat32V3(ret, 2, src2); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s1[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s2[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN * 3); +} + +TEST_F(FALLBACK, GiGetSubVectorFloat32V4) { + GI_FLOAT32_V4_t src0; + GI_FLOAT32_t ret1, ret2, ret3, ret4; + std::vector s0{-1.0f, 2.2f, -3.4f, 4.5f, 111.0f, 12.2f, -13.4f, -44.5f, + 22.4f, 55.0f, -12.0f, 678.9f, 2.2f, -3.4f, 4.5f, 111.0f}; + s0.resize(SIMD_LEN * 4); +#if defined(GI_RVV_INTRINSICS) + vfloat32m1_t t00, t10, t20, t30; + t00 = 
vle32_v_f32m1(s0.data(), SIMD_LEN); + t10 = vle32_v_f32m1(s0.data() + SIMD_LEN, 4); + t20 = vle32_v_f32m1(s0.data() + SIMD_LEN * 2, 4); + t30 = vle32_v_f32m1(s0.data() + SIMD_LEN * 3, 4); + src0 = vcreate_f32m1x4(t00, t10, t20, t30); +#else + init((float*)&src0, s0, SIMD_LEN * 4); +#endif + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret3, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret4, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorFloat32V4(src0, 0); + ret2 = GiGetSubVectorFloat32V4(src0, 1); + ret3 = GiGetSubVectorFloat32V4(src0, 2); + ret4 = GiGetSubVectorFloat32V4(src0, 3); + + std::vector naive1, naive2, naive3, naive4; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN], sizeof(float)); + naive2.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN * 2], sizeof(float)); + naive3.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN * 3], sizeof(float)); + naive4.push_back(tmp); + } + + assert_eq((float*)&ret1, naive1, SIMD_LEN); + assert_eq((float*)&ret2, naive2, SIMD_LEN); + assert_eq((float*)&ret3, naive3, SIMD_LEN); + assert_eq((float*)&ret4, naive4, SIMD_LEN); +} + +TEST_F(FALLBACK, GiSetSubVectorFloat32V4) { + GI_FLOAT32_V4_t ret; + GI_FLOAT32_t src0, src1, src2, src3; + std::vector s0{-1.0f, 2.2f, -3.4f, 99.0f}; + std::vector s1{4.5f, 111.0f, 12.2f, -13.4f}; + std::vector s2{-44.5f, 22.4f, 55.0f, -12.0f}; + std::vector s3{2.2f, -3.4f, 4.5f, 111.0f}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + s2.resize(SIMD_LEN); + s3.resize(SIMD_LEN); + init((float*)&src0, s0, SIMD_LEN); + init((float*)&src1, s1, SIMD_LEN); + init((float*)&src2, s2, SIMD_LEN); + init((float*)&src3, s3, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 4); + GiSetSubVectorFloat32V4(ret, 0, src0); + GiSetSubVectorFloat32V4(ret, 1, src1); + GiSetSubVectorFloat32V4(ret, 2, src2); + GiSetSubVectorFloat32V4(ret, 3, src3); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s0[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s1[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s2[i], sizeof(float)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + float tmp; + memcpy(&tmp, &s3[i], sizeof(float)); + naive.push_back(tmp); + } + + assert_eq((float*)&ret, naive, SIMD_LEN * 4); +} + +TEST_F(FALLBACK, GiGetSubVectorInt32V2) { + GI_INT32_V2_t src0; + GI_INT32_t ret1, ret2; + std::vector s0{1, 2, 3, 4, -4, -3, -2, -1}; + s0.resize(SIMD_LEN * 2); + init((int32_t*)&src0, s0, SIMD_LEN * 2); + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorInt32V2(src0, 0); + ret2 = GiGetSubVectorInt32V2(src0, 1); + + std::vector naive1, naive2; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN], sizeof(int32_t)); + naive2.push_back(tmp); + } + + assert_eq((int32_t*)&ret1, naive1, SIMD_LEN); + assert_eq((int32_t*)&ret2, naive2, SIMD_LEN); +} + +TEST_F(FALLBACK, GiSetSubVectorInt32V2) { + GI_INT32_V2_t ret; + GI_INT32_t src0, src1; + std::vector s0{1, 2, 3, 4}; + std::vector s1{-4, -3, -2, -1}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + init((int32_t*)&src0, s0, 
SIMD_LEN); + init((int32_t*)&src1, s1, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); + GiSetSubVectorInt32V2(ret, 0, src0); + GiSetSubVectorInt32V2(ret, 1, src1); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s1[i], sizeof(int32_t)); + naive.push_back(tmp); + } + + assert_eq((int32_t*)&ret, naive, SIMD_LEN * 2); +} + +TEST_F(FALLBACK, GiInt32Type2FixLenType) { + GI_INT32_FIXLEN_t ret; + GI_INT32_t src; + std::vector s0{3, 4, -4, -3}; + s0.resize(SIMD_LEN); + init((int32_t*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiInt32Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive.push_back(tmp); + } + + assert_eq((int32_t*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiFixLenType2GiInt32Type) { + GI_INT32_t ret; + GI_INT32_FIXLEN_t src; + std::vector s0{2, 3, 4, -4}; + s0.resize(SIMD_LEN); + init((int32_t*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiInt32Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive.push_back(tmp); + } + + assert_eq((int32_t*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiUint32Type2FixLenType) { + GI_UINT32_FIXLEN_t ret; + GI_UINT32_t src; + std::vector s0{1, 2, 3, 4}; + s0.resize(SIMD_LEN); + init((uint32_t*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiUint32Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + uint32_t tmp; + memcpy(&tmp, &s0[i], sizeof(uint32_t)); + naive.push_back(tmp); + } + + assert_eq((uint32_t*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiFixLenType2GiUint32Type) { + GI_UINT32_t ret; + GI_UINT32_FIXLEN_t src; + std::vector s0{1, 2, 3, 4}; + s0.resize(SIMD_LEN); + init((uint32_t*)&src, s0, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiUint32Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + uint32_t tmp; + memcpy(&tmp, &s0[i], sizeof(uint32_t)); + naive.push_back(tmp); + } + + assert_eq((uint32_t*)&ret, naive, SIMD_LEN); +} + +TEST_F(FALLBACK, GiGetSubVectorInt32V4) { + GI_INT32_V4_t src0; + GI_INT32_t ret1, ret2, ret3, ret4; + std::vector s0{1, 2, 3, 4, -4, -3, -2, -1, + 23, 456, 765, -99, 45, 99, 0, 8}; + s0.resize(SIMD_LEN * 4); + init((int32_t*)&src0, s0, SIMD_LEN * 4); + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret3, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret4, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorInt32V4(src0, 0); + ret2 = GiGetSubVectorInt32V4(src0, 1); + ret3 = GiGetSubVectorInt32V4(src0, 2); + ret4 = GiGetSubVectorInt32V4(src0, 3); + + std::vector naive1, naive2, naive3, naive4; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN], sizeof(int32_t)); + naive2.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN * 2], sizeof(int32_t)); + naive3.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN * 3], sizeof(int32_t)); + naive4.push_back(tmp); + } + + assert_eq((int32_t*)&ret1, naive1, SIMD_LEN); + assert_eq((int32_t*)&ret2, 
naive2, SIMD_LEN); + assert_eq((int32_t*)&ret3, naive3, SIMD_LEN); + assert_eq((int32_t*)&ret4, naive4, SIMD_LEN); +} + +TEST_F(FALLBACK, GiSetSubVectorInt32V4) { + GI_INT32_V4_t ret; + GI_INT32_t src0, src1, src2, src3; + std::vector s0{1, 2, 3, 4, -4}; + std::vector s1{3, -2, -1, 23}; + std::vector s2{456, 765, -99, 45}; + std::vector s3{45, 99, 0, 8}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + s2.resize(SIMD_LEN); + s3.resize(SIMD_LEN); + init((int32_t*)&src0, s0, SIMD_LEN); + init((int32_t*)&src1, s1, SIMD_LEN); + init((int32_t*)&src2, s2, SIMD_LEN); + init((int32_t*)&src3, s3, SIMD_LEN); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 4); + GiSetSubVectorInt32V4(ret, 0, src0); + GiSetSubVectorInt32V4(ret, 1, src1); + GiSetSubVectorInt32V4(ret, 2, src2); + GiSetSubVectorInt32V4(ret, 3, src3); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s0[i], sizeof(int32_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s1[i], sizeof(int32_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s2[i], sizeof(int32_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN; i++) { + int32_t tmp; + memcpy(&tmp, &s3[i], sizeof(int32_t)); + naive.push_back(tmp); + } + + assert_eq((int32_t*)&ret, naive, SIMD_LEN * 4); +} + +TEST_F(FALLBACK, GiGetSubVectorInt16V2) { + GI_INT16_V2_t src0; + GI_INT16_t ret1, ret2; + std::vector s0{-127, 2, std::numeric_limits::max(), + 9999, 1, 2, + 3, 4, 1, + 2, 3, 4, + -4, -3, -2, + -1}; + s0.resize(SIMD_LEN_16 * 2); + init((int16_t*)&src0, s0, SIMD_LEN_16 * 2); + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorInt16V2(src0, 0); + ret2 = GiGetSubVectorInt16V2(src0, 1); + + std::vector naive1, naive2; + for (size_t i = 0; i < SIMD_LEN_16; i++) { + int16_t tmp; + memcpy(&tmp, &s0[i], sizeof(int16_t)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN_16], sizeof(int16_t)); + naive2.push_back(tmp); + } + + assert_eq((int16_t*)&ret1, naive1, SIMD_LEN_16); + assert_eq((int16_t*)&ret2, naive2, SIMD_LEN_16); +} + +TEST_F(FALLBACK, GiSetSubVectorInt16V2) { + GI_INT16_V2_t ret; + GI_INT16_t src0, src1; + std::vector s0{-127, 2, std::numeric_limits::max(), 9999, 1, 2, + 3, 4}; + std::vector s1{1, 2, 3, 4, -4, -3, -2, -1}; + s0.resize(SIMD_LEN_16); + s1.resize(SIMD_LEN_16); + init((int16_t*)&src0, s0, SIMD_LEN_16); + init((int16_t*)&src1, s1, SIMD_LEN_16); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); + GiSetSubVectorInt16V2(ret, 0, src0); + GiSetSubVectorInt16V2(ret, 1, src1); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_16; i++) { + int16_t tmp; + memcpy(&tmp, &s0[i], sizeof(int16_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN_16; i++) { + int16_t tmp; + memcpy(&tmp, &s1[i], sizeof(int16_t)); + naive.push_back(tmp); + } + + assert_eq((int16_t*)&ret, naive, SIMD_LEN_16 * 2); +} + +TEST_F(FALLBACK, GiInt16Type2FixLenType) { + GI_INT16_t src; + GI_INT16_FIXLEN_t ret; + std::vector s0{-127, 2, std::numeric_limits::max(), 9999, 1, 2, + 3, 4}; + s0.resize(SIMD_LEN_16); + init((int16_t*)&src, s0, SIMD_LEN_16); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiInt16Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_16; i++) { + int16_t tmp; + memcpy(&tmp, &s0[i], sizeof(int16_t)); + naive.push_back(tmp); + } + + assert_eq((int16_t*)&ret, naive, 
SIMD_LEN_16); +} + +TEST_F(FALLBACK, GiFixLenType2GiInt16Type) { + GI_INT16_FIXLEN_t src; + GI_INT16_t ret; + std::vector s0{-127, 2, std::numeric_limits::max(), 9999, 1, 2, + 3, 4}; + s0.resize(SIMD_LEN_16); + init((int16_t*)&src, s0, SIMD_LEN_16); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiInt16Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_16; i++) { + int16_t tmp; + memcpy(&tmp, &s0[i], sizeof(int16_t)); + naive.push_back(tmp); + } + + assert_eq((int16_t*)&ret, naive, SIMD_LEN_16); +} + +TEST_F(FALLBACK, GiGetSubVectorInt8V2) { + GI_INT8_V2_t src0; + GI_INT8_t ret1, ret2; + std::vector s0{127, 2, 56, -128, 1, 2, 3, 4, 127, 2, 56, + -128, 1, 2, 3, 4, 127, 2, 56, -128, -14, -22, + 3, -4, 127, -22, 56, -128, -1, 2, -3, 44}; + s0.resize(SIMD_LEN_8 * 2); + init((int8_t*)&src0, s0, SIMD_LEN_8 * 2); + + force_memset_ret((void*)&ret1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret2, GI_SIMD_LEN_BYTE); + ret1 = GiGetSubVectorInt8V2(src0, 0); + ret2 = GiGetSubVectorInt8V2(src0, 1); + + std::vector naive1, naive2; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s0[i], sizeof(int8_t)); + naive1.push_back(tmp); + + memcpy(&tmp, &s0[i + SIMD_LEN_8], sizeof(int8_t)); + naive2.push_back(tmp); + } + + assert_eq((int8_t*)&ret1, naive1, SIMD_LEN_8); + assert_eq((int8_t*)&ret2, naive2, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiSetSubVectorInt8V2) { + GI_INT8_V2_t ret; + GI_INT8_t src0, src1; + std::vector s0{127, 2, 56, -128, 1, 2, 3, 4, 127, 2, 56, -128, 1, 2, 3, 4}; + std::vector s1{127, 2, 56, -128, -14, -22, 3, -4, + 127, -22, 56, -128, -1, 2, -3, 44}; + s0.resize(SIMD_LEN_8); + s1.resize(SIMD_LEN_8); + init((int8_t*)&src0, s0, SIMD_LEN_8); + init((int8_t*)&src1, s1, SIMD_LEN_8); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); + GiSetSubVectorInt8V2(ret, 0, src0); + GiSetSubVectorInt8V2(ret, 1, src1); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s0[i], sizeof(int8_t)); + naive.push_back(tmp); + } + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s1[i], sizeof(int8_t)); + naive.push_back(tmp); + } + + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8 * 2); +} + +TEST_F(FALLBACK, GiUint8Type2FixLenType) { + GI_UINT8_FIXLEN_t ret; + GI_UINT8_t src; + std::vector s0{127, 2, 56, 255, 1, 2, 3, 4, 127, 2, 56, 0, 1, 2, 3, 4}; + s0.resize(SIMD_LEN_8); + init((uint8_t*)&src, s0, SIMD_LEN_8); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiUint8Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + uint8_t tmp; + memcpy(&tmp, &s0[i], sizeof(uint8_t)); + naive.push_back(tmp); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiFixLenType2GiUint8Type) { + GI_UINT8_t ret; + GI_UINT8_FIXLEN_t src; + std::vector s0{127, 2, 56, 255, 1, 2, 3, 4, 127, 2, 56, 0, 1, 2, 3, 4}; + s0.resize(SIMD_LEN_8); + init((uint8_t*)&src, s0, SIMD_LEN_8); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiUint8Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + uint8_t tmp; + memcpy(&tmp, &s0[i], sizeof(uint8_t)); + naive.push_back(tmp); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiInt8Type2FixLenType) { + GI_INT8_FIXLEN_t ret; + GI_INT8_t src; + std::vector s0{127, 2, 56, -128, 1, 2, 3, 4, 127, 2, 56, 0, 1, 2, 3, 4}; + s0.resize(SIMD_LEN_8); + init((int8_t*)&src, s0, SIMD_LEN_8); + + force_memset_ret((void*)&ret, 
GI_SIMD_LEN_BYTE); + ret = GiInt8Type2FixLenType(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s0[i], sizeof(int8_t)); + naive.push_back(tmp); + } + + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiFixLenType2GiInt8Type) { + GI_INT8_t ret; + GI_INT8_FIXLEN_t src; + std::vector s0{127, 2, 56, -128, 1, 2, 3, 4, 127, 2, 56, 0, 1, 2, 3, 4}; + s0.resize(SIMD_LEN_8); + init((int8_t*)&src, s0, SIMD_LEN_8); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiFixLenType2GiInt8Type(src); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + int8_t tmp; + memcpy(&tmp, &s0[i], sizeof(int8_t)); + naive.push_back(tmp); + } + + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8); +} + TEST_F(FALLBACK, GiAndInt32) { GI_INT32_t src0, src1, ret; std::vector s0{1, 2, 3, 4}; @@ -81,6 +889,7 @@ TEST_F(FALLBACK, GiAndInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndInt32(src0, src1); std::vector naive; @@ -100,6 +909,7 @@ TEST_F(FALLBACK, GiOrInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiOrInt32(src0, src1); std::vector naive; @@ -119,6 +929,7 @@ TEST_F(FALLBACK, GiAndNotInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndNotInt32(src0, src1); std::vector naive; @@ -138,6 +949,7 @@ TEST_F(FALLBACK, GiXorInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiXorInt32(src0, src1); std::vector naive; @@ -152,6 +964,7 @@ TEST_F(FALLBACK, GiBroadcastFloat32) { GI_FLOAT32_t ret; float b = 2022.0420; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBroadcastFloat32(b); std::vector naive; @@ -166,6 +979,7 @@ TEST_F(FALLBACK, GiBroadcastInt32) { GI_INT32_t ret; int32_t b = 20220420; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBroadcastInt32(b); std::vector naive; @@ -176,6 +990,21 @@ TEST_F(FALLBACK, GiBroadcastInt32) { assert_eq((int32_t*)&ret, naive); } +TEST_F(FALLBACK, GiBroadcastInt8) { + GI_INT8_t ret; + int8_t b = 6; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiBroadcastInt8(b); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(b); + } + + assert_eq((int8_t*)&ret, naive); +} + TEST_F(FALLBACK, GiReinterpretAsInt32) { GI_INT32_t ret; GI_FLOAT32_t src0; @@ -183,6 +1012,7 @@ TEST_F(FALLBACK, GiReinterpretAsInt32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReinterpretAsInt32(src0); std::vector naive; @@ -202,6 +1032,7 @@ TEST_F(FALLBACK, GiReinterpretAsUint32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReinterpretAsUint32(src0); std::vector naive; @@ -221,6 +1052,7 @@ TEST_F(FALLBACK, GiReintInt32ToFloat32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReintInt32ToFloat32(src0); std::vector naive; @@ -240,6 +1072,7 @@ TEST_F(FALLBACK, GiReintUint32ToFloat32) { s0.resize(SIMD_LEN); init((uint32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReintUint32ToFloat32(src0); std::vector naive; @@ -255,10 +1088,11 @@ TEST_F(FALLBACK, GiReintUint32ToFloat32) { TEST_F(FALLBACK, GiRoundAsInt32) { GI_FLOAT32_t src0; GI_INT32_t ret; - std::vector s0{1.1f, 
2.2f, 3.5f, 4.9f}; + std::vector s0{1.1f, 2.2f, 3.5f, -4.9f}; s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiRoundAsInt32(src0); std::vector naive; @@ -276,6 +1110,7 @@ TEST_F(FALLBACK, GiCastToInt32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiCastToInt32(src0); std::vector naive; @@ -293,6 +1128,7 @@ TEST_F(FALLBACK, GiCastToFloat32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiCastToFloat32(src0); std::vector naive; @@ -307,6 +1143,7 @@ TEST_F(FALLBACK, GiLoadBroadcastFloat32) { GI_FLOAT32_t ret; float p = 2022.0420; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadBroadcastFloat32(&p); std::vector naive; @@ -319,9 +1156,10 @@ TEST_F(FALLBACK, GiLoadBroadcastFloat32) { TEST_F(FALLBACK, GiZeroFloat32) { GI_FLOAT32_t ret; - memset(&ret, 'f', sizeof(GI_FLOAT32_t)); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); float p = 0; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiZeroFloat32(); std::vector naive; @@ -337,6 +1175,7 @@ TEST_F(FALLBACK, GiLoadFloat32) { std::vector s0{2.3f, 4.7f, -1.4f, 1223.6f}; s0.resize(SIMD_LEN); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadFloat32(s0.data()); std::vector naive; @@ -352,6 +1191,7 @@ TEST_F(FALLBACK, GiLoadFloat32V2) { std::vector s0{2.3f, 4.7f, -1.4f, 1223.6f, 1.1f, 4.0f, 99.7f, 1234.9f}; s0.resize(SIMD_LEN * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); ret = GiLoadFloat32V2(s0.data()); std::vector naive; @@ -367,6 +1207,7 @@ TEST_F(FALLBACK, GiLoadFloat32LowHalf) { std::vector s0{2.3f, 4.7f, -1.4f, 1223.6f}; s0.resize(SIMD_LEN); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadFloat32LowHalf(s0.data()); std::vector naive; @@ -393,6 +1234,7 @@ TEST_F(FALLBACK, GiMlaqFloat32) { init((float*)&src1, s1); init((float*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMlaqFloat32(src0, src1, src2); std::vector naive; @@ -413,27 +1255,27 @@ TEST_F(FALLBACK, GiUzpqFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); ret = GiUzpqFloat32(src0, src1); - std::vector naive0; - std::vector naive1; - naive0.push_back(s0[0]); - naive0.push_back(s0[2]); - naive0.push_back(s1[0]); - naive0.push_back(s1[2]); - naive1.push_back(s0[1]); - naive1.push_back(s0[3]); - naive1.push_back(s1[1]); - naive1.push_back(s1[3]); + std::vector naive; + naive.push_back(s0[0]); + naive.push_back(s0[2]); + naive.push_back(s1[0]); + naive.push_back(s1[2]); + naive.push_back(s0[1]); + naive.push_back(s0[3]); + naive.push_back(s1[1]); + naive.push_back(s1[3]); - assert_eq((float*)&ret, naive0); - assert_eq((float*)&ret + SIMD_LEN, naive1); + assert_eq((float*)&ret, naive, SIMD_LEN * 2); } TEST_F(FALLBACK, GiDupFloat32) { float32x2_t ret; float t = 3.1415; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiDupFloat32(t); auto r = (float*)&ret; @@ -445,6 +1287,7 @@ TEST_F(FALLBACK, GiLdFloat32) { float32x2_t ret; std::vector s0{1.1f, -3.1415f}; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiLdFloat32(s0.data()); auto r = (float*)&ret; @@ -456,9 +1299,10 @@ TEST_F(FALLBACK, GiAddDFloat32) { float32x2_t src0, src1, ret; std::vector s0{1.1f, -3.1415f}; std::vector s1{2.3f, 3.14777f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); - memcpy(&src1, s1.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); 
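    //! `sizeof(float) * 2` is used for these copies (here and in the tests
    //! below) instead of sizeof(float32x2_t): the two-lane type may be backed
    //! by a wider SIMD register on some backends, so its size is not
    //! guaranteed to be 8 bytes, while the byte count of two floats is.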
+ memcpy(&src1, s1.data(), sizeof(float) * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiAddDFloat32(src0, src1); auto r = (float*)&ret; @@ -472,11 +1316,13 @@ TEST_F(FALLBACK, GiAddDFloat32) { TEST_F(FALLBACK, GiGetLaneFloat32) { float32x2_t src0; std::vector s0{1.1f, -3.1415f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); - auto ret = GiGetLaneFloat32(src0, 0); + float ret = 0; + ret = GiGetLaneFloat32(src0, 0); ASSERT_EQ(ret, s0[0]); + ret = 0; ret = GiGetLaneFloat32(src0, 1); ASSERT_EQ(ret, s0[1]); } @@ -484,14 +1330,16 @@ TEST_F(FALLBACK, GiGetLaneFloat32) { TEST_F(FALLBACK, GiSetLaneFloat32) { float32x2_t src0, ret; std::vector s0{2.1f, -3.1415f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); float p = 2022.0420; auto r = (float*)&ret; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiSetLaneFloat32(p, src0, 0); ASSERT_EQ(*r, p); ASSERT_EQ(*(r + 1), s0[1]); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiSetLaneFloat32(p, src0, 1); ASSERT_EQ(*r, s0[0]); ASSERT_EQ(*(r + 1), p); @@ -500,7 +1348,7 @@ TEST_F(FALLBACK, GiSetLaneFloat32) { TEST_F(FALLBACK, GiSt1Float32) { float32x2_t src0; std::vector s0{2.1f, -3.1415f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); std::vector ret{0, 0}; GiSt1Float32(ret.data(), src0); @@ -512,6 +1360,7 @@ TEST_F(FALLBACK, GiLd2qFloat32) { GI_FLOAT32_V2_t ret; std::vector s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f}; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); ret = GiLd2qFloat32(s0.data()); std::vector naive0; @@ -551,8 +1400,9 @@ TEST_F(FALLBACK, GiExtqFloat32) { assert_eq((float*)&ret, naive); }; -#define CB(n) \ - ret = GiExtqFloat32(src0, src1, n); \ +#define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ + ret = GiExtqFloat32(src0, src1, n); \ compare(n); CB(0) @@ -574,6 +1424,7 @@ TEST_F(FALLBACK, GiMultiplySubFloat32) { init((float*)&src1, s1); init((float*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplySubFloat32(src0, src1, src2); std::vector naive; for (size_t i = 0; i < SIMD_LEN; i++) { @@ -593,13 +1444,14 @@ TEST_F(FALLBACK, GiLd1qLaneFloat32) { float buffer = 3.14159; auto compare = [&](const size_t n) { - memcpy(naive.data(), s0.data(), sizeof(GI_FLOAT32_t)); + memcpy(naive.data(), s0.data(), GI_SIMD_LEN_BYTE); naive[n] = buffer; assert_eq((float*)&ret, naive); }; -#define CB(n) \ - ret = GiLd1qLaneFloat32(&buffer, src0, n); \ +#define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ + ret = GiLd1qLaneFloat32(&buffer, src0, n); \ compare(n); CB(0) @@ -619,13 +1471,14 @@ TEST_F(FALLBACK, GiSetqLaneFloat32) { float buffer = 6.14159; auto compare = [&](const size_t n) { - memcpy(naive.data(), s0.data(), sizeof(GI_FLOAT32_t)); + memcpy(naive.data(), s0.data(), GI_SIMD_LEN_BYTE); naive[n] = buffer; assert_eq((float*)&ret, naive); }; -#define CB(n) \ - ret = GiSetqLaneFloat32(buffer, src0, n); \ +#define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ + ret = GiSetqLaneFloat32(buffer, src0, n); \ compare(n); CB(0) @@ -656,6 +1509,7 @@ TEST_F(FALLBACK, GiMlaqLaneFloat32HighHalf) { }; #define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ ret = GiMlaqLaneFloat32HighHalf(src0, src1, src2, n); \ compare(n); @@ -685,6 +1539,7 @@ TEST_F(FALLBACK, GiVmlaqLaneFloat32LowHalf) { }; #define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ 
ret = GiVmlaqLaneFloat32LowHalf(src0, src1, src2, n); \ compare(n); @@ -763,21 +1618,20 @@ TEST_F(FALLBACK, GiZipqFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 2); ret = GiZipqFloat32(src0, src1); - std::vector naive0; - std::vector naive1; - naive0.push_back(s0[0]); - naive0.push_back(s1[0]); - naive0.push_back(s0[1]); - naive0.push_back(s1[1]); - naive1.push_back(s0[2]); - naive1.push_back(s1[2]); - naive1.push_back(s0[3]); - naive1.push_back(s1[3]); + std::vector naive; + naive.push_back(s0[0]); + naive.push_back(s1[0]); + naive.push_back(s0[1]); + naive.push_back(s1[1]); + naive.push_back(s0[2]); + naive.push_back(s1[2]); + naive.push_back(s0[3]); + naive.push_back(s1[3]); - assert_eq((float*)&ret, naive0); - assert_eq((float*)&ret + SIMD_LEN, naive1); + assert_eq((float*)&ret, naive, SIMD_LEN * 2); } TEST_F(FALLBACK, GiInterleaveLowFloat32) { @@ -789,6 +1643,7 @@ TEST_F(FALLBACK, GiInterleaveLowFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiInterleaveLowFloat32(src0, src1); std::vector naive; @@ -810,6 +1665,7 @@ TEST_F(FALLBACK, GiInterleaveHighFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiInterleaveHighFloat32(src0, src1); std::vector naive; @@ -831,6 +1687,7 @@ TEST_F(FALLBACK, GiAddFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAddFloat32(src0, src1); std::vector naive; @@ -851,6 +1708,7 @@ TEST_F(FALLBACK, GiSubtractFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiSubtractFloat32(src0, src1); std::vector naive; @@ -871,6 +1729,7 @@ TEST_F(FALLBACK, GiMultiplyFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyFloat32(src0, src1); std::vector naive; @@ -890,6 +1749,7 @@ TEST_F(FALLBACK, GiMultiplyScalerFloat32) { float scalar = 3.1415; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyScalerFloat32(src0, scalar); std::vector naive; @@ -913,6 +1773,7 @@ TEST_F(FALLBACK, GiMultiplyAddFloat32) { init((float*)&src1, s1); init((float*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyAddFloat32(src0, src1, src2); std::vector naive; @@ -935,6 +1796,7 @@ TEST_F(FALLBACK, GiMultiplyAddScalarFloat32) { float scalar = 3.1415; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyAddScalarFloat32(src0, src1, scalar); std::vector naive; @@ -968,6 +1830,7 @@ TEST_F(FALLBACK, GiMultiplyAddLanXXFloat32) { }; #define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ ret = GiMultiplyAddLan##n##Float32(src0, src1, src2); \ compare(n); @@ -987,6 +1850,7 @@ TEST_F(FALLBACK, GiDivideFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiDivideFloat32(src0, src1); std::vector naive; @@ -1007,6 +1871,7 @@ TEST_F(FALLBACK, GiRecpeSFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiRecpeSFloat32(src0, src1); std::vector naive; @@ -1024,6 +1889,7 @@ TEST_F(FALLBACK, GiRecpeFloat32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiRecpeFloat32(src0); std::vector naive; @@ -1041,6 +1907,7 @@ TEST_F(FALLBACK, GiNegFloat32) { 
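    //! force_memset_ret fills the destination with a fixed byte pattern before
    //! each intrinsic under test writes its result, so a value left over from
    //! an earlier run cannot make a broken implementation pass by accident.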
s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiNegFloat32(src0); std::vector naive; @@ -1055,13 +1922,14 @@ TEST_F(FALLBACK, GiNegFloat32) { TEST_F(FALLBACK, GiGreaterThanFloat32) { GI_FLOAT32_t src0, src1; GI_UINT32_t ret; - std::vector s0{1.1f, 2.2f, 3.5f, 4.9f}; + std::vector s0{1.1f, 2.2f, 3.59f, 4.9f}; std::vector s1{2312.1f, 0.1f, 3.59f, -12.8f}; s0.resize(SIMD_LEN); s1.resize(SIMD_LEN); init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiGreaterThanFloat32(src0, src1); std::vector naive; @@ -1076,13 +1944,14 @@ TEST_F(FALLBACK, GiGreaterThanFloat32) { TEST_F(FALLBACK, GiLessThanEqFloat32) { GI_FLOAT32_t src0, src1; GI_UINT32_t ret; - std::vector s0{1.1f, 2.2f, 3.5f, 4.9f}; + std::vector s0{1.1f, 2.2f, 3.59f, 4.9f}; std::vector s1{2312.1f, 0.1f, 3.59f, -12.8f}; s0.resize(SIMD_LEN); s1.resize(SIMD_LEN); init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLessThanEqFloat32(src0, src1); std::vector naive; @@ -1104,6 +1973,7 @@ TEST_F(FALLBACK, GiLessThanFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLessThanFloat32(src0, src1); std::vector naive; @@ -1124,6 +1994,7 @@ TEST_F(FALLBACK, GiAndFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndFloat32(src0, src1); std::vector naive; @@ -1150,6 +2021,7 @@ TEST_F(FALLBACK, GiOrFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiOrFloat32(src0, src1); std::vector naive; @@ -1176,6 +2048,7 @@ TEST_F(FALLBACK, GiAndNotFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndNotFloat32(src0, src1); std::vector naive; @@ -1202,6 +2075,7 @@ TEST_F(FALLBACK, GiXorFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiXorFloat32(src0, src1); std::vector naive; @@ -1235,12 +2109,13 @@ TEST_F(FALLBACK, GiBSLFloat32) { for (auto& s2 : s2s) { init((uint32_t*)&mask, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBSLFloat32(mask, src0, src1); na = GiBlendFloat32(src0, src1, GiReintUint32ToFloat32(mask)); std::vector naive; naive.resize(SIMD_LEN); - memcpy(naive.data(), &na, sizeof(GI_FLOAT32_t)); + memcpy(naive.data(), &na, GI_SIMD_LEN_BYTE); assert_eq_and_nan((float*)&ret, naive); } @@ -1255,6 +2130,7 @@ TEST_F(FALLBACK, GiMaximumFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMaximumFloat32(src0, src1); std::vector naive; @@ -1274,6 +2150,7 @@ TEST_F(FALLBACK, GiMinimumFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMinimumFloat32(src0, src1); std::vector naive; @@ -1293,6 +2170,7 @@ TEST_F(FALLBACK, GiMaxNanFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMaxNanFloat32(src0, src1); std::vector naive; @@ -1313,6 +2191,7 @@ TEST_F(FALLBACK, GiMinNanFloat32) { init((float*)&src0, s0); init((float*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMinNanFloat32(src0, src1); std::vector naive; @@ -1341,12 +2220,13 @@ TEST_F(FALLBACK, GiClampFloat32) { Value = 
GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value); return Value; }; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiClampFloat32(src0, LowerRange, UpperRange); na = naive_c(src1, LowerRange, UpperRange); std::vector naive; naive.resize(SIMD_LEN); - memcpy(naive.data(), &na, sizeof(GI_FLOAT32_t)); + memcpy(naive.data(), &na, GI_SIMD_LEN_BYTE); assert_eq((float*)&ret, naive); } @@ -1437,6 +2317,7 @@ TEST_F(FALLBACK, GiAbsFloat32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAbsFloat32(src0); std::vector naive; @@ -1453,8 +2334,8 @@ TEST_F(FALLBACK, GiZip1qS64) { std::vector s1{23424245, -4234234242232}; s0.resize(SIMD_LEN / 2); s1.resize(SIMD_LEN / 2); - memcpy(&src0, s0.data(), sizeof(GI_INT64_t)); - memcpy(&src1, s1.data(), sizeof(GI_INT64_t)); + memcpy(&src0, s0.data(), GI_SIMD_LEN_BYTE); + memcpy(&src1, s1.data(), GI_SIMD_LEN_BYTE); ret = GiZip1qS64(src0, src1); @@ -1472,8 +2353,8 @@ TEST_F(FALLBACK, GiZip2qS64) { std::vector s1{23424245, -4234234242232}; s0.resize(SIMD_LEN / 2); s1.resize(SIMD_LEN / 2); - memcpy(&src0, s0.data(), sizeof(GI_INT64_t)); - memcpy(&src1, s1.data(), sizeof(GI_INT64_t)); + memcpy(&src0, s0.data(), GI_SIMD_LEN_BYTE); + memcpy(&src1, s1.data(), GI_SIMD_LEN_BYTE); ret = GiZip2qS64(src0, src1); @@ -1490,13 +2371,14 @@ TEST_F(FALLBACK, GiReinterpretqS64ToFloat32) { GI_FLOAT32_t ret; std::vector s0{234242423424245, 42342342422323}; s0.resize(SIMD_LEN / 2); - memcpy(&src0, s0.data(), sizeof(GI_INT64_t)); + memcpy(&src0, s0.data(), GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReinterpretqS64ToFloat32(src0); std::vector naive; naive.resize(SIMD_LEN); - memcpy(naive.data(), s0.data(), sizeof(GI_FLOAT32_t)); + memcpy(naive.data(), s0.data(), GI_SIMD_LEN_BYTE); assert_eq((float*)&ret, naive); } @@ -1508,11 +2390,12 @@ TEST_F(FALLBACK, GiReinterpretqFloat32ToS64) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReinterpretqFloat32ToS64(src0); std::vector naive; naive.resize(SIMD_LEN); - memcpy(naive.data(), s0.data(), sizeof(GI_INT64_t)); + memcpy(naive.data(), s0.data(), GI_SIMD_LEN_BYTE); assert_eq((float*)&ret, naive); } @@ -1538,8 +2421,9 @@ TEST_F(FALLBACK, GiSimdFmaLane) { assert_eq((float*)&ret, naive); }; -#define CB(n) \ - ret = GiSimdFmaLane(src0, src1, src2, n); \ +#define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ + ret = GiSimdFmaLane(src0, src1, src2, n); \ compare(n); CB(0) @@ -1571,6 +2455,7 @@ TEST_F(FALLBACK, GiMlaqLowLaneFloat32) { }; #define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ ret = GiMlaqLowLaneFloat32(src0, src1, src2, n); \ compare(n); @@ -1601,6 +2486,7 @@ TEST_F(FALLBACK, GiMlaqHighLaneFloat32) { }; #define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ ret = GiMlaqHighLaneFloat32(src0, src1, src2, n); \ compare(n); @@ -1630,8 +2516,9 @@ TEST_F(FALLBACK, GiFmsqLaneQFloat32) { assert_eq((float*)&ret, naive); }; -#define CB(n) \ - ret = GiFmsqLaneQFloat32(src0, src1, src2, n); \ +#define CB(n) \ + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \ + ret = GiFmsqLaneQFloat32(src0, src1, src2, n); \ compare(n); CB(0) @@ -1645,6 +2532,7 @@ TEST_F(FALLBACK, GiBroadcastUint32) { int32_t src0 = 20220422; GI_UINT32_t ret; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBroadcastUint32(src0); std::vector naive; @@ -1659,6 +2547,7 @@ TEST_F(FALLBACK, GiLoadInt32) { std::vector s0{1, 2, -200, 999}; GI_INT32_t ret; + 
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadInt32(s0.data()); std::vector naive; @@ -1673,6 +2562,7 @@ TEST_F(FALLBACK, GiLoadInt16) { std::vector s0{1, 2, -200, 32767, -32768, 45, 3, 0}; GI_INT16_t ret; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadInt16(s0.data()); auto p = (int16_t*)&ret; @@ -1686,6 +2576,7 @@ TEST_F(FALLBACK, GiLoadInt8) { 11, 2, -128, 127, 2, 55, 3, -1}; GI_INT8_t ret; + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiLoadInt8(s0.data()); auto p = (int8_t*)&ret; @@ -1700,7 +2591,7 @@ TEST_F(FALLBACK, GiStoreInt32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); - std::vector ret; + std::vector ret{0}; ret.resize(SIMD_LEN); GiStoreInt32(ret.data(), src0); @@ -1732,10 +2623,11 @@ TEST_F(FALLBACK, GiReinterInt32ToInt8) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiReinterInt32ToInt8(src0); - naive = (GI_INT8_t)src0; + memcpy(&naive, &src0, GI_SIMD_LEN_BYTE); - ASSERT_FALSE(memcmp(&ret, &naive, sizeof(GI_INT8_t))); + ASSERT_FALSE(memcmp(&ret, &naive, GI_SIMD_LEN_BYTE)); } TEST_F(FALLBACK, GiStoreInt16) { @@ -1744,7 +2636,7 @@ TEST_F(FALLBACK, GiStoreInt16) { s0.resize(SIMD_LEN_16); init((int16_t*)&src0, s0, SIMD_LEN_16); - std::vector ret; + std::vector ret{0}; ret.resize(SIMD_LEN_16); GiStoreInt16(ret.data(), src0); @@ -1757,7 +2649,7 @@ TEST_F(FALLBACK, GiStoreInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); - std::vector ret; + std::vector ret{0}; ret.resize(SIMD_LEN_8); GiStoreInt8(ret.data(), src0); @@ -1770,7 +2662,7 @@ TEST_F(FALLBACK, GiStoreLowInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); - std::vector ret; + std::vector ret{0}; ret.resize(SIMD_LEN_8 / 2); GiStoreLowInt8(ret.data(), src0); @@ -1783,7 +2675,7 @@ TEST_F(FALLBACK, GiStoreHihgInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); - std::vector ret; + std::vector ret{0}; ret.resize(SIMD_LEN_8 / 2); GiStoreHihgInt8(ret.data(), src0); @@ -1803,6 +2695,7 @@ TEST_F(FALLBACK, GiNegInt32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiNegInt32(src0); std::vector naive; @@ -1835,6 +2728,7 @@ TEST_F(FALLBACK, GiNegInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiNegInt8(src0); std::vector naive; @@ -1858,6 +2752,7 @@ TEST_F(FALLBACK, GiTestAndSetUint32) { init((uint32_t*)&src0, s0); init((uint32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiTestAndSetUint32(src0, src1); std::vector naive; @@ -1877,6 +2772,7 @@ TEST_F(FALLBACK, GiAddInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAddInt32(src0, src1); std::vector naive; @@ -1896,6 +2792,7 @@ TEST_F(FALLBACK, GiAddUint32) { init((uint32_t*)&src0, s0); init((uint32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAddUint32(src0, src1); std::vector naive; @@ -1923,6 +2820,7 @@ TEST_F(FALLBACK, GiAddInt16) { init((int16_t*)&src0, s0, SIMD_LEN_16); init((int16_t*)&src1, s1, SIMD_LEN_16); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAddInt16(src0, src1); std::vector naive; @@ -1974,6 +2872,7 @@ TEST_F(FALLBACK, GiAddInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAddInt8(src0, src1); std::vector naive; @@ -1993,6 +2892,7 @@ 
TEST_F(FALLBACK, GiSubtractInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiSubtractInt32(src0, src1); std::vector naive; @@ -2012,6 +2912,7 @@ TEST_F(FALLBACK, GiSubtractUint32) { init((uint32_t*)&src0, s0); init((uint32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiSubtractUint32(src0, src1); std::vector naive; @@ -2063,6 +2964,7 @@ TEST_F(FALLBACK, GiSubtractInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiSubtractInt8(src0, src1); std::vector naive; @@ -2082,6 +2984,7 @@ TEST_F(FALLBACK, GiMultiplyInt32) { init((int32_t*)&src0, s0); init((int32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyInt32(src0, src1); std::vector naive; @@ -2133,6 +3036,7 @@ TEST_F(FALLBACK, GiMultiplyInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyInt8(src0, src1); std::vector naive; @@ -2155,6 +3059,7 @@ TEST_F(FALLBACK, GiMultiplyAddInt32) { init((int32_t*)&src1, s1); init((int32_t*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyAddInt32(src0, src1, src2); std::vector naive; @@ -2225,6 +3130,7 @@ TEST_F(FALLBACK, GiMultiplyAddInt8) { init((int8_t*)&src1, s1, SIMD_LEN_8); init((int8_t*)&src2, s2, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMultiplyAddInt8(src0, src1, src2); std::vector naive; @@ -2276,6 +3182,7 @@ TEST_F(FALLBACK, GiAndInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndInt8(src0, src1); std::vector naive; @@ -2295,6 +3202,7 @@ TEST_F(FALLBACK, GiEOrUint32) { init((uint32_t*)&src0, s0); init((uint32_t*)&src1, s1); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiEOrUint32(src0, src1); std::vector naive; @@ -2346,6 +3254,7 @@ TEST_F(FALLBACK, GiOrInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiOrInt8(src0, src1); std::vector naive; @@ -2397,6 +3306,7 @@ TEST_F(FALLBACK, GiAndNotInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAndNotInt8(src0, src1); std::vector naive; @@ -2448,6 +3358,7 @@ TEST_F(FALLBACK, GiXorInt8) { init((int8_t*)&src0, s0, SIMD_LEN_8); init((int8_t*)&src1, s1, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiXorInt8(src0, src1); std::vector naive; @@ -2464,6 +3375,7 @@ TEST_F(FALLBACK, GiShiftRight23Int32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiShiftRight23Int32(src0); std::vector naive; @@ -2474,6 +3386,23 @@ TEST_F(FALLBACK, GiShiftRight23Int32) { assert_eq((int32_t*)&ret, naive); } +TEST_F(FALLBACK, GiShiftLeft23Int32) { + GI_INT32_t src0, ret; + std::vector s0{1, 2, 3, -4}; + s0.resize(SIMD_LEN); + init((int32_t*)&src0, s0); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiShiftLeft23Int32(src0); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN; i++) { + naive.push_back(s0[i] << 23); + } + + assert_eq((int32_t*)&ret, naive); +} + TEST_F(FALLBACK, GiBlendInt32) { GI_INT32_t src0, src1, src2, ret, na; std::vector s0{1, 2, 3, -4}; @@ -2486,6 +3415,7 @@ TEST_F(FALLBACK, GiBlendInt32) { 
init((int32_t*)&src1, s1); init((int32_t*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBlendInt32(src0, src1, src2); na = GiOrInt32(GiAndInt32(src1, src2), GiAndNotInt32(src2, src0)); @@ -2559,6 +3489,7 @@ TEST_F(FALLBACK, GiBlendInt8) { init((int8_t*)&src1, s1, SIMD_LEN_8); init((int8_t*)&src2, s2, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBlendInt8(src0, src1, src2); na = GiOrInt8(GiAndInt8(src1, src2), GiAndNotInt8(src2, src0)); @@ -2577,6 +3508,7 @@ TEST_F(FALLBACK, GiAbsInt32) { s0.resize(SIMD_LEN); init((int32_t*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAbsInt32(src0); std::vector naive; @@ -2594,6 +3526,7 @@ TEST_F(FALLBACK, GiAbsInt16) { s0.resize(SIMD_LEN_16); init((int16_t*)&src0, s0, SIMD_LEN_16); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAbsInt16(src0); std::vector naive; @@ -2626,6 +3559,7 @@ TEST_F(FALLBACK, GiAbsInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiAbsInt8(src0); std::vector naive; @@ -2652,6 +3586,7 @@ TEST_F(FALLBACK, GiMaximumInt32) { s2.resize(SIMD_LEN); init((int32_t*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMaximumInt32(src0, src1); na = GiBlendInt32(src1, src0, src2); @@ -2680,6 +3615,7 @@ TEST_F(FALLBACK, GiMinimumInt32) { s2.resize(SIMD_LEN); init((int32_t*)&src2, s2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMinimumInt32(src0, src1); na = GiBlendInt32(src1, src0, src2); @@ -2752,6 +3688,7 @@ TEST_F(FALLBACK, GiBlendInt8x16) { init((int8_t*)&src1, s1, SIMD_LEN_8); init((int8_t*)&src2, s2, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiBlendInt8x16(src0, src1, src2); na = GiOrInt8(GiAndInt8(src1, src2), GiAndNotInt8(src2, src0)); @@ -2811,6 +3748,7 @@ TEST_F(FALLBACK, GiMaximumInt8) { } s2.resize(SIMD_LEN_8); init((int8_t*)&src2, s2, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMaximumInt8(src0, src1); na = GiBlendInt8(src1, src0, src2); @@ -2871,6 +3809,7 @@ TEST_F(FALLBACK, GiMinimumInt8) { } s2.resize(SIMD_LEN_8); init((int8_t*)&src2, s2, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMinimumInt8(src0, src1); na = GiBlendInt8(src1, src0, src2); @@ -2909,6 +3848,7 @@ TEST_F(FALLBACK, GiMoveHighLongInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMoveHighLongInt8(src0); std::vector naive; @@ -2944,6 +3884,7 @@ TEST_F(FALLBACK, GiMoveLowLongInt8) { s0.resize(SIMD_LEN_8); init((int8_t*)&src0, s0, SIMD_LEN_8); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMoveLowLongInt8(src0); std::vector naive; @@ -2962,6 +3903,7 @@ TEST_F(FALLBACK, GiMoveHighLongInt16) { s0.resize(SIMD_LEN_16); init((int16_t*)&src0, s0, SIMD_LEN_16); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMoveHighLongInt16(src0); std::vector naive; @@ -2980,6 +3922,7 @@ TEST_F(FALLBACK, GiMoveLowLongInt16) { s0.resize(SIMD_LEN_16); init((int16_t*)&src0, s0, SIMD_LEN_16); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiMoveLowLongInt16(src0); std::vector naive; @@ -2999,7 +3942,7 @@ TEST_F(FALLBACK, GiReduceAddInt8) { ret = GiReduceAddInt8(src0); - int32_t naive{0}; + int16_t naive{0}; for (auto i : s0) { naive += i; } @@ -3050,6 +3993,7 @@ TEST_F(FALLBACK, GiCvtFromFloat32ToInt8) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = 
GiCvtFromFloat32ToInt8(src0); std::vector naive; @@ -3081,6 +4025,7 @@ TEST_F(FALLBACK, GiCvtFromFloat32V2ToInt8) { s0.resize(SIMD_LEN * 2); init((float*)&src0, s0, SIMD_LEN * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiCvtFromFloat32V2ToInt8(src0); std::vector naive; @@ -3119,6 +4064,7 @@ TEST_F(FALLBACK, GiCvtFromFloat32V4ToInt8) { s0.resize(SIMD_LEN * 4); init((float*)&src0, s0, SIMD_LEN * 4); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiCvtFromFloat32V4ToInt8(src0); std::vector naive; @@ -3135,9 +4081,10 @@ TEST_F(FALLBACK, GiCombineFloat32) { GI_FLOAT32_t ret; std::vector s0{1.1f, -3.1415f}; std::vector s1{2.3f, 3.14777f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); - memcpy(&src1, s1.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); + memcpy(&src1, s1.data(), sizeof(float) * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); ret = GiCombineFloat32(src0, src1); std::vector naive; @@ -3156,6 +4103,7 @@ TEST_F(FALLBACK, GiGetLowFloat32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiGetLowFloat32(src0); auto r = (float*)&ret; @@ -3170,6 +4118,7 @@ TEST_F(FALLBACK, GiGetHighFloat32) { s0.resize(SIMD_LEN); init((float*)&src0, s0); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiGetHighFloat32(src0); auto r = (float*)&ret; @@ -3181,9 +4130,10 @@ TEST_F(FALLBACK, GiPaddFloat32) { float32x2_t src0, src1, ret; std::vector s0{1.1f, -3.1415f}; std::vector s1{2.3f, 3.14777f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); - memcpy(&src1, s1.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); + memcpy(&src1, s1.data(), sizeof(float) * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiPaddFloat32(src0, src1); std::vector naive; @@ -3199,9 +4149,10 @@ TEST_F(FALLBACK, GiPmaxFloat32) { float32x2_t src0, src1, ret; std::vector s0{1.1f, -3.1415f}; std::vector s1{2.3f, 3.14777f}; - memcpy(&src0, s0.data(), sizeof(float32x2_t)); - memcpy(&src1, s1.data(), sizeof(float32x2_t)); + memcpy(&src0, s0.data(), sizeof(float) * 2); + memcpy(&src1, s1.data(), sizeof(float) * 2); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE / 2); ret = GiPmaxFloat32(src0, src1); std::vector naive; @@ -3228,9 +4179,10 @@ TEST_F(FALLBACK, GiStoreZipFloat32V2) { GiStoreZipFloat32V2(ret.data(), src0); GI_FLOAT32_V2_t tmp; - tmp = GiZipqFloat32(src0.val[0], src0.val[1]); - GiStoreFloat32(ret_cmp.data(), tmp.val[0]); - GiStoreFloat32(ret_cmp.data() + SIMD_LEN, tmp.val[1]); + tmp = GiZipqFloat32( + GiGetSubVectorFloat32V2(src0, 0), GiGetSubVectorFloat32V2(src0, 1)); + GiStoreFloat32(ret_cmp.data(), GiGetSubVectorFloat32V2(tmp, 0)); + GiStoreFloat32(ret_cmp.data() + SIMD_LEN, GiGetSubVectorFloat32V2(tmp, 1)); assert_eq(ret.data(), ret_cmp, SIMD_LEN * 2); } @@ -3241,6 +4193,7 @@ TEST_F(FALLBACK, GiLoadUzipFloat32V3) { 3.59f, -12.8f, 2.2f, 6.0f, 90.0f, 89.3f}; s0.resize(SIMD_LEN * 3); + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE * 3); ret = GiLoadUzipFloat32V3(s0.data()); std::vector naive; for (size_t i = 0; i < 3; i++) { @@ -3258,7 +4211,17 @@ TEST_F(FALLBACK, GiStoreZipFloat32V3) { std::vector s0{1.1f, 2.2f, 3.5f, 4.9f, 2312.1f, 345.244f, 3.59f, -12.8f, 3.59f, -12.8f, 2.2f, 6.0}; s0.resize(SIMD_LEN * 3); + //! 
the rvv compiler crashes when init is used on a type_x3 value, so use the rvv load API as a workaround +#if defined(GI_RVV_INTRINSICS) + vfloat32m1_t t00, t10, t20; + t00 = vle32_v_f32m1(s0.data(), SIMD_LEN); + t10 = vle32_v_f32m1(s0.data() + SIMD_LEN, 4); + t20 = vle32_v_f32m1(s0.data() + SIMD_LEN * 2, 4); + src0 = vcreate_f32m1x3(t00, t10, t20); +#else init((float*)&src0, s0, SIMD_LEN * 3); +#endif + std::vector<float> ret; ret.resize(SIMD_LEN * 3); @@ -3274,6 +4237,27 @@ TEST_F(FALLBACK, GiStoreZipFloat32V3) { assert_eq(ret.data(), ret_cmp, SIMD_LEN * 3); } +TEST_F(FALLBACK, GiDivFloat32) { + GI_FLOAT32_t src0, src1, ret; + std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f}; + std::vector<float> s1{2312.1f, 345.244f, 3.59f, -12.8f}; + s0.resize(SIMD_LEN); + s1.resize(SIMD_LEN); + init((float*)&src0, s0); + init((float*)&src1, s1); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiDivFloat32(src0, src1); + + std::vector<float> naive; + + for (size_t i = 0; i < SIMD_LEN; i++) { + naive.push_back(s0[i] / s1[i]); + } + + assert_lt((float*)&ret, naive, 1e-3); +} + } // namespace test } // namespace megdnn diff --git a/dnn/test/main.cpp b/dnn/test/main.cpp index 0b4aa84c..9049b847 100644 --- a/dnn/test/main.cpp +++ b/dnn/test/main.cpp @@ -1,5 +1,19 @@ +#if defined(ONLY_BUILD_GI_API) +#include <gtest/gtest.h> + +int gtest_main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + auto ret = RUN_ALL_TESTS(); + return ret; +} + +int main(int argc, char** argv) { + return gtest_main(argc, argv); +} +#else extern "C" int gtest_main(int argc, char** argv); int main(int argc, char** argv) { return gtest_main(argc, argv); } +#endif -- GitLab
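Note on the force_memset_ret calls added throughout the tests above: the helper's definition is not part of the hunks shown in this patch, so the snippet below is only a minimal sketch, assuming the helper does nothing more than poison the destination bytes before the intrinsic under test writes them. The intent is that stale data left in ret cannot make assert_eq pass by accident when a backend (for example the new RVV path) writes fewer bytes than GI_SIMD_LEN_BYTE; the fill pattern here is arbitrary and the concrete helper in dnn/test/fallback/gi.cpp may differ.

#include <cstring>
#include <cstddef>

//! hypothetical sketch (assumption): overwrite the result buffer with a
//! recognizable garbage pattern before the intrinsic under test runs
static inline void force_memset_ret(void* dst, size_t len) {
    memset(dst, 0xcd, len);  //! any non-zero pattern is enough to expose
                             //! an intrinsic that writes too few bytes
}

//! typical call pattern used by the tests above:
//!     force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
//!     ret = GiAddFloat32(src0, src1);
//!     assert_eq((float*)&ret, naive);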