diff --git a/dnn/src/fallback/general_intrinsic/gi_int.h b/dnn/src/fallback/general_intrinsic/gi_int.h index fd37547f0ea08763cbf9691d65d70ffaf3427464..e077e2b5b8e29f949467083dcca455cd4568003d 100644 --- a/dnn/src/fallback/general_intrinsic/gi_int.h +++ b/dnn/src/fallback/general_intrinsic/gi_int.h @@ -122,64 +122,43 @@ GI_FORCEINLINE GI_INT8_V3_t GiLoadUzipInt8V3(const void* Buffer) { #if defined(GI_NEON_INTRINSICS) return vld3q_s8((int8_t*)Buffer); -#elif defined(GI_SSE42_INTRINSICS) - GI_INT8_V3_t v; - __m128i tmp0, tmp1, tmp2, tmp3; - static const int8_t mask8_0[16] = {0, 3, 6, 9, 12, 15, 1, 4, - 7, 10, 13, 2, 5, 8, 11, 14}; - static const int8_t mask8_1[16] = {2, 5, 8, 11, 14, 0, 3, 6, - 9, 12, 15, 1, 4, 7, 10, 13}; - static const int8_t mask8_2[16] = {1, 4, 7, 10, 13, 2, 5, 8, - 11, 14, 0, 3, 6, 9, 12, 15}; +#elif defined(GI_SSE2_INTRINSICS) + GI_INT8_V3_t ret; + __m128i t00 = _mm_loadu_si128((const __m128i*)Buffer); + __m128i t01 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 16)); + __m128i t02 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 32)); - v.val[0] = _mm_loadu_si128((const __m128i*)Buffer); - v.val[1] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 16)); - v.val[2] = _mm_loadu_si128((const __m128i*)((int8_t*)Buffer + 32)); + __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); - tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); - tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); - tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); - - tmp3 = _mm_slli_si128(tmp0, 10); - tmp3 = _mm_alignr_epi8(tmp1, tmp3, 10); // a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x - tmp3 = _mm_slli_si128(tmp3, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, - tmp3 = _mm_srli_si128(tmp3, 5); // a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 - v.val[0] = _mm_slli_si128(tmp2, 11); // 0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, - v.val[0] = _mm_or_si128(v.val[0], tmp3); - - tmp3 = _mm_slli_si128(tmp0, 5); // 0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, - tmp3 = _mm_srli_si128(tmp3, 11); // a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 - v.val[1] = _mm_srli_si128(tmp1, 5); // b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 - v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, - v.val[1] = _mm_or_si128(v.val[1], tmp3); - v.val[1] = _mm_slli_si128(v.val[1], 5); // 0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, - v.val[1] = _mm_srli_si128(v.val[1], 5); // a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 - tmp3 = _mm_srli_si128(tmp2, 5); // c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 - tmp3 = _mm_slli_si128(tmp3, 11); // 0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, - v.val[1] = _mm_or_si128(v.val[1], tmp3); - tmp3 = _mm_srli_si128(tmp2, 10); // c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, - tmp3 = _mm_slli_si128(tmp3, 10); // 0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, - v.val[2] = _mm_srli_si128(tmp1, 11); // b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 - v.val[2] = _mm_slli_si128(v.val[2], 5); // 0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 - v.val[2] = - _mm_or_si128(v.val[2], tmp3); // 0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, - tmp0 = _mm_srli_si128(tmp0, 11); // a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, - v.val[2] = _mm_or_si128(v.val[2], tmp0); - return v; + __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); + + __m128i t30 = 
_mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); + __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); + __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); + + ret.val[0] = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); + ret.val[1] = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); + ret.val[2] = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); + return ret; #elif defined(GI_RVV_INTRINSICS) return vlseg3e8_v_i8m1x3((int8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else - int8_t data[3 * GI_SIMD_LEN_BYTE]; - const int8_t* ptr = (int8_t*)Buffer; - for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { - data[i] = ptr[3 * i]; - data[GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 1]; - data[2 * GI_SIMD_LEN_BYTE + i] = ptr[3 * i + 2]; - } GI_INT8_V3_t ret; - ret.val[0] = GiLoadInt8(data); - ret.val[1] = GiLoadInt8(data + GI_SIMD_LEN_BYTE); - ret.val[2] = GiLoadInt8(data + 2 * GI_SIMD_LEN_BYTE); + GI_INT8_t ret0, ret1, ret2; + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++, i3 += 3) { + ret0[i] = *((int8_t*)Buffer + i3); + ret1[i] = *((int8_t*)Buffer + i3 + 1); + ret2[i] = *((int8_t*)Buffer + i3 + 2); + } + ret.val[0] = ret0; + ret.val[1] = ret1; + ret.val[2] = ret2; + return ret; #endif } @@ -346,7 +325,7 @@ void GiStoreLowInt8(void* Buffer, GI_INT8_t Vector) { } GI_FORCEINLINE -void GiStoreHihgInt8(void* Buffer, GI_INT8_t Vector) { +void GiStoreHighInt8(void* Buffer, GI_INT8_t Vector) { #if defined(GI_NEON_INTRINSICS) vst1_s8((int8_t*)Buffer, vget_high_s8(Vector)); #elif defined(GI_SSE2_INTRINSICS) @@ -854,7 +833,7 @@ GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { int16_t data[8]; int8_t o_data[16]; _mm_storeu_si128((__m128i*)o_data, Vector); - for (int i = 0; i < 8; i++) { + for (size_t i = 0; i < 8; i++) { data[i] = o_data[8 + i]; } return _mm_loadu_si128((__m128i*)data); @@ -882,7 +861,7 @@ GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { int16_t data[8]; int8_t o_data[16]; _mm_storeu_si128((__m128i*)o_data, Vector); - for (int i = 0; i < 8; i++) { + for (size_t i = 0; i < 8; i++) { data[i] = o_data[i]; } return _mm_loadu_si128((__m128i*)data); @@ -909,7 +888,7 @@ GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { int32_t data[4]; int16_t o_data[8]; _mm_storeu_si128((__m128i*)o_data, Vector); - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { data[i] = o_data[4 + i]; } return _mm_loadu_si128((__m128i*)data); @@ -936,7 +915,7 @@ GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { int32_t data[4]; int16_t o_data[8]; _mm_storeu_si128((__m128i*)o_data, Vector); - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { data[i] = o_data[i]; } return _mm_loadu_si128((__m128i*)data); @@ -1154,8 +1133,8 @@ GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { //! 
as a workaround, we imp this API by naive GI_INT8_NAIVE_t tmp_ret; GI_FLOAT32_FIXLEN_t s0 = GiFloat32Type2FixLenType(src); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < length; i++) { + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < length; i++) { int8_t data = Saturate(round(s0[i]), -128, 127); tmp_ret[i] = data; tmp_ret[length + i] = data; @@ -1168,8 +1147,8 @@ GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { GI_INT8_NAIVE_t tmp_ret; GI_FLOAT32_NAIVE_t s0; memcpy(&s0, &src, sizeof(GI_INT32_t)); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < length; i++) { + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < length; i++) { int8_t data = Saturate(round(s0[i]), -128, 127); tmp_ret[i] = data; tmp_ret[length + i] = data; @@ -1229,8 +1208,8 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { //! as a workaround, we imp this API by naive GI_INT8_NAIVE_t tmp_ret; GI_FLOAT32_FIXLEN_V2_t s0 = GiFloat32Type2FixLenV2Type(vsrc); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < 2 * length; i++) { + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < 2 * length; i++) { int8_t data = Saturate(round(s0.val[i / length][i % length]), -128, 127); tmp_ret[i] = data; tmp_ret[i + length * 2] = data; @@ -1241,8 +1220,8 @@ GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { GI_INT8_NAIVE_t tmp_ret; GI_FLOAT32_V2_NAIVE_t s0; memcpy(&s0, &vsrc, sizeof(GI_FLOAT32_V2_NAIVE_t)); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < 2 * length; i++) { + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < 2 * length; i++) { int8_t data = Saturate(round(s0.val[i / length][i % length]), -128, 127); tmp_ret[i] = data; tmp_ret[i + length * 2] = data; @@ -1322,9 +1301,10 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { s0.val[1] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 1)); s0.val[2] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 2)); s0.val[3] = GiFloat32Type2FixLenType(GiGetSubVectorFloat32V4(vsrc, 3)); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < 4 * length; i++) { - tmp_ret[i] = Saturate(round(s0.val[i / length][i % length]), -128, 127); + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < 4 * length; i++) { + tmp_ret[i] = + Saturate(round(s0.val[i / length][i % length]), INT8_MIN, INT8_MAX); } return vle8_v_i8m1((const signed char*)&tmp_ret, GI_SIMD_LEN_BYTE / sizeof(int8_t)); #else @@ -1332,13 +1312,1087 @@ GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { GI_INT8_NAIVE_t tmp_ret; GI_FLOAT32_V4_NAIVE_t s0; memcpy(&s0, &vsrc, sizeof(GI_FLOAT32_V4_NAIVE_t)); - int length = GI_SIMD_LEN_BYTE / sizeof(float); - for (int i = 0; i < 4 * length; i++) { - tmp_ret[i] = Saturate(round(s0.val[i / length][i % length]), -128, 127); + size_t length = GI_SIMD_LEN_BYTE / sizeof(float); + for (size_t i = 0; i < 4 * length; i++) { + tmp_ret[i] = + Saturate(round(s0.val[i / length][i % length]), INT8_MIN, INT8_MAX); } memcpy(&ret, &tmp_ret, sizeof(GI_INT8_t)); return ret; #endif } +GI_FORCEINLINE +GI_UINT8_t GiLoadUint8(const void* Buffer) { +#if defined(GI_NEON_INTRINSICS) + return vld1q_u8((uint8_t*)Buffer); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(GI_RVV_INTRINSICS) + return vle8_v_u8m1((uint8_t*)Buffer, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + const 
uint8_t* ptr = (uint8_t*)Buffer; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++) { + ret[i] = ptr[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE GI_UINT8_t GiReverseUint8(GI_UINT8_t a) { +#if defined(GI_NEON_INTRINSICS) + GI_UINT8_t vec = vrev64q_u8(a); + return vextq_u8(vec, vec, 8); +#elif defined(GI_SSE2_INTRINSICS) + char d[16]; + _mm_storeu_si128((__m128i*)d, a); + return _mm_setr_epi8( + d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], + d[4], d[3], d[2], d[1], d[0]); +#elif defined(GI_RVV_INTRINSICS) + vuint8m1_t index = vundefined_u8m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint8_t idx_num0[16] = {0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0}; + index = vle8_v_u8m1((uint8_t*)idx_num0, 16); +#else + uint8_t* index_p = (uint8_t*)&index; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(int8_t); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { + index_p[i] = offset - i - 1; + } +#endif + + return vrgather_vv_u8m1(a, index, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++) { + ret[i] = a[GI_SIMD_LEN_BYTE / sizeof(uint8_t) - i - 1]; + } + return ret; +#endif +} + +GI_FORCEINLINE +void GiStoreUint8(void* Buffer, GI_UINT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + vst1q_u8((uint8_t*)Buffer, Vector); +#elif defined(GI_SSE2_INTRINSICS) + _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(GI_RVV_INTRINSICS) + vse8_v_u8m1((uint8_t*)Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + uint8_t* ptr = (uint8_t*)Buffer; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++) { + ptr[i] = Vector[i]; + } +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiLoadUzip0V3Uint8(const void* Buffer) { +#if defined(GI_NEON_INTRINSICS) + uint8x16x3_t vec = vld3q_u8((uint8_t*)Buffer); + return vec.val[0]; +#elif defined(GI_SSE2_INTRINSICS) + __m128i t00 = _mm_loadu_si128((const __m128i*)Buffer); + __m128i t01 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 16)); + __m128i t02 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 32)); + + __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); + + __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); + + __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); + __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); + + return _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31)); +#elif defined(GI_RVV_INTRINSICS) + return vlse8_v_u8m1((uint8_t*)Buffer, 3, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++, i3 += 3) { + ret[i] = *((uint8_t*)Buffer + i3); + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiLoadUzip1V3Uint8(const void* Buffer) { +#if defined(GI_NEON_INTRINSICS) + uint8x16x3_t vec = vld3q_u8((uint8_t*)Buffer); + return vec.val[1]; +#elif defined(GI_SSE2_INTRINSICS) + __m128i t00 = _mm_loadu_si128((const __m128i*)Buffer); + __m128i t01 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 16)); + __m128i t02 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 32)); + + __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, 
t01)); + __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); + + __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); + + __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21)); + __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); + + return _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32); +#elif defined(GI_RVV_INTRINSICS) + return vlse8_v_u8m1((uint8_t*)Buffer + 1, 3, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++, i3 += 3) { + ret[i] = *((uint8_t*)Buffer + i3 + 1); + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiLoadUzip2V3Uint8(const void* Buffer) { +#if defined(GI_NEON_INTRINSICS) + uint8x16x3_t vec = vld3q_u8((uint8_t*)Buffer); + return vec.val[2]; +#elif defined(GI_SSE2_INTRINSICS) + __m128i t00 = _mm_loadu_si128((const __m128i*)Buffer); + __m128i t01 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 16)); + __m128i t02 = _mm_loadu_si128((const __m128i*)((uint8_t*)Buffer + 32)); + + __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01)); + __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02); + __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02)); + + __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11)); + __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12); + __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12)); + + __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22); + __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22)); + + return _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32)); +#elif defined(GI_RVV_INTRINSICS) + return vlse8_v_u8m1((uint8_t*)Buffer + 2, 3, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++, i3 += 3) { + ret[i] = *((uint8_t*)Buffer + i3 + 2); + } + return ret; +#endif +} + +GI_FORCEINLINE +void GiStoreZipUint8V3(void* Buffer, GI_UINT8_t a, GI_UINT8_t b, GI_UINT8_t c) { +#if defined(GI_NEON_INTRINSICS) + uint8x16x3_t vec; + vec.val[0] = a; + vec.val[1] = b; + vec.val[2] = c; + vst3q_u8((uint8_t*)Buffer, vec); +#elif defined(GI_SSE2_INTRINSICS) + __m128i z = _mm_setzero_si128(); + __m128i ab0 = _mm_unpacklo_epi8(a, b); + __m128i ab1 = _mm_unpackhi_epi8(a, b); + __m128i c0 = _mm_unpacklo_epi8(c, z); + __m128i c1 = _mm_unpackhi_epi8(c, z); + + __m128i p00 = _mm_unpacklo_epi16(ab0, c0); + __m128i p01 = _mm_unpackhi_epi16(ab0, c0); + __m128i p02 = _mm_unpacklo_epi16(ab1, c1); + __m128i p03 = _mm_unpackhi_epi16(ab1, c1); + + __m128i p10 = _mm_unpacklo_epi32(p00, p01); + __m128i p11 = _mm_unpackhi_epi32(p00, p01); + __m128i p12 = _mm_unpacklo_epi32(p02, p03); + __m128i p13 = _mm_unpackhi_epi32(p02, p03); + + __m128i p20 = _mm_unpacklo_epi64(p10, p11); + __m128i p21 = _mm_unpackhi_epi64(p10, p11); + __m128i p22 = _mm_unpacklo_epi64(p12, p13); + __m128i p23 = _mm_unpackhi_epi64(p12, p13); + + p20 = _mm_slli_si128(p20, 1); + p22 = _mm_slli_si128(p22, 1); + + __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8); + __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8); + __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8); + __m128i p33 = 
_mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8); + + __m128i p40 = _mm_unpacklo_epi64(p30, p31); + __m128i p41 = _mm_unpackhi_epi64(p30, p31); + __m128i p42 = _mm_unpacklo_epi64(p32, p33); + __m128i p43 = _mm_unpackhi_epi64(p32, p33); + + __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); + __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); + __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); + + _mm_storeu_si128((__m128i*)(Buffer), v0); + _mm_storeu_si128((__m128i*)((uint8_t*)Buffer + 16), v1); + _mm_storeu_si128((__m128i*)((uint8_t*)Buffer + 32), v2); + +#elif defined(GI_RVV_INTRINSICS) + vsse8_v_u8m1((uint8_t*)Buffer, 3, a, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + vsse8_v_u8m1((uint8_t*)Buffer + 1, 3, b, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + vsse8_v_u8m1((uint8_t*)Buffer + 2, 3, c, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t); i++, i3 += 3) { + *((uint8_t*)Buffer + i3) = a[i]; + *((uint8_t*)Buffer + i3 + 1) = b[i]; + *((uint8_t*)Buffer + i3 + 2) = c[i]; + } +#endif +} + +#if defined(GI_NEON_INTRINSICS) +#define GiShiftRightInt16ToUint8(Vector, shift) \ + __extension__({ \ + uint8x8_t vec = vqshrun_n_s16(Vector, shift); \ + uint8x16_t _ret = vcombine_u8(vec, vec); \ + _ret; \ + }) +#elif defined(GI_SSE2_INTRINSICS) +#define GiShiftRightInt16ToUint8(Vector, shift) \ + __extension__({ \ + __m128i vec = _mm_srai_epi16(Vector, shift); \ + __m128i _ret = _mm_packus_epi16(vec, vec); \ + _ret; \ + }) +#elif defined(GI_RVV_INTRINSICS) +#define GiShiftRightInt16ToUint8(Vector, shift) \ + __extension__({ \ + vint16m1_t src1 = \ + vsra_vx_i16m1(Vector, shift, GI_SIMD_LEN_BYTE / sizeof(int16_t)); \ + vint16m2_t max, min, dest; \ + max = vmv_v_x_i16m2(UINT8_MAX, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + min = vmv_v_x_i16m2(0, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + vbool8_t mask; \ + dest = vset_v_i16m1_i16m2(dest, 0, src1); \ + dest = vset_v_i16m1_i16m2(dest, 1, src1); \ + mask = vmsgt_vv_i16m2_b8(dest, min, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + dest = vmerge_vvm_i16m2(mask, min, dest, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + mask = vmslt_vv_i16m2_b8(dest, max, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + dest = vmerge_vvm_i16m2(mask, max, dest, GI_SIMD_LEN_BYTE / sizeof(int8_t)); \ + vuint8m1_t _ret = vreinterpret_v_i8m1_u8m1( \ + vncvt_x_x_w_i8m1(dest, GI_SIMD_LEN_BYTE / sizeof(int8_t))); \ + _ret; \ + }) +#else +#define GiShiftRightInt16ToUint8(Vector, shift) \ + __extension__({ \ + GI_UINT8_t _ret; \ + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { \ + uint8_t val = Saturate(Vector[i] >> shift, 0, UINT8_MAX); \ + _ret[i] = val; \ + _ret[i + GI_SIMD_LEN_BYTE / sizeof(int16_t)] = val; \ + } \ + _ret; \ + }) +#endif + +GI_FORCEINLINE +GI_INT16_t GiCombineInt16Low(GI_INT16_t Vector0, GI_INT16_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vcombine_s16(vget_low_s16(Vector0), vget_low_s16(Vector1)); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi64(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + return vslideup_vx_i16m1( + Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(int16_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Vector0[i]; + ret[i + GI_SIMD_LEN_BYTE / sizeof(int32_t)] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiCombineUint8Low(GI_UINT8_t Vector0, GI_UINT8_t Vector1) { +#if 
defined(GI_NEON_INTRINSICS) + return vcombine_u8(vget_low_u8(Vector0), vget_low_u8(Vector1)); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi64(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + return vslideup_vx_u8m1( + Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = Vector0[i]; + ret[i + GI_SIMD_LEN_BYTE / sizeof(int16_t)] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiZipV0Int8(GI_INT8_t Vector0, GI_INT8_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s8(Vector0, Vector1).val[0]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi8(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + int8_t mask_idx[16] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + vint8m1_t mask_vec = + vle8_v_i8m1((int8_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vint8m1_t zero = vmv_v_x_i8m1(0, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vbool8_t mask = vmsgt_vv_i8m1_b8(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vuint8m1_t index0 = vundefined_u8m1(); + vuint8m1_t index1 = vundefined_u8m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint8_t idx_num0[16] = {0x0, 0x8, 0x1, 0x9, 0x2, 0xa, 0x3, 0xb, + 0x4, 0xc, 0x5, 0xd, 0x6, 0xe, 0x7, 0xf}; + uint8_t idx_num1[16] = {0x8, 0x0, 0x9, 0x1, 0xa, 0x2, 0xb, 0x3, + 0xc, 0x4, 0xd, 0x5, 0xe, 0x6, 0xf, 0x7}; + index0 = vle8_v_u8m1((uint8_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + index1 = vle8_v_u8m1((uint8_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + uint8_t* index_p0 = (uint8_t*)&index0; + uint8_t* index_p1 = (uint8_t*)&index1; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + + Vector0 = vrgather_vv_i8m1(Vector0, index0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + Vector1 = vrgather_vv_i8m1(Vector1, index1, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + return vmerge_vvm_i8m1(mask, Vector1, Vector0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_INT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); ++i) { + ret[2 * i] = Vector0[i]; + ret[2 * i + 1] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiZipV1Int8(GI_INT8_t Vector0, GI_INT8_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s8(Vector0, Vector1).val[1]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpackhi_epi8(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + int8_t mask_idx[16] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + vint8m1_t mask_vec = + vle8_v_i8m1((int8_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vint8m1_t zero = vmv_v_x_i8m1(0, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vbool8_t mask = vmsgt_vv_i8m1_b8(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int8_t)); + vuint8m1_t index0 = vundefined_u8m1(); + vuint8m1_t index1 = vundefined_u8m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint8_t idx_num0[16] = {0x0, 0x8, 0x1, 0x9, 0x2, 0xa, 0x3, 0xb, + 0x4, 0xc, 0x5, 0xd, 0x6, 0xe, 0x7, 0xf}; + uint8_t idx_num1[16] = {0x8, 0x0, 0x9, 0x1, 0xa, 0x2, 0xb, 0x3, + 0xc, 0x4, 0xd, 0x5, 0xe, 0x6, 0xf, 0x7}; + index0 = vle8_v_u8m1((uint8_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + index1 = vle8_v_u8m1((uint8_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + uint8_t* index_p0 = (uint8_t*)&index0; + uint8_t* 
index_p1 = (uint8_t*)&index1; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + Vector0 = vrgather_vv_i8m1(Vector0, index1, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + Vector1 = vrgather_vv_i8m1(Vector1, index0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + return vmerge_vvm_i8m1(mask, Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_INT8_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); ++i) { + ret[2 * i] = Vector0[i + GI_SIMD_LEN_BYTE / sizeof(int16_t)]; + ret[2 * i + 1] = Vector1[i + GI_SIMD_LEN_BYTE / sizeof(int16_t)]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiReinterpretInt8AsInt16(GI_INT8_t In) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s16_s8(In); +#elif defined(GI_SSE2_INTRINSICS) + return (GI_INT16_t)In; +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_i8m1_i16m1(In); +#else + return (GI_INT16_t)In; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiZipV0Int16(GI_INT16_t Vector0, GI_INT16_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s16(Vector0, Vector1).val[0]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi16(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + int16_t mask_idx[8] = {1, 0, 1, 0, 1, 0, 1, 0}; + vint16m1_t mask_vec = + vle16_v_i16m1((int16_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vint16m1_t zero = vmv_v_x_i16m1(0, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vbool16_t mask = + vmsgt_vv_i16m1_b16(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vuint16m1_t index0 = vundefined_u16m1(); + vuint16m1_t index1 = vundefined_u16m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint16_t idx_num0[8] = {0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7}; + uint16_t idx_num1[8] = {0x4, 0x0, 0x5, 0x1, 0x6, 0x2, 0x7, 0x3}; + index0 = vle16_v_u16m1((uint16_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + index1 = vle16_v_u16m1((uint16_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); +#else + uint16_t* index_p0 = (uint16_t*)&index0; + uint16_t* index_p1 = (uint16_t*)&index1; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(uint16_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint16_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + + Vector0 = vrgather_vv_i16m1(Vector0, index0, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + Vector1 = vrgather_vv_i16m1(Vector1, index1, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + return vmerge_vvm_i16m1(mask, Vector1, Vector0, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[2 * i] = Vector0[i]; + ret[2 * i + 1] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiZipV1Int16(GI_INT16_t Vector0, GI_INT16_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s16(Vector0, Vector1).val[1]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpackhi_epi16(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + int16_t mask_idx[8] = {0, 1, 0, 1, 0, 1, 0, 1}; + vint16m1_t mask_vec = + vle16_v_i16m1((int16_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vint16m1_t zero = vmv_v_x_i16m1(0, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vbool16_t mask = + vmsgt_vv_i16m1_b16(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vuint16m1_t index0 = vundefined_u16m1(); + 
vuint16m1_t index1 = vundefined_u16m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint16_t idx_num0[8] = {0x0, 0x4, 0x1, 0x5, 0x2, 0x6, 0x3, 0x7}; + uint16_t idx_num1[8] = {0x4, 0x0, 0x5, 0x1, 0x6, 0x2, 0x7, 0x3}; + index0 = vle16_v_u16m1((uint16_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + index1 = vle16_v_u16m1((uint16_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); +#else + uint16_t* index_p0 = (uint16_t*)&index0; + uint16_t* index_p1 = (uint16_t*)&index1; + int32_t offset = GI_SIMD_LEN_BYTE / sizeof(uint16_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint16_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + Vector0 = vrgather_vv_i16m1(Vector0, index1, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + Vector1 = vrgather_vv_i16m1(Vector1, index0, GI_SIMD_LEN_BYTE / sizeof(uint16_t)); + return vmerge_vvm_i16m1(mask, Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); ++i) { + ret[2 * i] = Vector0[i + GI_SIMD_LEN_BYTE / sizeof(int32_t)]; + ret[2 * i + 1] = Vector1[i + GI_SIMD_LEN_BYTE / sizeof(int32_t)]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiReinterpretInt16AsInt32(GI_INT16_t In) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s32_s16(In); +#elif defined(GI_SSE2_INTRINSICS) + return (GI_INT32_t)In; +#elif defined(GI_RVV_INTRINSICS) + return vreinterpret_v_i16m1_i32m1(In); +#else + return (GI_INT32_t)In; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiZipV0Int32(GI_INT32_t Vector0, GI_INT32_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s32(Vector0, Vector1).val[0]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi32(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + int32_t mask_idx[4] = {1, 0, 1, 0}; + vint32m1_t mask_vec = + vle32_v_i32m1((int32_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vint32m1_t zero = vmv_v_x_i32m1(0, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vbool32_t mask = + vmsgt_vv_i32m1_b32(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vuint32m1_t index0 = vundefined_u32m1(); + vuint32m1_t index1 = vundefined_u32m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint32_t idx_num0[4] = {0x0, 0x2, 0x1, 0x3}; + uint32_t idx_num1[4] = {0x2, 0x0, 0x3, 0x1}; + index0 = vle32_v_u32m1((uint32_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + index1 = vle32_v_u32m1((uint32_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); +#else + uint32_t* index_p0 = (uint32_t*)&index0; + uint32_t* index_p1 = (uint32_t*)&index1; + size_t offset = GI_SIMD_LEN_BYTE / sizeof(uint32_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint32_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + Vector0 = vrgather_vv_i32m1(Vector0, index0, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + Vector1 = vrgather_vv_i32m1(Vector1, index1, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + return vmerge_vvm_i32m1(mask, Vector1, Vector0, GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int64_t); i++) { + ret[2 * i] = Vector0[i]; + ret[2 * i + 1] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiZipV1Int32(GI_INT32_t Vector0, GI_INT32_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vzipq_s32(Vector0, Vector1).val[1]; +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpackhi_epi32(Vector0, Vector1); 
+#elif defined(GI_RVV_INTRINSICS) + int32_t mask_idx[4] = {0, 1, 0, 1}; + vint32m1_t mask_vec = + vle32_v_i32m1((int32_t*)mask_idx, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vint32m1_t zero = vmv_v_x_i32m1(0, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vbool32_t mask = + vmsgt_vv_i32m1_b32(mask_vec, zero, GI_SIMD_LEN_BYTE / sizeof(int32_t)); + vuint32m1_t index0 = vundefined_u32m1(); + vuint32m1_t index1 = vundefined_u32m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint32_t idx_num0[4] = {0x0, 0x2, 0x1, 0x3}; + uint32_t idx_num1[4] = {0x2, 0x0, 0x3, 0x1}; + index0 = vle32_v_u32m1((uint32_t*)idx_num0, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + index1 = vle32_v_u32m1((uint32_t*)idx_num1, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); +#else + uint32_t* index_p0 = (uint32_t*)&index0; + uint32_t* index_p1 = (uint32_t*)&index1; + size_t offset = GI_SIMD_LEN_BYTE / sizeof(uint32_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint32_t) / 2; i++) { + index_p0[2 * i] = i; + index_p0[2 * i + 1] = i + offset; + index_p1[2 * i] = i + offset; + index_p1[2 * i + 1] = i; + } +#endif + Vector0 = vrgather_vv_i32m1(Vector0, index1, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + Vector1 = vrgather_vv_i32m1(Vector1, index0, GI_SIMD_LEN_BYTE / sizeof(uint32_t)); + return vmerge_vvm_i32m1(mask, Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int64_t); i++) { + ret[2 * i] = Vector0[i + GI_SIMD_LEN_BYTE / sizeof(int64_t)]; + ret[2 * i + 1] = Vector1[i + GI_SIMD_LEN_BYTE / sizeof(int64_t)]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiCombineInt32Low(GI_INT32_t Vector0, GI_INT32_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vcombine_s32(vget_low_s32(Vector0), vget_low_s32(Vector1)); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpacklo_epi64(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + return vslideup_vx_i32m1( + Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int64_t); i++) { + ret[i] = Vector0[i]; + ret[i + GI_SIMD_LEN_BYTE / sizeof(int64_t)] = Vector1[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiCombineInt32High(GI_INT32_t Vector0, GI_INT32_t Vector1) { +#if defined(GI_NEON_INTRINSICS) + return vcombine_s32(vget_high_s32(Vector0), vget_high_s32(Vector1)); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_unpackhi_epi64(Vector0, Vector1); +#elif defined(GI_RVV_INTRINSICS) + Vector0 = vslidedown_vx_i32m1( + Vector0, Vector0, GI_SIMD_LEN_BYTE / sizeof(int32_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); + Vector1 = vslidedown_vx_i32m1( + Vector1, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); + return vslideup_vx_i32m1( + Vector0, Vector1, GI_SIMD_LEN_BYTE / sizeof(int32_t) / 2, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int64_t); i++) { + ret[i] = Vector0[i + GI_SIMD_LEN_BYTE / sizeof(int64_t)]; + ret[i + GI_SIMD_LEN_BYTE / sizeof(int64_t)] = + Vector1[i + +GI_SIMD_LEN_BYTE / sizeof(int64_t)]; + } + return ret; +#endif +} + +GI_FORCEINLINE +void GiStoreZipInt8V3(void* Buffer, GI_INT8_t a, GI_INT8_t b, GI_INT8_t c) { +#if defined(GI_NEON_INTRINSICS) + int8x16x3_t vec; + vec.val[0] = a; + vec.val[1] = b; + vec.val[2] = c; + vst3q_s8((int8_t*)Buffer, vec); +#elif defined(GI_SSE2_INTRINSICS) + __m128i z = _mm_setzero_si128(); + __m128i ab0 = _mm_unpacklo_epi8(a, b); + 
__m128i ab1 = _mm_unpackhi_epi8(a, b); + __m128i c0 = _mm_unpacklo_epi8(c, z); + __m128i c1 = _mm_unpackhi_epi8(c, z); + + __m128i p00 = _mm_unpacklo_epi16(ab0, c0); + __m128i p01 = _mm_unpackhi_epi16(ab0, c0); + __m128i p02 = _mm_unpacklo_epi16(ab1, c1); + __m128i p03 = _mm_unpackhi_epi16(ab1, c1); + + __m128i p10 = _mm_unpacklo_epi32(p00, p01); + __m128i p11 = _mm_unpackhi_epi32(p00, p01); + __m128i p12 = _mm_unpacklo_epi32(p02, p03); + __m128i p13 = _mm_unpackhi_epi32(p02, p03); + + __m128i p20 = _mm_unpacklo_epi64(p10, p11); + __m128i p21 = _mm_unpackhi_epi64(p10, p11); + __m128i p22 = _mm_unpacklo_epi64(p12, p13); + __m128i p23 = _mm_unpackhi_epi64(p12, p13); + + p20 = _mm_slli_si128(p20, 1); + p22 = _mm_slli_si128(p22, 1); + + __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8); + __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8); + __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8); + __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8); + + __m128i p40 = _mm_unpacklo_epi64(p30, p31); + __m128i p41 = _mm_unpackhi_epi64(p30, p31); + __m128i p42 = _mm_unpacklo_epi64(p32, p33); + __m128i p43 = _mm_unpackhi_epi64(p32, p33); + + __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); + __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6)); + __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2)); + + _mm_storeu_si128((__m128i*)(Buffer), v0); + _mm_storeu_si128((__m128i*)((int8_t*)Buffer + 16), v1); + _mm_storeu_si128((__m128i*)((int8_t*)Buffer + 32), v2); + +#elif defined(GI_RVV_INTRINSICS) + vsseg3e8_v_i8m1((int8_t*)Buffer, a, b, c, GI_SIMD_LEN_BYTE / sizeof(int8_t)); +#else + size_t i, i3; + for (i = i3 = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++, i3 += 3) { + *((int8_t*)Buffer + i3) = a[i]; + *((int8_t*)Buffer + i3 + 1) = b[i]; + *((int8_t*)Buffer + i3 + 2) = c[i]; + } +#endif +} + +#if defined(GI_NEON_INTRINSICS) +#define GiShiftRightInt32(Vector, n) vshrq_n_s32(Vector, n) +#elif defined(GI_SSE2_INTRINSICS) +#define GiShiftRightInt32(Vector, n) _mm_srai_epi32(Vector, n) +#elif defined(GI_RVV_INTRINSICS) +#define GiShiftRightInt32(Vector, n) \ + vsra_vx_i32m1(Vector, n, GI_SIMD_LEN_BYTE / sizeof(int32_t)) +#else +GI_INT32_t ShiftRightNaive(GI_INT32_t src, const size_t shift) { + GI_INT32_t ret; + for (size_t idx = 0; idx < GI_SIMD_LEN_BYTE / sizeof(int32_t); ++idx) { + ret[idx] = src[idx] >> shift; + } + return ret; +} +#define GiShiftRightInt32(Vector, n) ShiftRightNaive(Vector, n) + +#endif + +#if defined(GI_NEON_INTRINSICS) +#define GiShiftLeftInt32(Vector, n) vshlq_n_s32(Vector, n) +#elif defined(GI_SSE2_INTRINSICS) +#define GiShiftLeftInt32(Vector, n) _mm_slli_epi32(Vector, n) +#elif defined(GI_RVV_INTRINSICS) +#define GiShiftLeftInt32(Vector, n) \ + vsll_vx_i32m1(Vector, n, GI_SIMD_LEN_BYTE / sizeof(int32_t)) +#else +GI_INT32_t ShiftLeftNaive(GI_INT32_t src, const size_t shift) { + GI_INT32_t ret; + for (size_t idx = 0; idx < GI_SIMD_LEN_BYTE / sizeof(int32_t); ++idx) { + ret[idx] = src[idx] << shift; + } + return ret; +} +#define GiShiftLeftInt32(Vector, n) ShiftLeftNaive(Vector, n) +#endif + +GI_FORCEINLINE +GI_INT16_t GiBroadcastInt16(int16_t Value) { +#if defined(GI_NEON_INTRINSICS) + return vdupq_n_s16(Value); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_set1_epi16(Value); +#elif defined(GI_RVV_INTRINSICS) + return vmv_v_x_i16m1(Value, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + 
ret[i] = Value; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiAndInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vandq_s16(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_and_si128(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vand_vv_i16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = Vector1[i] & Vector2[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiSubtractInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vsubq_s16(Vector1, Vector2); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_sub_epi16(Vector1, Vector2); +#elif defined(GI_RVV_INTRINSICS) + return vsub_vv_i16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = Vector1[i] - Vector2[i]; + } + return ret; +#endif +} +GI_FORCEINLINE +GI_INT16_t GiCvtInt32ToInt16(GI_INT32_t Vector) { +#if defined(GI_NEON_INTRINSICS) + int16x4_t vec = vqmovn_s32(Vector); + return vcombine_s16(vec, vec); +#elif defined(GI_SSE2_INTRINSICS) + return _mm_packs_epi32(Vector, Vector); +#elif defined(GI_RVV_INTRINSICS) + vint32m2_t dest = vundefined_i32m2(); + vint32m2_t max = vmv_v_x_i32m2(INT16_MAX, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vint32m2_t min = vmv_v_x_i32m2(INT16_MIN, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + dest = vset_v_i32m1_i32m2(dest, 0, Vector); + dest = vset_v_i32m1_i32m2(dest, 1, Vector); + vbool16_t mask = vmsgt_vv_i32m2_b16(dest, min, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + dest = vmerge_vvm_i32m2(mask, min, dest, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + mask = vmslt_vv_i32m2_b16(dest, max, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + dest = vmerge_vvm_i32m2(mask, max, dest, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vncvt_x_x_w_i16m1(dest, GI_SIMD_LEN_BYTE / sizeof(int16_t)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Saturate(Vector[i], INT16_MIN, INT16_MAX); + ret[i + GI_SIMD_LEN_BYTE / sizeof(int32_t)] = ret[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT8_t GiInterleave4Int8(GI_INT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + uint8x16_t idx = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; + return vqtbl1q_s8(Vector, idx); +#elif defined(GI_SSE2_INTRINSICS) + __m128i src0 = _mm_shufflelo_epi16(Vector, 0xd8); + src0 = _mm_shufflehi_epi16(src0, 0xd8); + + __m128i src1 = _mm_shuffle_epi32(src0, 0xd8); + __m128i src2 = _mm_bsrli_si128(src1, 2); + + __m128i src3 = _mm_unpacklo_epi8(src1, src2); + __m128i src4 = _mm_unpackhi_epi8(src1, src2); + + __m128i src5 = _mm_shuffle_epi32(src3, 0xd8); + __m128i src6 = _mm_shuffle_epi32(src4, 0xd8); + + __m128i src7 = _mm_unpacklo_epi64(src5, src6); + __m128i ans = _mm_shufflelo_epi16(src7, 0xd8); + return _mm_shufflehi_epi16(ans, 0xd8); +#elif defined(GI_RVV_INTRINSICS) + vuint8m1_t index = vundefined_u8m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint8_t idx[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; + index = vle8_v_u8m1((uint8_t*)idx, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + uint8_t* index_p = (uint8_t*)&index; + size_t offset = GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 4; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 4; i++) { + index_p[i] = 4 * i; + index_p[i + 1 * offset] = 4 * i + 1; + index_p[i + 2 * offset] = 4 * i + 2; + 
index_p[i + 3 * offset] = 4 * i + 3; + } +#endif + + return vrgather_vv_i8m1(Vector, index, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_INT8_t ret; + size_t offset = GI_SIMD_LEN_BYTE / sizeof(int32_t); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = Vector[i * 4 + 0]; + ret[i + 1 * offset] = Vector[i * 4 + 1]; + ret[i + 2 * offset] = Vector[i * 4 + 2]; + ret[i + 3 * offset] = Vector[i * 4 + 3]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiCvtUint8toInt16Low(GI_UINT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(Vector))); +#elif defined(GI_SSE2_INTRINSICS) + __m128i sign_mask = _mm_setzero_si128(); + return _mm_unpacklo_epi8(Vector, sign_mask); +#elif defined(GI_RVV_INTRINSICS) + vuint16m2_t vec = vwcvtu_x_x_v_u16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + return vreinterpret_v_u16m1_i16m1(vget_v_u16m2_u16m1(vec, 0)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = (int16_t)Vector[i]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT16_t GiCvtUint8toInt16High(GI_UINT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + return vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(Vector))); +#elif defined(GI_SSE2_INTRINSICS) + __m128i sign_mask = _mm_setzero_si128(); + return _mm_unpackhi_epi8(Vector, sign_mask); +#elif defined(GI_RVV_INTRINSICS) + vuint16m2_t vec = vwcvtu_x_x_v_u16m2(Vector, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + return vreinterpret_v_u16m1_i16m1(vget_v_u16m2_u16m1(vec, 1)); +#else + GI_INT16_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = (int16_t)Vector[i + GI_SIMD_LEN_BYTE / sizeof(int16_t)]; + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiMultiplyAddInt16LongLow( + GI_INT32_t Vector0, GI_INT16_t Vector1, GI_INT16_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vmlal_s16(Vector0, vget_low_s16(Vector1), vget_low_s16(Vector2)); +#elif defined(GI_SSE2_INTRINSICS) + __m128i lo = _mm_mullo_epi16(Vector1, Vector2); + __m128i hi = _mm_mulhi_epi16(Vector1, Vector2); + return _mm_add_epi32(Vector0, _mm_unpacklo_epi16(lo, hi)); +#elif defined(GI_RVV_INTRINSICS) + vint32m2_t vec1 = vwcvt_x_x_v_i32m2(Vector1, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vint32m2_t vec2 = vwcvt_x_x_v_i32m2(Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vmadd_vv_i32m1( + vget_v_i32m2_i32m1(vec1, 0), vget_v_i32m2_i32m1(vec2, 0), Vector0, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { + ret[i] = (int32_t)Vector0[i] + (int32_t)Vector1[i] * (int32_t)(Vector2[i]); + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_INT32_t GiMultiplyAddInt16LongHigh( + GI_INT32_t Vector0, GI_INT16_t Vector1, GI_INT16_t Vector2) { +#if defined(GI_NEON_INTRINSICS) + return vmlal_s16(Vector0, vget_high_s16(Vector1), vget_high_s16(Vector2)); +#elif defined(GI_SSE2_INTRINSICS) + __m128i lo = _mm_mullo_epi16(Vector1, Vector2); + __m128i hi = _mm_mulhi_epi16(Vector1, Vector2); + return _mm_add_epi32(Vector0, _mm_unpackhi_epi16(lo, hi)); +#elif defined(GI_RVV_INTRINSICS) + vint32m2_t vec1 = vwcvt_x_x_v_i32m2(Vector1, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + vint32m2_t vec2 = vwcvt_x_x_v_i32m2(Vector2, GI_SIMD_LEN_BYTE / sizeof(int16_t)); + return vmadd_vv_i32m1( + vget_v_i32m2_i32m1(vec1, 1), vget_v_i32m2_i32m1(vec2, 1), Vector0, + GI_SIMD_LEN_BYTE / sizeof(int32_t)); + +#else + GI_INT32_t ret; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 
sizeof(int32_t); i++) { + size_t idx = GI_SIMD_LEN_BYTE / sizeof(int32_t) + i; + ret[i] = (int32_t)Vector0[i] + (int32_t)Vector1[idx] * (int32_t)(Vector2[idx]); + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiCvtFromInt32V4ToUint8( + GI_INT32_t Vector0, GI_INT32_t Vector1, GI_INT32_t Vector2, + GI_INT32_t Vector3) { +#if defined(GI_NEON_INTRINSICS) + int16x8_t mid1 = vmaxq_s16( + vdupq_n_s16(0), + vminq_s16( + vcombine_s16(vqmovn_s32(Vector0), vqmovn_s32(Vector1)), + vdupq_n_s16(UINT8_MAX))); + int16x8_t mid2 = vmaxq_s16( + vdupq_n_s16(0), + vminq_s16( + vcombine_s16(vqmovn_s32(Vector2), vqmovn_s32(Vector3)), + vdupq_n_s16(UINT8_MAX))); + return vcombine_u8( + vqmovn_u16(vreinterpretq_u16_s16(mid1)), + vqmovn_u16(vreinterpretq_u16_s16(mid2))); +#elif defined(GI_SSE2_INTRINSICS) + __m128i vepi16_0 = _mm_packs_epi32(Vector0, Vector1); + __m128i vepi16_1 = _mm_packs_epi32(Vector2, Vector3); + return _mm_packus_epi16(vepi16_0, vepi16_1); +#elif defined(GI_RVV_INTRINSICS) + vint32m4_t dest = vundefined_i32m4(); + dest = vset_v_i32m1_i32m4(dest, 0, Vector0); + dest = vset_v_i32m1_i32m4(dest, 1, Vector1); + dest = vset_v_i32m1_i32m4(dest, 2, Vector2); + dest = vset_v_i32m1_i32m4(dest, 3, Vector3); + vint32m4_t max = vmv_v_x_i32m4(UINT8_MAX, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + vint32m4_t min = vmv_v_x_i32m4(0, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + vbool8_t mask = vmsgt_vv_i32m4_b8(dest, min, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + dest = vmerge_vvm_i32m4(mask, min, dest, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + mask = vmslt_vv_i32m4_b8(dest, max, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + dest = vmerge_vvm_i32m4(mask, max, dest, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); + vuint16m2_t ans16 = vreinterpret_v_i16m2_u16m2( + vncvt_x_x_w_i16m2(dest, GI_SIMD_LEN_BYTE / sizeof(uint8_t))); + return vncvt_x_x_w_u8m1(ans16, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + size_t length = GI_SIMD_LEN_BYTE / sizeof(int32_t); + for (size_t i = 0; i < length; i++) { + ret[i] = Saturate(Vector0[i], 0, UINT8_MAX); + ret[i + length] = Saturate(Vector1[i], 0, UINT8_MAX); + ret[i + length * 2] = Saturate(Vector2[i], 0, UINT8_MAX); + ret[i + length * 3] = Saturate(Vector3[i], 0, UINT8_MAX); + } + return ret; +#endif +} + +GI_FORCEINLINE +GI_UINT8_t GiInterleave2Uint8(GI_UINT8_t Vector) { +#if defined(GI_NEON_INTRINSICS) + uint8x16_t idx = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + return vreinterpretq_u8_s8(vqtbl1q_s8(vreinterpretq_s8_u8(Vector), idx)); +#elif defined(GI_SSE2_INTRINSICS) + __m128i src1 = Vector; + __m128i src2 = _mm_bsrli_si128(src1, 2); + + __m128i src3 = _mm_unpacklo_epi8(src1, src2); + __m128i src4 = _mm_unpackhi_epi8(src1, src2); + + __m128i src5 = _mm_shuffle_epi32(src3, 0xd8); + __m128i src6 = _mm_shuffle_epi32(src4, 0xd8); + + __m128i src7 = _mm_shufflelo_epi16(src5, 0xd8); + __m128i src8 = _mm_shufflelo_epi16(src6, 0xd8); + return _mm_unpacklo_epi32(src7, src8); +#elif defined(GI_RVV_INTRINSICS) + vuint8m1_t index = vundefined_u8m1(); +#if GI_SIMD_LEN_BYTE == 16 + uint8_t idx[16] = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + index = vle8_v_u8m1((uint8_t*)idx, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + uint8_t* index_p = (uint8_t*)&index; + size_t offset = GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(uint8_t) / 2; i++) { + index_p[i] = 2 * i; + index_p[i + offset] = 2 * i + 1; + } +#endif + + return vrgather_vv_u8m1(Vector, index, GI_SIMD_LEN_BYTE / sizeof(uint8_t)); +#else + GI_UINT8_t ret; + 
size_t offset = GI_SIMD_LEN_BYTE / sizeof(int16_t); + for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { + ret[i] = Vector[2 * i]; + ret[i + offset] = Vector[2 * i + 1]; + } + return ret; +#endif +} + // vim: syntax=cpp.doxygen diff --git a/dnn/test/fallback/gi.cpp b/dnn/test/fallback/gi.cpp index 2243fded8f10d9e5e4bba9980a8a7eead2de19f5..7599516c60d3d08848fc5c0f7693c0bbb51cf1a5 100644 --- a/dnn/test/fallback/gi.cpp +++ b/dnn/test/fallback/gi.cpp @@ -2122,7 +2122,7 @@ TEST_F(FALLBACK, GiBSLFloat32) { #if defined(GI_RVV_INTRINSICS) vuint32m1_t mask = vundefined_u32m1(); #else - GI_UINT32_t mask; + GI_UINT32_t mask = {0u, 0u}; #endif std::vector s0{1.1f, 2.2f, 4.5f, 4.9f}; std::vector s1{2312.1f, 345.244f, 3.59f, -12.8f}; @@ -2752,7 +2752,7 @@ TEST_F(FALLBACK, GiStoreLowInt8) { assert_eq(ret.data(), s0, SIMD_LEN_8 / 2); } -TEST_F(FALLBACK, GiStoreHihgInt8) { +TEST_F(FALLBACK, GiStoreHighInt8) { GI_INT8_t src0; std::vector s0{127, 2, 56, -128, 1, 2, 3, 4, 127, 2, 56, -128, 1, 2, 3, 4}; s0.resize(SIMD_LEN_8); @@ -2760,7 +2760,7 @@ TEST_F(FALLBACK, GiStoreHihgInt8) { std::vector ret{0}; ret.resize(SIMD_LEN_8 / 2); - GiStoreHihgInt8(ret.data(), src0); + GiStoreHighInt8(ret.data(), src0); std::vector naive; for (size_t i = 0; i < SIMD_LEN_8 / 2; i++) { @@ -4360,6 +4360,656 @@ TEST_F(FALLBACK, GiDivFloat32) { assert_lt((float*)&ret, naive, 1e-3); } +TEST_F(FALLBACK, GiLoadUint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255}; + GI_UINT8_t ret; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiLoadUint8(s0.data()); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[i]); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiReverseUint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + GI_UINT8_t ret; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiLoadUint8(s0.data()); + ret = GiReverseUint8(ret); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[SIMD_LEN_8 - i - 1]); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiStoreUint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255}; + GI_UINT8_t src; + std::vector ret; + ret.resize(SIMD_LEN_8); + force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE); + src = GiLoadUint8(s0.data()); + GiStoreUint8(ret.data(), src); + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[i]); + } + + assert_eq(ret.data(), naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiLoadUzip0V3Uint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 255}; + GI_UINT8_t ret; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiLoadUzip0V3Uint8(s0.data()); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[i * 3]); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiLoadUzip1V3Uint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 255}; + GI_UINT8_t ret; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiLoadUzip1V3Uint8(s0.data()); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[i * 3 + 1]); + } + + 
assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiLoadUzip2V3Uint8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 255}; + GI_UINT8_t ret; + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiLoadUzip2V3Uint8(s0.data()); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8; i++) { + naive.push_back(s0[i * 3 + 2]); + } + + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiStoreZipUint8V3) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 255}; + GI_UINT8_t src0, src1, src2; + std::vector ret; + ret.resize(SIMD_LEN_8 * 3); + + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src2, GI_SIMD_LEN_BYTE); + src0 = GiLoadUzip0V3Uint8(s0.data()); + src1 = GiLoadUzip1V3Uint8(s0.data()); + src2 = GiLoadUzip2V3Uint8(s0.data()); + + GiStoreZipUint8V3(ret.data(), src0, src1, src2); + + std::vector naive; + for (size_t i = 0; i < SIMD_LEN_8 * 3; i++) { + naive.push_back(s0[i]); + } + assert_eq(ret.data(), naive, SIMD_LEN_8); +} + +TEST_F(FALLBACK, GiShiftRightInt16ToUint8) { + std::vector s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678, + 0x00001234, -0x00001234, 0x00000fff, -0x00000fff}; + GI_INT16_t src; + force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE); + src = GiLoadInt16(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_8); + GI_UINT8_t ret; +#define TEST_BLOCK(shift) \ + ret = GiShiftRightInt16ToUint8(src, shift); \ + for (size_t i = 0; i < SIMD_LEN_16; i++) { \ + uint8_t val = Saturate(s0[i] >> shift, 0, UINT8_MAX); \ + naive[i] = val; \ + naive[i + SIMD_LEN_16] = val; \ + } \ + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); + + TEST_BLOCK(1); + TEST_BLOCK(2); + TEST_BLOCK(3); + TEST_BLOCK(4); + TEST_BLOCK(5); + TEST_BLOCK(6); + TEST_BLOCK(7); + TEST_BLOCK(8); +#undef TEST_BLOCK +} + +TEST_F(FALLBACK, GiCombineInt16Low) { + std::vector s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678, + 0x00001234, -0x00001234, 0x00000fff, -0x00000fff}; + std::vector s1{1, 2, 3, -4, 5, -6, 7, -8}; + GI_INT16_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt16(s0.data()); + src1 = GiLoadInt16(s1.data()); + + std::vector naive; + naive.resize(SIMD_LEN_16); + GI_INT16_t ret = GiCombineInt16Low(src0, src1); + for (size_t i = 0; i < SIMD_LEN; i++) { + naive[i] = s0[i]; + naive[i + SIMD_LEN] = s1[i]; + } + assert_eq((int16_t*)&ret, naive, SIMD_LEN_16); +} +TEST_F(FALLBACK, GiCombineUint8Low) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + GI_UINT8_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadUint8(s0.data()); + src1 = GiLoadUint8(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_8); + GI_UINT8_t ret = GiCombineUint8Low(src0, src1); + for (size_t i = 0; i < SIMD_LEN_16; i++) { + naive[i] = s0[i]; + naive[i + SIMD_LEN_16] = s0[i]; + } + assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiZipV0Int8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + GI_INT8_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + 
force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt8(s0.data()); + src1 = GiLoadInt8(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_8); + GI_INT8_t ret = GiZipV0Int8(src0, src1); + for (size_t i = 0; i < SIMD_LEN_16; ++i) { + naive[2 * i] = s0[i]; + naive[2 * i + 1] = s0[i]; + } + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiZipV1Int8) { + std::vector s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + GI_INT8_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt8(s0.data()); + src1 = GiLoadInt8(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_8); + GI_INT8_t ret = GiZipV1Int8(src0, src1); + for (size_t i = 0; i < SIMD_LEN_16; ++i) { + naive[2 * i] = s0[i + SIMD_LEN_16]; + naive[2 * i + 1] = s0[i + SIMD_LEN_16]; + } + assert_eq((int8_t*)&ret, naive, SIMD_LEN_8); +} +TEST_F(FALLBACK, GiReinterpretInt8AsInt16) { + GI_INT8_t src0; + GI_INT16_t ret, naive; + std::vector s0{1, 2, -2, -1, INT8_MAX, INT8_MIN, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14}; + s0.resize(SIMD_LEN); + init((int8_t*)&src0, s0); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiReinterpretInt8AsInt16(src0); + memcpy(&naive, &src0, GI_SIMD_LEN_BYTE); + + ASSERT_FALSE(memcmp(&ret, &naive, GI_SIMD_LEN_BYTE)); +} +TEST_F(FALLBACK, GiZipV0Int16) { + std::vector s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678, + 0x00001234, -0x00001234, 0x00000fff, -0x00000fff}; + GI_INT16_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt16(s0.data()); + src1 = GiLoadInt16(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_16); + GI_INT16_t ret = GiZipV0Int16(src0, src1); + for (size_t i = 0; i < SIMD_LEN; i++) { + naive[2 * i] = s0[i]; + naive[2 * i + 1] = s0[i]; + } + assert_eq((int16_t*)&ret, naive, SIMD_LEN_16); +} +TEST_F(FALLBACK, GiZipV1Int16) { + std::vector s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678, + 0x00001234, -0x00001234, 0x00000fff, -0x00000fff}; + GI_INT16_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt16(s0.data()); + src1 = GiLoadInt16(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN_16); + GI_INT16_t ret = GiZipV1Int16(src0, src1); + for (size_t i = 0; i < SIMD_LEN; i++) { + naive[2 * i] = s0[i + SIMD_LEN]; + naive[2 * i + 1] = s0[i + SIMD_LEN]; + } + assert_eq((int16_t*)&ret, naive, SIMD_LEN_16); +} +TEST_F(FALLBACK, GiReinterpretInt16AsInt32) { + GI_INT16_t src0; + GI_INT32_t ret, naive; + std::vector s0{1, 2, -2, -1, INT16_MAX, INT16_MIN, 5, 6}; + s0.resize(SIMD_LEN); + init((int16_t*)&src0, s0); + + force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); + ret = GiReinterpretInt16AsInt32(src0); + memcpy(&naive, &src0, GI_SIMD_LEN_BYTE); + + ASSERT_FALSE(memcmp(&ret, &naive, GI_SIMD_LEN_BYTE)); +} +TEST_F(FALLBACK, GiZipV0Int32) { + std::vector s0{INT32_MAX, INT32_MIN, 0x00005678, -0x00005678}; + GI_INT32_t src0, src1; + force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE); + force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE); + src0 = GiLoadInt32(s0.data()); + src1 = GiLoadInt32(s0.data()); + + std::vector naive; + naive.resize(SIMD_LEN); + GI_INT32_t ret = GiZipV0Int32(src0, src1); + for (size_t i = 0; i < SIMD_LEN / 2; i++) { + naive[2 * i] = s0[i]; + naive[2 * i + 1] = s0[i]; + } + assert_eq((int32_t*)&ret, naive, SIMD_LEN); +} +TEST_F(FALLBACK, GiZipV1Int32) { + std::vector 
+    std::vector<int32_t> s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678};
+    GI_INT32_t src0, src1;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt32(s0.data());
+    src1 = GiLoadInt32(s0.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret = GiZipV1Int32(src0, src1);
+    for (size_t i = 0; i < SIMD_LEN / 2; i++) {
+        naive[2 * i] = s0[i + SIMD_LEN / 2];
+        naive[2 * i + 1] = s0[i + SIMD_LEN / 2];
+    }
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+}
+TEST_F(FALLBACK, GiCombineInt32Low) {
+    std::vector<int32_t> s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678};
+    GI_INT32_t src0, src1;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt32(s0.data());
+    src1 = GiLoadInt32(s0.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret = GiCombineInt32Low(src0, src1);
+    for (size_t i = 0; i < SIMD_LEN / 2; i++) {
+        naive[i] = s0[i];
+        naive[i + SIMD_LEN / 2] = s0[i];
+    }
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+}
+TEST_F(FALLBACK, GiCombineInt32High) {
+    std::vector<int32_t> s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678};
+    GI_INT32_t src0, src1;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt32(s0.data());
+    src1 = GiLoadInt32(s0.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret = GiCombineInt32High(src0, src1);
+    for (size_t i = 0; i < SIMD_LEN / 2; i++) {
+        naive[i] = s0[i + SIMD_LEN / 2];
+        naive[i + SIMD_LEN / 2] = s0[i + SIMD_LEN / 2];
+    }
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+}
+
+TEST_F(FALLBACK, GiStoreZipInt8V3) {
+    std::vector<int8_t> s0{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                           12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                           24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+                           36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 127};
+    GI_INT8_t src0, src1, src2;
+    GI_INT8_V3_t src;
+    std::vector<int8_t> ret;
+    ret.resize(SIMD_LEN_8 * 3);
+    force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE * 3);
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src2, GI_SIMD_LEN_BYTE);
+    src = GiLoadUzipInt8V3(s0.data());
+    src0 = GiGetSubVectorInt8V3(src, 0);
+    src1 = GiGetSubVectorInt8V3(src, 1);
+    src2 = GiGetSubVectorInt8V3(src, 2);
+
+    GiStoreZipInt8V3(ret.data(), src0, src1, src2);
+
+    std::vector<int8_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_8 * 3; i++) {
+        naive.push_back(s0[i]);
+    }
+    assert_eq(ret.data(), naive, SIMD_LEN_8);
+}
+
+TEST_F(FALLBACK, GiShiftRightInt32) {
+    std::vector<int32_t> s0{INT32_MAX, INT32_MIN, 0x12345678, -0x12345678};
+    GI_INT32_t src;
+    force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE);
+    src = GiLoadInt32(s0.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret;
+#define TEST_BLOCK(shift)                       \
+    ret = GiShiftRightInt32(src, shift);        \
+    for (size_t i = 0; i < SIMD_LEN; i++) {     \
+        naive[i] = s0[i] >> shift;              \
+    }                                           \
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+
+    TEST_BLOCK(1);
+    TEST_BLOCK(2);
+    TEST_BLOCK(3);
+    TEST_BLOCK(4);
+    TEST_BLOCK(5);
+    TEST_BLOCK(6);
+    TEST_BLOCK(7);
+    TEST_BLOCK(8);
+    TEST_BLOCK(9);
+    TEST_BLOCK(10);
+    TEST_BLOCK(11);
+    TEST_BLOCK(12);
+    TEST_BLOCK(13);
+    TEST_BLOCK(14);
+    TEST_BLOCK(15);
+    TEST_BLOCK(16);
+
+#undef TEST_BLOCK
+}
+TEST_F(FALLBACK, GiShiftLeftInt32) {
+    std::vector<int32_t> s0{INT32_MAX, INT32_MIN, 0x12345678, -0x12345678};
+    GI_INT32_t src;
+    force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE);
+    src = GiLoadInt32(s0.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret;
+#define TEST_BLOCK(shift)                       \
+    ret = GiShiftLeftInt32(src, shift);         \
+    for (size_t i = 0; i < SIMD_LEN; i++) {     \
+        naive[i] = s0[i] << shift;              \
+    }                                           \
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+
+    TEST_BLOCK(1);
+    TEST_BLOCK(2);
+    TEST_BLOCK(3);
+    TEST_BLOCK(4);
+    TEST_BLOCK(5);
+    TEST_BLOCK(6);
+    TEST_BLOCK(7);
+    TEST_BLOCK(8);
+    TEST_BLOCK(9);
+    TEST_BLOCK(10);
+    TEST_BLOCK(11);
+    TEST_BLOCK(12);
+    TEST_BLOCK(13);
+    TEST_BLOCK(14);
+    TEST_BLOCK(15);
+    TEST_BLOCK(16);
+
+#undef TEST_BLOCK
+}
+
+TEST_F(FALLBACK, GiBroadcastInt16) {
+    int16_t src0 = 5;
+    GI_INT16_t ret;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiBroadcastInt16(src0);
+
+    std::vector<int16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive.push_back(src0);
+    }
+
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+TEST_F(FALLBACK, GiAndInt16) {
+    std::vector<int16_t> s0{INT16_MAX,  INT16_MIN,  0x00005678, -0x00005678,
+                            0x00001234, -0x00001234, 0x00000fff, -0x00000fff};
+    GI_INT16_t src0, src1;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt16(s0.data());
+    src1 = GiLoadInt16(s0.data());
+
+    std::vector<int16_t> naive;
+    naive.resize(SIMD_LEN_16);
+    GI_INT16_t ret = GiAndInt16(src0, src1);
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive[i] = s0[i] & s0[i];
+    }
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+TEST_F(FALLBACK, GiCvtInt32ToInt16) {
+    std::vector<int32_t> s0{INT32_MAX, INT32_MIN, 0x12345678, -0x12345678};
+    GI_INT32_t src;
+    force_memset_ret((void*)&src, GI_SIMD_LEN_BYTE);
+    src = GiLoadInt32(s0.data());
+
+    std::vector<int16_t> naive;
+    naive.resize(SIMD_LEN_16);
+    GI_INT16_t ret;
+    ret = GiCvtInt32ToInt16(src);
+    //! the 4 saturated int16 values are expected in both halves of the result
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        int16_t val = Saturate(s0[i], INT16_MIN, INT16_MAX);
+        naive[i] = val;
+        naive[i + SIMD_LEN] = val;
+    }
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+
+TEST_F(FALLBACK, GiInterleave4Int8) {
+    std::vector<int8_t> s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    GI_INT8_t ret;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiLoadInt8(s0.data());
+    ret = GiInterleave4Int8(ret);
+
+    std::vector<int8_t> naive;
+    naive.resize(SIMD_LEN_8);
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive[i] = s0[i * 4];
+        naive[i + 4] = s0[i * 4 + 1];
+        naive[i + 2 * 4] = s0[i * 4 + 2];
+        naive[i + 3 * 4] = s0[i * 4 + 3];
+    }
+
+    assert_eq((int8_t*)&ret, naive, SIMD_LEN_8);
+}
+TEST_F(FALLBACK, GiCvtUint8toInt16Low) {
+    std::vector<uint8_t> s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127};
+    GI_INT16_t ret;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    GI_UINT8_t src = GiLoadUint8(s0.data());
+    ret = GiCvtUint8toInt16Low(src);
+    std::vector<int16_t> naive;
+    naive.resize(SIMD_LEN_16);
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive[i] = s0[i];
+    }
+
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+TEST_F(FALLBACK, GiCvtUint8toInt16High) {
+    std::vector<uint8_t> s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127};
+    GI_INT16_t ret;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    GI_UINT8_t src = GiLoadUint8(s0.data());
+    ret = GiCvtUint8toInt16High(src);
+    std::vector<int16_t> naive;
+    naive.resize(SIMD_LEN_16);
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive[i] = s0[i + SIMD_LEN_16];
+    }
+
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+TEST_F(FALLBACK, GiMultiplyAddInt16LongLow) {
+    GI_INT16_t src0, src1;
+    GI_INT32_t src2;
+    std::vector<int32_t> s1{1, 2, 3, 4};
+    std::vector<int16_t> s0{INT16_MAX,  INT16_MIN,  0x00005678, -0x00005678,
+                            0x00001234, -0x00001234, 0x00000fff, -0x00000fff};
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src2, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt16(s0.data());
+    src1 = GiLoadInt16(s0.data());
+    src2 = GiLoadInt32(s1.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret = GiMultiplyAddInt16LongLow(src2, src0, src1);
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive[i] = (int32_t)s1[i] + (int32_t)s0[i] * (int32_t)s0[i];
+    }
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+}
+TEST_F(FALLBACK, GiMultiplyAddInt16LongHigh) {
+    GI_INT16_t src0, src1;
+    GI_INT32_t src2;
+    std::vector<int32_t> s1{1, 2, 3, 4};
+    std::vector<int16_t> s0{INT16_MAX,  INT16_MIN,  0x00005678, -0x00005678,
+                            0x00001234, -0x00001234, 0x00000fff, -0x00000fff};
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src2, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt16(s0.data());
+    src1 = GiLoadInt16(s0.data());
+    src2 = GiLoadInt32(s1.data());
+
+    std::vector<int32_t> naive;
+    naive.resize(SIMD_LEN);
+    GI_INT32_t ret = GiMultiplyAddInt16LongHigh(src2, src0, src1);
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive[i] =
+                (int32_t)s1[i] + (int32_t)s0[i + SIMD_LEN] * (int32_t)s0[i + SIMD_LEN];
+    }
+    assert_eq((int32_t*)&ret, naive, SIMD_LEN);
+}
+TEST_F(FALLBACK, GiCvtFromInt32V4ToUint8) {
+    std::vector<int32_t> s0{INT16_MAX, INT16_MIN, 0x00005678, -0x00005678};
+    GI_INT32_t src0, src1, src2, src3;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt32(s0.data());
+    src1 = GiLoadInt32(s0.data());
+    src2 = GiLoadInt32(s0.data());
+    src3 = GiLoadInt32(s0.data());
+    GI_UINT8_t ret = GiCvtFromInt32V4ToUint8(src0, src1, src2, src3);
+    std::vector<uint8_t> naive;
+    naive.resize(SIMD_LEN_8);
+    //! every quarter of the result holds the saturated bytes of one input vector
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive[i] = Saturate(s0[i], 0, UINT8_MAX);
+        naive[i + SIMD_LEN] = Saturate(s0[i], 0, UINT8_MAX);
+        naive[i + 2 * SIMD_LEN] = Saturate(s0[i], 0, UINT8_MAX);
+        naive[i + 3 * SIMD_LEN] = Saturate(s0[i], 0, UINT8_MAX);
+    }
+
+    assert_eq((uint8_t*)&ret, naive, SIMD_LEN_8);
+}
+TEST_F(FALLBACK, GiSubtractInt16) {
+    std::vector<int16_t> s0{INT16_MAX,  INT16_MIN,  0x00005678, -0x00005678,
+                            0x00001234, -0x00001234, 0x00000fff, -0x00000fff};
+    GI_INT16_t src0, src1;
+    force_memset_ret((void*)&src0, GI_SIMD_LEN_BYTE);
+    force_memset_ret((void*)&src1, GI_SIMD_LEN_BYTE);
+    src0 = GiLoadInt16(s0.data());
+    src1 = GiLoadInt16(s0.data());
+
+    std::vector<int16_t> naive;
+    naive.resize(SIMD_LEN_16);
+    GI_INT16_t ret = GiSubtractInt16(src0, src1);
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive[i] = s0[i] - s0[i];
+    }
+    assert_eq((int16_t*)&ret, naive, SIMD_LEN_16);
+}
+
+TEST_F(FALLBACK, GiInterleave2UInt8) {
+    std::vector<uint8_t> s0{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    GI_UINT8_t ret;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiLoadUint8(s0.data());
+    ret = GiInterleave2Uint8(ret);
+
+    std::vector<int8_t> naive;
+    naive.resize(SIMD_LEN_8);
+    for (size_t i = 0; i < SIMD_LEN_16; i++) {
+        naive[i] = s0[2 * i];
+        naive[i + SIMD_LEN_16] = s0[2 * i + 1];
+    }
+
+    assert_eq((int8_t*)&ret, naive, SIMD_LEN_8);
+}
+
 } // namespace test
 } // namespace megdnn