From 45c0e40f7b55022ae51ec7c1aa0bf0891db3988f Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 31 Jan 2023 17:07:19 +0800
Subject: [PATCH] feat(dnn): add some fallback gi fp16 intrinsic

GitOrigin-RevId: 67f091d0b87982452287c0af98b89abf8f065809
---
 .../fallback/general_intrinsic/gi_common.h  |  31 ++
 .../fallback/general_intrinsic/gi_float16.h | 208 +++++++++
 dnn/test/fallback/gi.cpp                    | 427 ++++++++++++++++++
 .../riscv64-rvv-linux-gnu.toolchain.cmake   |   4 +-
 4 files changed, 668 insertions(+), 2 deletions(-)
 create mode 100644 dnn/src/fallback/general_intrinsic/gi_float16.h

diff --git a/dnn/src/fallback/general_intrinsic/gi_common.h b/dnn/src/fallback/general_intrinsic/gi_common.h
index 36b715ebd..df076993a 100644
--- a/dnn/src/fallback/general_intrinsic/gi_common.h
+++ b/dnn/src/fallback/general_intrinsic/gi_common.h
@@ -92,6 +92,13 @@
 #undef GI_RVV_INTRINSICS
 #endif

+//! GI fp16 is only supported on arm64 NEON and RVV
+#if (defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && \
+     MEGDNN_AARCH64) ||                                                     \
+        defined(GI_RVV_INTRINSICS)
+#define GI_SUPPORT_F16
+#endif
+
 //! general intrinsic support dynamic length simd, if avx or avx2 the simd
 //! length is 256
 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
@@ -127,6 +134,12 @@ enum GiSimdType {
     GI_RVV,
 };

+#if defined(GI_RVV_INTRINSICS)
+typedef float16_t gi_float16_t;
+#elif defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+typedef __fp16 gi_float16_t;
+#endif
+
 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
         defined(GI_FMA_INTRINSICS)
 #define __gi_simd_type GI_AVX
 typedef __m256 GI_FLOAT32_t;
 typedef __m256i GI_UINT32_t;
 #elif defined(GI_NEON_INTRINSICS)
 #define __gi_simd_type GI_NEON
 typedef float32x4_t GI_FLOAT32_t;
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+typedef float16x8_t GI_FLOAT16_t;
+typedef float16x8x2_t GI_FLOAT16_V2_t;
+#endif
 typedef uint8x16_t GI_UINT8_t;
 typedef int8x16_t GI_INT8_t;
 typedef int16x8_t GI_INT16_t;
@@ -287,6 +304,8 @@ typedef __m64_128 float32x2_t;
 #elif defined(GI_RVV_INTRINSICS)
 #define __gi_simd_type GI_RVV
 typedef vfloat32m1_t GI_FLOAT32_t;
+typedef vfloat16m1_t GI_FLOAT16_t;
+typedef vfloat16m1x2_t GI_FLOAT16_V2_t;
 typedef vuint8m1_t GI_UINT8_t;
 typedef vint8m1_t GI_INT8_t;
 typedef vint16m1_t GI_INT16_t;
@@ -423,6 +442,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
 #define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index)
 #define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index)

+#define GiGetSubVectorFloat16V2(s, index) vget_f16m1x2_f16m1(s, index)
+
 #define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index)
 #define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index)

@@ -437,6 +458,8 @@
 #define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s)
 #define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s)

+#define GiSetSubVectorFloat16V2(d, index, s) d = vset_f16m1x2(d, index, s)
+
 #define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s)
 #define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s)

@@ -578,6 +601,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
 #define GiGetSubVectorFloat32V3(s, index) s.val[index]
 #define GiGetSubVectorFloat32V4(s, index) s.val[index]

+#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define GiGetSubVectorFloat16V2(s, index) s.val[index]
+#endif
+
 #define GiGetSubVectorInt32V2(s, index) s.val[index]
 #define GiGetSubVectorInt32V4(s, index) s.val[index]

@@ -592,6 +619,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
 #define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s
 #define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s

+#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define GiSetSubVectorFloat16V2(d, index, s) d.val[index] = s
+#endif
+
 #define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s
 #define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s
diff --git a/dnn/src/fallback/general_intrinsic/gi_float16.h b/dnn/src/fallback/general_intrinsic/gi_float16.h
new file mode 100644
index 000000000..f85997208
--- /dev/null
+++ b/dnn/src/fallback/general_intrinsic/gi_float16.h
@@ -0,0 +1,208 @@
+#pragma once
+
+#include "gi_common.h"
+
+#if defined(GI_SUPPORT_F16)
+
+//! c + b * a
+#if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+#define v_fma_ps_f16(c, b, a) vfmaq_f16((c), (b), (a))
+#define v_fma_n_f16(c, b, a) vfmaq_n_f16((c), (b), (a))
+#else
+#define v_fma_ps_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), (a)))
+#define v_fma_n_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), vdupq_n_f16(a)))
+#endif
+#endif
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiBroadcastFloat16(gi_float16_t Value) {
+#if defined(GI_NEON_INTRINSICS)
+    return vdupq_n_f16(Value);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmv_v_f_f16m1(Value, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiLoadBroadcastFloat16(const gi_float16_t* Value) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld1q_dup_f16(Value);
+#elif defined(GI_RVV_INTRINSICS)
+    return GiBroadcastFloat16(*Value);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_V2_t GiCastFloat16ToFloat32(const GI_FLOAT16_t& fp16) {
+#if defined(GI_NEON_INTRINSICS)
+    GI_FLOAT32_V2_t ret;
+    GiSetSubVectorFloat32V2(ret, 0, vcvt_f32_f16(vget_low_f16(fp16)));
+    GiSetSubVectorFloat32V2(ret, 1, vcvt_f32_f16(vget_high_f16(fp16)));
+    return ret;
+#elif defined(GI_RVV_INTRINSICS)
+    GI_FLOAT32_V2_t ret;
+    vfloat32m2_t tmp =
+            vfwcvt_f_f_v_f32m2(fp16, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+    GiSetSubVectorFloat32V2(ret, 0, vget_v_f32m2_f32m1(tmp, 0));
+    GiSetSubVectorFloat32V2(ret, 1, vget_v_f32m2_f32m1(tmp, 1));
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiCastFloat32ToFloat16(const GI_FLOAT32_t& low, const GI_FLOAT32_t& high) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcombine_f16(vcvt_f16_f32(low), vcvt_f16_f32(high));
+#elif defined(GI_RVV_INTRINSICS)
+    vfloat32m2_t tmp = vfmv_v_f_f32m2(0.0, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+    tmp = vset_v_f32m1_f32m2(tmp, 0, low);
+    tmp = vset_v_f32m1_f32m2(tmp, 1, high);
+    return vfncvt_f_f_w_f16m1(tmp, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiZeroFloat16(void) {
+    return GiBroadcastFloat16(0.0);
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiLoadFloat16(const gi_float16_t* Buffer) {
+#if defined(GI_NEON_INTRINSICS)
+    return vld1q_f16(Buffer);
+#elif defined(GI_RVV_INTRINSICS)
+    return vle16_v_f16m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+//! return a + b * c
+GI_FORCEINLINE
+GI_FLOAT16_t GiMlaqFloat16(GI_FLOAT16_t a, GI_FLOAT16_t b, GI_FLOAT16_t c) {
+#if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_f16(a, b, c);
+#else
+    return vaddq_f16(a, vmulq_f16(b, c));
+#endif
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmadd_vv_f16m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+void GiStoreFloat16(gi_float16_t* Buffer, GI_FLOAT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    vst1q_f16(Buffer, Vector);
+#elif defined(GI_RVV_INTRINSICS)
+    vse16_v_f16m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiAddFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_f16(Vector1, Vector2);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfadd_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiSubtractFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_f16(Vector1, Vector2);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfsub_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMultiplyFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmulq_f16(Vector1, Vector2);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmul_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMultiplyScalerFloat16(GI_FLOAT16_t Vector1, gi_float16_t Scaler) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmulq_n_f16(Vector1, Scaler);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmul_vf_f16m1(Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMultiplyAddScalarFloat16(
+        GI_FLOAT16_t VectorSum, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
+#if defined(GI_NEON_INTRINSICS)
+    return v_fma_n_f16(VectorSum, Vector, Scalar);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmadd_vf_f16m1(
+            Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMultiplySubScalarFloat16(
+        GI_FLOAT16_t VectorSub, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_f16(VectorSub, vmulq_n_f16(Vector, Scalar));
+#elif defined(GI_RVV_INTRINSICS)
+    return vfnmsub_vf_f16m1(
+            Vector, Scalar, VectorSub, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMaximumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmaxq_f16(Vector1, Vector2);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmax_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT16_t GiMinimumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vminq_f16(Vector1, Vector2);
+#elif defined(GI_RVV_INTRINSICS)
+    return vfmin_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
+#endif
+}
+
+//! a + b * c[d]
+#if defined(GI_NEON_INTRINSICS)
+#define GiSimdFmaLaneFloat16(a, b, c, d) vfmaq_laneq_f16(a, b, c, d)
+#elif defined(GI_RVV_INTRINSICS)
+#define __rvv_fmaq_laneq_f16(__a, __b, __c, __lane)                            \
+    __extension__({                                                            \
+        gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)];               \
+        vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));        \
+        GI_FLOAT16_t __ret = vfmadd_vf_f16m1(                                  \
+                __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
+        __ret;                                                                 \
+    })
+#define GiSimdFmaLaneFloat16(a, b, c, d) __rvv_fmaq_laneq_f16(a, b, c, d)
+#endif
+
+//! a - b * v[lane]
+#if defined(GI_NEON_INTRINSICS)
+#define GiFmsqLaneQFloat16(a, b, v, lane) vfmsq_laneq_f16(a, b, v, lane)
+#elif defined(GI_RVV_INTRINSICS)
+#define __rvv_fmsq_lane_float16(__a, __b, __c, __lane)                         \
+    __extension__({                                                            \
+        gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)];               \
+        vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));        \
+        GI_FLOAT16_t __ret = vfnmsub_vf_f16m1(                                 \
+                __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
+        __ret;                                                                 \
+    })
+#define GiFmsqLaneQFloat16(a, b, c, d) __rvv_fmsq_lane_float16(a, b, c, d)
+#endif
+
+#endif
\ No newline at end of file
diff --git a/dnn/test/fallback/gi.cpp b/dnn/test/fallback/gi.cpp
index 391f02ce6..544677570 100644
--- a/dnn/test/fallback/gi.cpp
+++ b/dnn/test/fallback/gi.cpp
@@ -8,6 +8,7 @@ class FALLBACK : public ::testing::Test {};
 #endif

 #include "src/fallback/general_intrinsic/gi_float.h"
+#include "src/fallback/general_intrinsic/gi_float16.h"
 #include "src/fallback/general_intrinsic/gi_int.h"

 namespace megdnn {
@@ -16,6 +17,9 @@ namespace test {
 #define SIMD_LEN GI_SIMD_LEN_BYTE / sizeof(float)
 #define SIMD_LEN_16 GI_SIMD_LEN_BYTE / sizeof(int16_t)
 #define SIMD_LEN_8 GI_SIMD_LEN_BYTE / sizeof(int8_t)
+#if defined(GI_SUPPORT_F16)
+#define SIMD_LEN_F16 GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)
+#endif
 template <class T>
 static void init(
         T* dst, const std::vector<T>& value, const size_t simd_len = SIMD_LEN) {
@@ -975,6 +979,23 @@ TEST_F(FALLBACK, GiBroadcastFloat32) {
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiBroadcastFloat16) {
+    GI_FLOAT16_t ret;
+    gi_float16_t b = 3672.8932;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiBroadcastFloat16(b);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(b);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiBroadcastInt32) {
     GI_INT32_t ret;
     int32_t b = 20220420;
@@ -1139,6 +1160,50 @@ TEST_F(FALLBACK, GiCastToFloat32) {
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiCastFloat16ToFloat32) {
+    GI_FLOAT16_t src0;
+    GI_FLOAT32_V2_t ret;
+    std::vector<gi_float16_t> s0{10.34, 32.543, 0.03, 4.76, 89.43, 19.32, 0.0, 78.41};
+    s0.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiCastFloat16ToFloat32(src0);
+
+    std::vector<float> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back((float)s0[i]);
+    }
+
+    assert_eq((float*)&ret, naive, SIMD_LEN_F16);
+}
+
+TEST_F(FALLBACK, GiCastFloat32ToFloat16) {
+    GI_FLOAT32_t src0, src1;
+    GI_FLOAT16_t ret;
+    std::vector<float> s0{10.34, 32.543, 0.03, 4.76};
+    std::vector<float> s1{89.43, 19.32, 0.0, 78.41};
+    s0.resize(SIMD_LEN);
+    s1.resize(SIMD_LEN);
+    init((float*)&src0, s0);
+    init((float*)&src1, s1);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiCastFloat32ToFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive.push_back((gi_float16_t)s0[i]);
+    }
+    for (size_t i = 0; i < SIMD_LEN; i++) {
+        naive.push_back((gi_float16_t)s1[i]);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiLoadBroadcastFloat32) {
     GI_FLOAT32_t ret;
     float p = 2022.0420;
@@ -1154,6 +1219,23 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiLoadBroadcastFloat16) {
+    GI_FLOAT16_t ret;
+    gi_float16_t p = 4327.3187;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiLoadBroadcastFloat16(&p);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(p);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiZeroFloat32) {
     GI_FLOAT32_t ret;
     force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
@@ -1170,6 +1252,24 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiZeroFloat16) {
+    GI_FLOAT16_t ret;
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    gi_float16_t p = 0.0;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiZeroFloat16();
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(p);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiLoadFloat32) {
     GI_FLOAT32_t ret;
     std::vector<float> s0{2.3f, 4.7f, -1.4f, 1223.6f};
@@ -1186,6 +1286,25 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiLoadFloat16) {
+    GI_FLOAT16_t ret;
+
+    std::vector<gi_float16_t> s0{2.3, 4.7, -1.4, 1223.6, 2346.896, 1.23, -908.32, 3.2};
+    s0.resize(SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiLoadFloat16(s0.data());
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i]);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiLoadFloat32V2) {
     GI_FLOAT32_V2_t ret;
     std::vector<float> s0{2.3f, 4.7f, -1.4f, 1223.6f, 1.1f, 4.0f, 99.7f, 1234.9f};
@@ -1245,6 +1364,32 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMlaqFloat16) {
+    GI_FLOAT16_t src0, src1, src2, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,   -12.8,
+                                 12.33,  88.43,   -11.54, 2.3};
+    std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    s2.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+    init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMlaqFloat16(src0, src1, src2);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] + (s1[i] * s2[i]));
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiUzpqFloat32) {
     GI_FLOAT32_t src0, src1;
     GI_FLOAT32_V2_t ret;
@@ -1560,6 +1705,20 @@ TEST_F(FALLBACK, GiStoreFloat32) {
     assert_eq(ret.data(), s0);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiStoreFloat16) {
+    GI_FLOAT16_t src0;
+    std::vector<gi_float16_t> s0{2.3, 4.7, -1.4, 1223.6, 2346.896, 1.23, -908.32, 3.2};
+    s0.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    std::vector<gi_float16_t> ret{0};
+    ret.resize(SIMD_LEN_F16);
+
+    GiStoreFloat16(ret.data(), src0);
+    assert_eq(ret.data(), s0, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiStoreFloat32V2) {
     GI_FLOAT32_V2_t src0;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, -1.1f, -2.2f, -3.5f, -4.9};
@@ -1699,6 +1858,30 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiAddFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,    -12.8,
+                                 23.56,  79.432,  478.432, 439.21};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiAddFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] + s1[i]);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiSubtractFloat32) {
     GI_FLOAT32_t src0, src1, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1720,6 +1903,30 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiSubtractFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,    -12.8,
+                                 23.56,  79.432,  478.432, 439.21};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiSubtractFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] - s1[i]);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMultiplyFloat32) {
     GI_FLOAT32_t src0, src1, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1741,6 +1948,30 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMultiplyFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,    -12.8,
+                                 23.56,  79.432,  478.432, 439.21};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMultiplyFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] * s1[i]);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMultiplyScalerFloat32) {
     GI_FLOAT32_t src0, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1761,6 +1992,28 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMultiplyScalerFloat16) {
+    GI_FLOAT16_t src0, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
+    s0.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+
+    gi_float16_t scalar = 3.1415;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMultiplyScalerFloat16(src0, scalar);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] * scalar);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMultiplyAddFloat32) {
     GI_FLOAT32_t src0, src1, src2, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1808,6 +2061,31 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMultiplyAddScalarFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
+    std::vector<gi_float16_t> s1{3.54, 34.2, 7.652, 4.9, 2.154, 5.432, 4.783, 4.326};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    gi_float16_t scalar = 3.1415;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMultiplyAddScalarFloat16(src1, src0, scalar);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s1[i] + s0[i] * scalar);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMultiplySubScalarFloat32) {
     GI_FLOAT32_t src0, src1, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1831,6 +2109,31 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMultiplySubScalarFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
+    std::vector<gi_float16_t> s1{3.54, 34.2, 7.652, 4.9, 2.154, 5.432, 4.783, 4.3};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    gi_float16_t scalar = 3.1415;
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMultiplySubScalarFloat16(src0, src1, scalar);
+
+    std::vector<gi_float16_t> naive;
+
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(s0[i] - s1[i] * scalar);
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMultiplyAddLanXXFloat32) {
     GI_FLOAT32_t src0, src1, src2, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -2169,6 +2472,28 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMaximumFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -3.3, 1.12, 2.75, -6.23};
+    std::vector<gi_float16_t> s1{3.54, -34.2, 7.652, 4.9, 2.154, -5.432, 4.783, 4.326};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMaximumFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(Max(s0[i], s1[i]));
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMinimumFloat32) {
     GI_FLOAT32_t src0, src1, ret;
     std::vector<float> s0{1.1f, 2.2f, 4.5f, 4.9f};
@@ -2189,6 +2514,28 @@
     assert_eq((float*)&ret, naive);
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiMinimumFloat16) {
+    GI_FLOAT16_t src0, src1, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -3.3, 1.12, 2.75, -6.23};
+    std::vector<gi_float16_t> s1{3.54, -34.2, 7.652, 4.9, 2.154, -5.432, 4.783, 4.326};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
+    ret = GiMinimumFloat16(src0, src1);
+
+    std::vector<gi_float16_t> naive;
+    for (size_t i = 0; i < SIMD_LEN_F16; i++) {
+        naive.push_back(Min(s0[i], s1[i]));
+    }
+
+    assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+}
+#endif
+
 TEST_F(FALLBACK, GiMaxNanFloat32) {
     GI_FLOAT32_t src0, src1, ret;
     std::vector<float> s0{1.1f, 2.2f, 4.5f, NAN};
@@ -2461,6 +2808,46 @@ TEST_F(FALLBACK, GiSimdFmaLane) {
 #undef CB
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiSimdFmaLaneFloat16) {
+    GI_FLOAT16_t src0, src1, src2, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,   -12.8,
+                                 12.33,  88.43,   -11.54, 2.3};
+    std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    s2.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+    init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
+
+    std::vector<gi_float16_t> naive = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    auto compare = [&](const size_t n) {
+        for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(gi_float16_t); i++) {
+            naive[i] = s0[i] + (s1[i] * s2[n]);
+        }
+        assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+    };
+
+#define CB(n)                                                \
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);         \
+    ret = GiSimdFmaLaneFloat16(src0, src1, src2, n);         \
+    compare(n);
+
+    CB(0)
+    CB(1)
+    CB(2)
+    CB(3)
+    CB(4)
+    CB(5)
+    CB(6)
+    CB(7)
+#undef CB
+}
+#endif
+
 TEST_F(FALLBACK, GiMlaqLowLaneFloat32) {
     GI_FLOAT32_t src0, src1, src2, ret;
     std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -2556,6 +2943,46 @@ TEST_F(FALLBACK, GiFmsqLaneQFloat32) {
 #undef CB
 }

+#if defined(GI_SUPPORT_F16)
+TEST_F(FALLBACK, GiFmsqLaneQFloat16) {
+    GI_FLOAT16_t src0, src1, src2, ret;
+    std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
+    std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59,   -12.8,
+                                 12.33,  88.43,   -11.54, 2.3};
+    std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
+    s0.resize(SIMD_LEN_F16);
+    s1.resize(SIMD_LEN_F16);
+    s2.resize(SIMD_LEN_F16);
+    init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
+    init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
+    init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
+
+    std::vector<gi_float16_t> naive = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    auto compare = [&](const size_t n) {
+        for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(gi_float16_t); i++) {
+            naive[i] = s0[i] - (s1[i] * s2[n]);
+        }
+        assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
+    };
+
+#define CB(n)                                                \
+    force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);         \
+    ret = GiFmsqLaneQFloat16(src0, src1, src2, n);           \
+    compare(n);
+
+    CB(0)
+    CB(1)
+    CB(2)
+    CB(3)
+    CB(4)
+    CB(5)
+    CB(6)
+    CB(7)
+#undef CB
+}
+#endif
+
 TEST_F(FALLBACK, GiBroadcastUint32) {
     int32_t src0 = 20220422;
     GI_UINT32_t ret;
diff --git a/toolchains/riscv64-rvv-linux-gnu.toolchain.cmake b/toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
index 130799883..620a7a423 100644
--- a/toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
+++ b/toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
@@ -14,9 +14,9 @@ set(RISCV_TOOLCHAIN_ROOT
 set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc")
 set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++")

-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7 -mabi=lp64d")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7_zfh -mabi=lp64d")
 set(CMAKE_CXX_FLAGS
-    "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7 -mabi=lp64d -Wno-error=attributes")
+    "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7_zfh -mabi=lp64d -Wno-error=attributes")
 set(CMAKE_FIND_ROOT_PATH "${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-linux-gnu")
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-- 
GitLab