Commit 45c0e40f authored by Megvii Engine Team

feat(dnn): add some fallback gi fp16 intrinsics

GitOrigin-RevId: 67f091d0b87982452287c0af98b89abf8f065809
Parent 5a8ab1f3
@@ -92,6 +92,13 @@
#undef GI_RVV_INTRINSICS
#endif
//! GI fp16 is only supported on arm64 NEON and RVV
#if (defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && \
MEGDNN_AARCH64) || \
defined(GI_RVV_INTRINSICS)
#define GI_SUPPORT_F16
#endif
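
A minimal usage sketch (not part of this diff): downstream kernels gate their fp16 paths on GI_SUPPORT_F16. `relu_fp16` below is a hypothetical consumer built only from functions added later in this commit, with tail elements left to a scalar loop.

#if defined(GI_SUPPORT_F16)
//! sketch only: clamp negatives to zero; `relu_fp16` is hypothetical
static inline void relu_fp16(gi_float16_t* dst, const gi_float16_t* src, size_t n) {
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(gi_float16_t);
    GI_FLOAT16_t zero = GiZeroFloat16();
    size_t i = 0;
    for (; i + step <= n; i += step)
        GiStoreFloat16(dst + i, GiMaximumFloat16(GiLoadFloat16(src + i), zero));
    for (; i < n; ++i)
        dst[i] = src[i] > 0 ? src[i] : (gi_float16_t)0;  //! scalar tail
}
#endif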
//! general intrinsic support dynamic length simd, if avx or avx2 the simd
//! length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
@@ -127,6 +134,12 @@ enum GiSimdType {
GI_RVV,
};
#if defined(GI_RVV_INTRINSICS)
typedef float16_t gi_float16_t;
#elif defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef __fp16 gi_float16_t;
#endif
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
defined(GI_FMA_INTRINSICS)
#define __gi_simd_type GI_AVX
@@ -139,6 +152,10 @@ typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
#define __gi_simd_type GI_NEON
typedef float32x4_t GI_FLOAT32_t;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef float16x8_t GI_FLOAT16_t;
typedef float16x8x2_t GI_FLOAT16_V2_t;
#endif
typedef uint8x16_t GI_UINT8_t;
typedef int8x16_t GI_INT8_t;
typedef int16x8_t GI_INT16_t;
@@ -287,6 +304,8 @@ typedef __m64_128 float32x2_t;
#elif defined(GI_RVV_INTRINSICS)
#define __gi_simd_type GI_RVV
typedef vfloat32m1_t GI_FLOAT32_t;
typedef vfloat16m1_t GI_FLOAT16_t;
typedef vfloat16m1x2_t GI_FLOAT16_V2_t;
typedef vuint8m1_t GI_UINT8_t;
typedef vint8m1_t GI_INT8_t;
typedef vint16m1_t GI_INT16_t;
@@ -423,6 +442,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index)
#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index)
#define GiGetSubVectorFloat16V2(s, index) vget_f16m1x2_f16m1(s, index)
#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index)
#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index)
@@ -437,6 +458,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s)
#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s)
#define GiSetSubVectorFloat16V2(d, index, s) d = vset_f16m1x2(d, index, s)
#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s)
#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s)
@@ -578,6 +601,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorFloat32V3(s, index) s.val[index]
#define GiGetSubVectorFloat32V4(s, index) s.val[index]
#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiGetSubVectorFloat16V2(s, index) s.val[index]
#endif
#define GiGetSubVectorInt32V2(s, index) s.val[index]
#define GiGetSubVectorInt32V4(s, index) s.val[index]
@@ -592,6 +619,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s
#define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s
#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiSetSubVectorFloat16V2(d, index, s) d.val[index] = s
#endif
#define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s
#define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s
......
#pragma once
#include "gi_common.h"
#if defined(GI_SUPPORT_F16)
//! c + b * a
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
#define v_fma_ps_f16(c, b, a) vfmaq_f16((c), (b), (a))
#define v_fma_n_f16(c, b, a) vfmaq_n_f16((c), (b), (a))
#else
#define v_fma_ps_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), (a)))
#define v_fma_n_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), vdupq_n_f16(a)))
#endif
#endif
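
Both expansions above compute c + b * a per lane; the FMA form fuses multiply and add into a single rounding step. A hedged, NEON-only illustration (`demo_fma_n` is hypothetical, not part of this commit):

#if defined(GI_NEON_INTRINSICS)
//! sketch: each lane computes acc + x * 3.0
static inline GI_FLOAT16_t demo_fma_n(GI_FLOAT16_t acc, GI_FLOAT16_t x) {
    return v_fma_n_f16(acc, x, (gi_float16_t)3.0);
}
#endif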
GI_FORCEINLINE
GI_FLOAT16_t GiBroadcastFloat16(gi_float16_t Value) {
#if defined(GI_NEON_INTRINSICS)
return vdupq_n_f16(Value);
#elif defined(GI_RVV_INTRINSICS)
return vfmv_v_f_f16m1(Value, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiLoadBroadcastFloat16(const gi_float16_t* Value) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_dup_f16(Value);
#elif defined(GI_RVV_INTRINSICS)
return GiBroadcastFloat16(*Value);
#endif
}
GI_FORCEINLINE
GI_FLOAT32_V2_t GiCastFloat16ToFloat32(const GI_FLOAT16_t& fp16) {
#if defined(GI_NEON_INTRINSICS)
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vcvt_f32_f16(vget_low_f16(fp16)));
GiSetSubVectorFloat32V2(ret, 1, vcvt_f32_f16(vget_high_f16(fp16)));
return ret;
#elif defined(GI_RVV_INTRINSICS)
GI_FLOAT32_V2_t ret;
vfloat32m2_t tmp =
vfwcvt_f_f_v_f32m2(fp16, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
GiSetSubVectorFloat32V2(ret, 0, vget_v_f32m2_f32m1(tmp, 0));
GiSetSubVectorFloat32V2(ret, 1, vget_v_f32m2_f32m1(tmp, 1));
return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiCastFloat32ToFloat16(const GI_FLOAT32_t& low, const GI_FLOAT32_t& high) {
#if defined(GI_NEON_INTRINSICS)
return vcombine_f16(vcvt_f16_f32(low), vcvt_f16_f32(high));
#elif defined(GI_RVV_INTRINSICS)
vfloat32m2_t tmp = vfmv_v_f_f32m2(0.0, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
tmp = vset_v_f32m1_f32m2(tmp, 0, low);
tmp = vset_v_f32m1_f32m2(tmp, 1, high);
return vfncvt_f_f_w_f16m1(tmp, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
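
A hedged round-trip sketch for the two cast helpers above: widen eight halves into two float32 vectors, then narrow back. `cast_roundtrip` is hypothetical and assumes GiGetSubVectorFloat32V2 from gi_common.h:

//! sketch only; GiLoadFloat16/GiStoreFloat16 are defined just below
static inline void cast_roundtrip(const gi_float16_t* src, gi_float16_t* dst) {
    GI_FLOAT16_t h = GiLoadFloat16(src);
    GI_FLOAT32_V2_t w = GiCastFloat16ToFloat32(h);
    GI_FLOAT16_t back = GiCastFloat32ToFloat16(
            GiGetSubVectorFloat32V2(w, 0), GiGetSubVectorFloat32V2(w, 1));
    GiStoreFloat16(dst, back);
}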
GI_FORCEINLINE
GI_FLOAT16_t GiZeroFloat16(void) {
return GiBroadcastFloat16(0.0);
}
GI_FORCEINLINE
GI_FLOAT16_t GiLoadFloat16(const gi_float16_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
return vld1q_f16(Buffer);
#elif defined(GI_RVV_INTRINSICS)
return vle16_v_f16m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! return a + b * c
GI_FORCEINLINE
GI_FLOAT16_t GiMlaqFloat16(GI_FLOAT16_t a, GI_FLOAT16_t b, GI_FLOAT16_t c) {
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
return vfmaq_f16(a, b, c);
#else
return vaddq_f16(a, vmulq_f16(b, c));
#endif
#elif defined(GI_RVV_INTRINSICS)
return vfmadd_vv_f16m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
void GiStoreFloat16(gi_float16_t* Buffer, GI_FLOAT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
vst1q_f16(Buffer, Vector);
#elif defined(GI_RVV_INTRINSICS)
vse16_v_f16m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiAddFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vaddq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfadd_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiSubtractFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfsub_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmul_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyScalerFloat16(GI_FLOAT16_t Vector1, gi_float16_t Scaler) {
#if defined(GI_NEON_INTRINSICS)
return vmulq_n_f16(Vector1, Scaler);
#elif defined(GI_RVV_INTRINSICS)
return vfmul_vf_f16m1(Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyAddScalarFloat16(
GI_FLOAT16_t VectorSum, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
#if defined(GI_NEON_INTRINSICS)
return v_fma_n_f16(VectorSum, Vector, Scalar);
#elif defined(GI_RVV_INTRINSICS)
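    //! rvv vfmadd_vf: vd = (vd * rs1) + vs2, here Vector * Scalar + VectorSum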
return vfmadd_vf_f16m1(
Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplySubScalarFloat16(
GI_FLOAT16_t VectorSub, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
#if defined(GI_NEON_INTRINSICS)
return vsubq_f16(VectorSub, vmulq_n_f16(Vector, Scalar));
#elif defined(GI_RVV_INTRINSICS)
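    //! rvv vfnmsub_vf: vd = -(vd * rs1) + vs2, here VectorSub - Vector * Scalar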
return vfnmsub_vf_f16m1(
Vector, Scalar, VectorSub, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMaximumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vmaxq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmax_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
GI_FORCEINLINE
GI_FLOAT16_t GiMinimumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
return vminq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
return vfmin_vv_f16m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! a + b * c[d]
#if defined(GI_NEON_INTRINSICS)
#define GiSimdFmaLaneFloat16(a, b, c, d) vfmaq_laneq_f16(a, b, c, d)
#elif defined(GI_RVV_INTRINSICS)
#define __rvv_fmaq_laneq_f16(__a, __b, __c, __lane) \
__extension__({ \
gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)]; \
vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
GI_FLOAT16_t __ret = vfmadd_vf_f16m1( \
__b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
__ret; \
})
#define GiSimdFmaLaneFloat16(a, b, c, d) __rvv_fmaq_laneq_f16(a, b, c, d)
#endif
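
The lane form broadcasts one element of c: with lane 2, each lane computes a[i] + b[i] * c[2]. On NEON the lane index must be a compile-time constant. A hedged sketch (`demo_fma_lane` is hypothetical):

//! sketch: ret[i] = a[i] + b[i] * c[2]
static inline GI_FLOAT16_t demo_fma_lane(
        GI_FLOAT16_t a, GI_FLOAT16_t b, GI_FLOAT16_t c) {
    return GiSimdFmaLaneFloat16(a, b, c, 2);
}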
//! a - b * v[lane]
#if defined(GI_NEON_INTRINSICS)
#define GiFmsqLaneQFloat16(a, b, v, lane) vfmsq_laneq_f16(a, b, v, lane)
#elif defined(GI_RVV_INTRINSICS)
#define __rvv_fmsq_lane_float16(__a, __b, __c, __lane) \
__extension__({ \
gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)]; \
vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
GI_FLOAT16_t __ret = vfnmsub_vf_f16m1( \
__b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
__ret; \
})
#define GiFmsqLaneQFloat16(a, b, c, d) __rvv_fmsq_lane_float16(a, b, c, d)
#endif
#endif
\ No newline at end of file
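
A minimal sketch of how these primitives compose into a kernel, mirroring what the tests below exercise (`axpy_fp16` and its scalar tail are assumptions, not part of this commit):

//! sketch: y[i] += alpha * x[i], vector body plus scalar tail
#if defined(GI_SUPPORT_F16)
static inline void axpy_fp16(
        gi_float16_t* y, const gi_float16_t* x, gi_float16_t alpha, size_t n) {
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(gi_float16_t);
    size_t i = 0;
    for (; i + step <= n; i += step) {
        GI_FLOAT16_t acc = GiLoadFloat16(y + i);
        acc = GiMultiplyAddScalarFloat16(acc, GiLoadFloat16(x + i), alpha);
        GiStoreFloat16(y + i, acc);
    }
    for (; i < n; ++i)
        y[i] = y[i] + alpha * x[i];  //! scalar tail
}
#endif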
@@ -8,6 +8,7 @@ class FALLBACK : public ::testing::Test {};
#endif
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_float16.h"
#include "src/fallback/general_intrinsic/gi_int.h"
namespace megdnn {
@@ -16,6 +17,9 @@ namespace test {
#define SIMD_LEN GI_SIMD_LEN_BYTE / sizeof(float)
#define SIMD_LEN_16 GI_SIMD_LEN_BYTE / sizeof(int16_t)
#define SIMD_LEN_8 GI_SIMD_LEN_BYTE / sizeof(int8_t)
#if defined(GI_SUPPORT_F16)
#define SIMD_LEN_F16 GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)
#endif
template <typename T>
static void init(
T* dst, const std::vector<T>& value, const size_t simd_len = SIMD_LEN) {
@@ -975,6 +979,23 @@ TEST_F(FALLBACK, GiBroadcastFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiBroadcastFloat16) {
GI_FLOAT16_t ret;
gi_float16_t b = 3672.8932;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiBroadcastFloat16(b);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(b);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiBroadcastInt32) {
GI_INT32_t ret;
int32_t b = 20220420;
@@ -1139,6 +1160,50 @@ TEST_F(FALLBACK, GiCastToFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiCastFloat16ToFloat32) {
GI_FLOAT16_t src0;
GI_FLOAT32_V2_t ret;
std::vector<gi_float16_t> s0{10.34, 32.543, 0.03, 4.76, 89.43, 19.32, 0.0, 78.41};
s0.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiCastFloat16ToFloat32(src0);
std::vector<float> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back((float)s0[i]);
}
assert_eq((float*)&ret, naive, SIMD_LEN_F16);
}
TEST_F(FALLBACK, GiCastFloat32ToFloat16) {
GI_FLOAT32_t src0, src1;
GI_FLOAT16_t ret;
std::vector<float> s0{10.34, 32.543, 0.03, 4.76};
std::vector<float> s1{89.43, 19.32, 0.0, 78.41};
s0.resize(SIMD_LEN);
s1.resize(SIMD_LEN);
init((float*)&src0, s0);
init((float*)&src1, s1);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiCastFloat32ToFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN; i++) {
naive.push_back((gi_float16_t)s0[i]);
}
for (size_t i = 0; i < SIMD_LEN; i++) {
naive.push_back((gi_float16_t)s1[i]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiLoadBroadcastFloat32) {
GI_FLOAT32_t ret;
float p = 2022.0420;
@@ -1154,6 +1219,23 @@ TEST_F(FALLBACK, GiLoadBroadcastFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiLoadBroadcastFloat16) {
GI_FLOAT16_t ret;
gi_float16_t p = 4327.3187;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiLoadBroadcastFloat16(&p);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(p);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiZeroFloat32) {
GI_FLOAT32_t ret;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
@@ -1170,6 +1252,24 @@ TEST_F(FALLBACK, GiZeroFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiZeroFloat16) {
GI_FLOAT16_t ret;
gi_float16_t p = 0.0;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiZeroFloat16();
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(p);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiLoadFloat32) {
GI_FLOAT32_t ret;
std::vector<float> s0{2.3f, 4.7f, -1.4f, 1223.6f};
@@ -1186,6 +1286,25 @@ TEST_F(FALLBACK, GiLoadFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiLoadFloat16) {
GI_FLOAT16_t ret;
std::vector<gi_float16_t> s0{2.3, 4.7, -1.4, 1223.6, 2346.896, 1.23, -908.32, 3.2};
s0.resize(SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiLoadFloat16(s0.data());
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiLoadFloat32V2) {
GI_FLOAT32_V2_t ret;
std::vector<float> s0{2.3f, 4.7f, -1.4f, 1223.6f, 1.1f, 4.0f, 99.7f, 1234.9f};
@@ -1245,6 +1364,32 @@ TEST_F(FALLBACK, GiMlaqFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMlaqFloat16) {
GI_FLOAT16_t src0, src1, src2, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
12.33, 88.43, -11.54, 2.3};
std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
s2.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMlaqFloat16(src0, src1, src2);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] + (s1[i] * s2[i]));
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiUzpqFloat32) {
GI_FLOAT32_t src0, src1;
GI_FLOAT32_V2_t ret;
@@ -1560,6 +1705,20 @@ TEST_F(FALLBACK, GiStoreFloat32) {
assert_eq(ret.data(), s0);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiStoreFloat16) {
GI_FLOAT16_t src0;
std::vector<gi_float16_t> s0{2.3, 4.7, -1.4, 1223.6, 2346.896, 1.23, -908.32, 3.2};
s0.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
std::vector<gi_float16_t> ret{0};
ret.resize(SIMD_LEN_F16);
GiStoreFloat16(ret.data(), src0);
assert_eq(ret.data(), s0, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiStoreFloat32V2) {
GI_FLOAT32_V2_t src0;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f, -1.1f, -2.2f, -3.5f, -4.9};
@@ -1699,6 +1858,30 @@ TEST_F(FALLBACK, GiAddFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiAddFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
23.56, 79.432, 478.432, 439.21};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiAddFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] + s1[i]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiSubtractFloat32) {
GI_FLOAT32_t src0, src1, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1720,6 +1903,30 @@ TEST_F(FALLBACK, GiSubtractFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiSubtractFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
23.56, 79.432, 478.432, 439.21};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiSubtractFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] - s1[i]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMultiplyFloat32) {
GI_FLOAT32_t src0, src1, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1741,6 +1948,30 @@ TEST_F(FALLBACK, GiMultiplyFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMultiplyFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -4.5, 2.3, 4.5, 1.45};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
23.56, 79.432, 478.432, 439.21};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMultiplyFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] * s1[i]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMultiplyScalerFloat32) {
GI_FLOAT32_t src0, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1761,6 +1992,28 @@ TEST_F(FALLBACK, GiMultiplyScalerFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMultiplyScalerFloat16) {
GI_FLOAT16_t src0, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
s0.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
gi_float16_t scalar = 3.1415;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMultiplyScalerFloat16(src0, scalar);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] * scalar);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMultiplyAddFloat32) {
GI_FLOAT32_t src0, src1, src2, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1808,6 +2061,31 @@ TEST_F(FALLBACK, GiMultiplyAddScalarFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMultiplyAddScalarFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
std::vector<gi_float16_t> s1{3.54, 34.2, 7.652, 4.9, 2.154, 5.432, 4.783, 4.326};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
gi_float16_t scalar = 3.1415;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMultiplyAddScalarFloat16(src1, src0, scalar);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s1[i] + s0[i] * scalar);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMultiplySubScalarFloat32) {
GI_FLOAT32_t src0, src1, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -1831,6 +2109,31 @@ TEST_F(FALLBACK, GiMultiplySubScalarFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMultiplySubScalarFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 3.3, 1.12, 2.75, 6.23};
std::vector<gi_float16_t> s1{3.54, 34.2, 7.652, 4.9, 2.154, 5.432, 4.783, 4.3};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
gi_float16_t scalar = 3.1415;
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMultiplySubScalarFloat16(src0, src1, scalar);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(s0[i] - s1[i] * scalar);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMultiplyAddLanXXFloat32) {
GI_FLOAT32_t src0, src1, src2, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -2169,6 +2472,28 @@ TEST_F(FALLBACK, GiMaximumFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMaximumFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -3.3, 1.12, 2.75, -6.23};
std::vector<gi_float16_t> s1{3.54, -34.2, 7.652, 4.9, 2.154, -5.432, 4.783, 4.326};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMaximumFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(Max(s0[i], s1[i]));
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMinimumFloat32) {
GI_FLOAT32_t src0, src1, ret;
std::vector<float> s0{1.1f, 2.2f, 4.5f, 4.9f};
@@ -2189,6 +2514,28 @@ TEST_F(FALLBACK, GiMinimumFloat32) {
assert_eq((float*)&ret, naive);
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiMinimumFloat16) {
GI_FLOAT16_t src0, src1, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, -3.3, 1.12, 2.75, -6.23};
std::vector<gi_float16_t> s1{3.54, -34.2, 7.652, 4.9, 2.154, -5.432, 4.783, 4.326};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE);
ret = GiMinimumFloat16(src0, src1);
std::vector<gi_float16_t> naive;
for (size_t i = 0; i < SIMD_LEN_F16; i++) {
naive.push_back(Min(s0[i], s1[i]));
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
}
#endif
TEST_F(FALLBACK, GiMaxNanFloat32) {
GI_FLOAT32_t src0, src1, ret;
std::vector<float> s0{1.1f, 2.2f, 4.5f, NAN};
@@ -2461,6 +2808,46 @@ TEST_F(FALLBACK, GiSimdFmaLane) {
#undef CB
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiSimdFmaLaneFloat16) {
GI_FLOAT16_t src0, src1, src2, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
12.33, 88.43, -11.54, 2.3};
std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
s2.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
std::vector<gi_float16_t> naive = {0, 0, 0, 0, 0, 0, 0, 0};
auto compare = [&](const size_t n) {
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(gi_float16_t); i++) {
naive[i] = s0[i] + (s1[i] * s2[n]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
};
#define CB(n) \
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \
ret = GiSimdFmaLaneFloat16(src0, src1, src2, n); \
compare(n);
CB(0)
CB(1)
CB(2)
CB(3)
CB(4)
CB(5)
CB(6)
CB(7)
#undef CB
}
#endif
TEST_F(FALLBACK, GiMlaqLowLaneFloat32) {
GI_FLOAT32_t src0, src1, src2, ret;
std::vector<float> s0{1.1f, 2.2f, 3.5f, 4.9f};
@@ -2556,6 +2943,46 @@ TEST_F(FALLBACK, GiFmsqLaneQFloat32) {
#undef CB
}
#if defined(GI_SUPPORT_F16)
TEST_F(FALLBACK, GiFmsqLaneQFloat16) {
GI_FLOAT16_t src0, src1, src2, ret;
std::vector<gi_float16_t> s0{1.1, 2.2, 3.5, 4.9, 4.2, 9.4, 9.0, 0.0};
std::vector<gi_float16_t> s1{2312.1, 345.244, 3.59, -12.8,
12.33, 88.43, -11.54, 2.3};
std::vector<gi_float16_t> s2{1.2, -3.1, 9.0, 11.2, 4.68, -32.85, 899.43, -0.45};
s0.resize(SIMD_LEN_F16);
s1.resize(SIMD_LEN_F16);
s2.resize(SIMD_LEN_F16);
init((gi_float16_t*)&src0, s0, SIMD_LEN_F16);
init((gi_float16_t*)&src1, s1, SIMD_LEN_F16);
init((gi_float16_t*)&src2, s2, SIMD_LEN_F16);
std::vector<gi_float16_t> naive = {0, 0, 0, 0, 0, 0, 0, 0};
auto compare = [&](const size_t n) {
for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(gi_float16_t); i++) {
naive[i] = s0[i] - (s1[i] * s2[n]);
}
assert_eq((gi_float16_t*)&ret, naive, SIMD_LEN_F16);
};
#define CB(n) \
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \
ret = GiFmsqLaneQFloat16(src0, src1, src2, n); \
compare(n);
CB(0)
CB(1)
CB(2)
CB(3)
CB(4)
CB(5)
CB(6)
CB(7)
#undef CB
}
#endif
TEST_F(FALLBACK, GiBroadcastUint32) {
int32_t src0 = 20220422;
GI_UINT32_t ret;
......
@@ -14,9 +14,9 @@ set(RISCV_TOOLCHAIN_ROOT
set(CMAKE_C_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-g++")
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7 -mabi=lp64d")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7_zfh -mabi=lp64d")
 set(CMAKE_CXX_FLAGS
-    "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7 -mabi=lp64d -Wno-error=attributes")
+    "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7_zfh -mabi=lp64d -Wno-error=attributes")
set(CMAKE_FIND_ROOT_PATH "${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-linux-gnu")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
......