Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
45c0e40f
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
45c0e40f
编写于
1月 31, 2023
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn): add some fallback gi fp16 intrinsic
GitOrigin-RevId: 67f091d0b87982452287c0af98b89abf8f065809
上级
5a8ab1f3
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
668 addition
and
2 deletion
+668
-2
dnn/src/fallback/general_intrinsic/gi_common.h
dnn/src/fallback/general_intrinsic/gi_common.h
+31
-0
dnn/src/fallback/general_intrinsic/gi_float16.h
dnn/src/fallback/general_intrinsic/gi_float16.h
+208
-0
dnn/test/fallback/gi.cpp
dnn/test/fallback/gi.cpp
+427
-0
toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
+2
-2
未找到文件。
dnn/src/fallback/general_intrinsic/gi_common.h
浏览文件 @
45c0e40f
...
...
@@ -92,6 +92,13 @@
#undef GI_RVV_INTRINSICS
#endif
//! Gi fp16 only support arm64 neon and rvv
#if (defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && \
MEGDNN_AARCH64) || \
defined(GI_RVV_INTRINSICS)
#define GI_SUPPORT_F16
#endif
//! general intrinsic support dynamic length simd, if avx or avx2 the simd
//! length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
...
...
@@ -127,6 +134,12 @@ enum GiSimdType {
GI_RVV
,
};
#if defined(GI_RVV_INTRINSICS)
typedef
float16_t
gi_float16_t
;
#elif defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef
__fp16
gi_float16_t
;
#endif
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
defined(GI_FMA_INTRINSICS)
#define __gi_simd_type GI_AVX
...
...
@@ -139,6 +152,10 @@ typedef __m256i GI_UINT32_t;
#elif defined(GI_NEON_INTRINSICS)
#define __gi_simd_type GI_NEON
typedef
float32x4_t
GI_FLOAT32_t
;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef
float16x8_t
GI_FLOAT16_t
;
typedef
float16x8x2_t
GI_FLOAT16_V2_t
;
#endif
typedef
uint8x16_t
GI_UINT8_t
;
typedef
int8x16_t
GI_INT8_t
;
typedef
int16x8_t
GI_INT16_t
;
...
...
@@ -287,6 +304,8 @@ typedef __m64_128 float32x2_t;
#elif defined(GI_RVV_INTRINSICS)
#define __gi_simd_type GI_RVV
typedef
vfloat32m1_t
GI_FLOAT32_t
;
typedef
vfloat16m1_t
GI_FLOAT16_t
;
typedef
vfloat16m1x2_t
GI_FLOAT16_V2_t
;
typedef
vuint8m1_t
GI_UINT8_t
;
typedef
vint8m1_t
GI_INT8_t
;
typedef
vint16m1_t
GI_INT16_t
;
...
...
@@ -423,6 +442,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorFloat32V3(s, index) vget_f32m1x3_f32m1(s, index)
#define GiGetSubVectorFloat32V4(s, index) vget_f32m1x4_f32m1(s, index)
#define GiGetSubVectorFloat16V2(s, index) vget_f16m1x2_f16m1(s, index)
#define GiGetSubVectorInt32V2(s, index) vget_i32m1x2_i32m1(s, index)
#define GiGetSubVectorInt32V4(s, index) vget_i32m1x4_i32m1(s, index)
...
...
@@ -437,6 +458,8 @@ typedef GI_UINT32_NAIVE_t GI_UINT32_FIXLEN_t;
#define GiSetSubVectorFloat32V3(d, index, s) d = vset_f32m1x3(d, index, s)
#define GiSetSubVectorFloat32V4(d, index, s) d = vset_f32m1x4(d, index, s)
#define GiSetSubVectorFloat16V2(d, index, s) d = vset_f16m1x2(d, index, s)
#define GiSetSubVectorInt32V2(d, index, s) d = vset_i32m1x2(d, index, s)
#define GiSetSubVectorInt32V4(d, index, s) d = vset_i32m1x4(d, index, s)
...
...
@@ -578,6 +601,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiGetSubVectorFloat32V3(s, index) s.val[index]
#define GiGetSubVectorFloat32V4(s, index) s.val[index]
#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiGetSubVectorFloat16V2(s, index) s.val[index]
#endif
#define GiGetSubVectorInt32V2(s, index) s.val[index]
#define GiGetSubVectorInt32V4(s, index) s.val[index]
...
...
@@ -592,6 +619,10 @@ typedef GI_UINT32_t GI_UINT32_FIXLEN_t;
#define GiSetSubVectorFloat32V3(d, index, s) d.val[index] = s
#define GiSetSubVectorFloat32V4(d, index, s) d.val[index] = s
#if defined(GI_NEON_INTRINSICS) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#define GiSetSubVectorFloat16V2(d, index, s) d.val[index] = s
#endif
#define GiSetSubVectorInt32V2(d, index, s) d.val[index] = s
#define GiSetSubVectorInt32V4(d, index, s) d.val[index] = s
...
...
dnn/src/fallback/general_intrinsic/gi_float16.h
0 → 100644
浏览文件 @
45c0e40f
#pragma once
#include "gi_common.h"

//! This header is only meaningful when the target supports f16 vector
//! arithmetic (arm64 NEON with FP16, or RVV) -- see GI_SUPPORT_F16 in
//! gi_common.h.
#if defined(GI_SUPPORT_F16)

//! FMA helpers: both compute c + b * a.
//! When the hardware FMA extension is available they map to a single fused
//! instruction; otherwise they fall back to separate multiply + add.
#if defined(GI_NEON_INTRINSICS)
#if defined(__ARM_FEATURE_FMA)
#define v_fma_ps_f16(c, b, a) vfmaq_f16((c), (b), (a))
#define v_fma_n_f16(c, b, a) vfmaq_n_f16((c), (b), (a))
#else
#define v_fma_ps_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), (a)))
#define v_fma_n_f16(c, b, a) vaddq_f16((c), vmulq_f16((b), vdupq_n_f16(a)))
#endif
#endif
//! Broadcast the scalar \p Value into every f16 lane of a vector.
GI_FORCEINLINE
GI_FLOAT16_t GiBroadcastFloat16(gi_float16_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f16(Value);
#elif defined(GI_RVV_INTRINSICS)
    //! vl = element count of one SIMD register in f16 lanes
    return vfmv_v_f_f16m1(Value, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Load a single f16 scalar from \p Value and broadcast it to every lane.
GI_FORCEINLINE
GI_FLOAT16_t GiLoadBroadcastFloat16(const gi_float16_t* Value) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f16(Value);
#elif defined(GI_RVV_INTRINSICS)
    //! no dedicated load-and-dup used here: dereference, then splat
    return GiBroadcastFloat16(*Value);
#endif
}
//! Widen one f16 vector into a pair of f32 vectors.
//! Sub-vector 0 holds the low-half lanes, sub-vector 1 the high-half lanes.
GI_FORCEINLINE
GI_FLOAT32_V2_t GiCastFloat16ToFloat32(const GI_FLOAT16_t& fp16) {
#if defined(GI_NEON_INTRINSICS)
    GI_FLOAT32_V2_t ret;
    GiSetSubVectorFloat32V2(ret, 0, vcvt_f32_f16(vget_low_f16(fp16)));
    GiSetSubVectorFloat32V2(ret, 1, vcvt_f32_f16(vget_high_f16(fp16)));
    return ret;
#elif defined(GI_RVV_INTRINSICS)
    GI_FLOAT32_V2_t ret;
    //! widening convert produces an m2 register group (same element count,
    //! twice the bits); vl counted in f16 elements
    vfloat32m2_t tmp =
            vfwcvt_f_f_v_f32m2(fp16, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
    //! split the m2 group into its two m1 halves
    GiSetSubVectorFloat32V2(ret, 0, vget_v_f32m2_f32m1(tmp, 0));
    GiSetSubVectorFloat32V2(ret, 1, vget_v_f32m2_f32m1(tmp, 1));
    return ret;
#endif
}
//! Narrow two f32 vectors into one f16 vector: \p low fills the low-half
//! lanes and \p high the high-half lanes.
GI_FORCEINLINE
GI_FLOAT16_t GiCastFloat32ToFloat16(
        const GI_FLOAT32_t& low, const GI_FLOAT32_t& high) {
#if defined(GI_NEON_INTRINSICS)
    return vcombine_f16(vcvt_f16_f32(low), vcvt_f16_f32(high));
#elif defined(GI_RVV_INTRINSICS)
    //! assemble an m2 group from the two m1 inputs, then do one narrowing
    //! convert; vl is the f16 element count (== f32 element count of the
    //! m2 group)
    vfloat32m2_t tmp = vfmv_v_f_f32m2(0.0, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
    tmp = vset_v_f32m1_f32m2(tmp, 0, low);
    tmp = vset_v_f32m1_f32m2(tmp, 1, high);
    return vfncvt_f_f_w_f16m1(tmp, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Return an f16 vector with every lane set to zero.
GI_FORCEINLINE
GI_FLOAT16_t GiZeroFloat16(void) {
    return GiBroadcastFloat16(0.0);
}
//! Load one full vector of f16 elements from \p Buffer.
GI_FORCEINLINE
GI_FLOAT16_t GiLoadFloat16(const gi_float16_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f16(Buffer);
#elif defined(GI_RVV_INTRINSICS)
    return vle16_v_f16m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Multiply-accumulate: return a + b * c (element-wise).
GI_FORCEINLINE
GI_FLOAT16_t GiMlaqFloat16(GI_FLOAT16_t a, GI_FLOAT16_t b, GI_FLOAT16_t c) {
#if defined(GI_NEON_INTRINSICS)
    //! reuse the v_fma_ps_f16 helper defined at the top of this file: it
    //! already selects vfmaq_f16 vs. add+mul on __ARM_FEATURE_FMA, so the
    //! selection logic is not duplicated here
    return v_fma_ps_f16(a, b, c);
#elif defined(GI_RVV_INTRINSICS)
    //! vfmadd: b * c + a, vl counted in f16 elements
    return vfmadd_vv_f16m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Store one full vector of f16 elements to \p Buffer.
GI_FORCEINLINE
void GiStoreFloat16(gi_float16_t* Buffer, GI_FLOAT16_t Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f16(Buffer, Vector);
#elif defined(GI_RVV_INTRINSICS)
    vse16_v_f16m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Element-wise addition: Vector1 + Vector2.
GI_FORCEINLINE
GI_FLOAT16_t GiAddFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vfadd_vv_f16m1(
            Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Element-wise subtraction: Vector1 - Vector2.
GI_FORCEINLINE
GI_FLOAT16_t GiSubtractFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vfsub_vv_f16m1(
            Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Element-wise multiplication: Vector1 * Vector2.
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vfmul_vv_f16m1(
            Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Multiply every lane of Vector1 by the scalar \p Scaler.
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyScalerFloat16(GI_FLOAT16_t Vector1, gi_float16_t Scaler) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f16(Vector1, Scaler);
#elif defined(GI_RVV_INTRINSICS)
    return vfmul_vf_f16m1(
            Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Scalar multiply-accumulate: return VectorSum + Vector * Scalar.
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplyAddScalarFloat16(
        GI_FLOAT16_t VectorSum, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
#if defined(GI_NEON_INTRINSICS)
    //! v_fma_n_f16(c, b, a) computes c + b * a (see top of file)
    return v_fma_n_f16(VectorSum, Vector, Scalar);
#elif defined(GI_RVV_INTRINSICS)
    //! vfmadd: Vector * Scalar + VectorSum
    return vfmadd_vf_f16m1(
            Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Scalar multiply-subtract: return VectorSub - Vector * Scalar.
GI_FORCEINLINE
GI_FLOAT16_t GiMultiplySubScalarFloat16(
        GI_FLOAT16_t VectorSub, GI_FLOAT16_t Vector, gi_float16_t Scalar) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f16(VectorSub, vmulq_n_f16(Vector, Scalar));
#elif defined(GI_RVV_INTRINSICS)
    //! vfnmsub: -(Vector * Scalar) + VectorSub -- same result as the NEON
    //! branch above
    return vfnmsub_vf_f16m1(
            Vector, Scalar, VectorSub, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Element-wise maximum of Vector1 and Vector2.
GI_FORCEINLINE
GI_FLOAT16_t GiMaximumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vfmax_vv_f16m1(
            Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Element-wise minimum of Vector1 and Vector2.
GI_FORCEINLINE
GI_FLOAT16_t GiMinimumFloat16(GI_FLOAT16_t Vector1, GI_FLOAT16_t Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f16(Vector1, Vector2);
#elif defined(GI_RVV_INTRINSICS)
    return vfmin_vv_f16m1(
            Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));
#endif
}
//! Lane FMA: a + b * c[d], where d selects one lane of vector c.
#if defined(GI_NEON_INTRINSICS)
#define GiSimdFmaLaneFloat16(a, b, c, d) vfmaq_laneq_f16(a, b, c, d)
#elif defined(GI_RVV_INTRINSICS)
//! RVV has no lane-indexed FMA: spill __c to a stack array, read lane
//! __lane, then use the scalar-operand vfmadd form.
#define __rvv_fmaq_laneq_f16(__a, __b, __c, __lane)                        \
    __extension__({                                                        \
        gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)];           \
        vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));    \
        GI_FLOAT16_t __ret = vfmadd_vf_f16m1(                              \
                __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
        __ret;                                                             \
    })
#define GiSimdFmaLaneFloat16(a, b, c, d) __rvv_fmaq_laneq_f16(a, b, c, d)
#endif

//! Lane FMS: a - b * v[lane], where lane selects one lane of vector v.
#if defined(GI_NEON_INTRINSICS)
#define GiFmsqLaneQFloat16(a, b, v, lane) vfmsq_laneq_f16(a, b, v, lane)
#elif defined(GI_RVV_INTRINSICS)
//! same spill-to-stack trick as above, with vfnmsub: -(__b * t[__lane]) + __a
#define __rvv_fmsq_lane_float16(__a, __b, __c, __lane)                     \
    __extension__({                                                        \
        gi_float16_t t[GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)];           \
        vse16_v_f16m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t));    \
        GI_FLOAT16_t __ret = vfnmsub_vf_f16m1(                             \
                __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)); \
        __ret;                                                             \
    })
#define GiFmsqLaneQFloat16(a, b, c, d) __rvv_fmsq_lane_float16(a, b, c, d)
#endif

#endif
\ No newline at end of file
dnn/test/fallback/gi.cpp
浏览文件 @
45c0e40f
...
...
@@ -8,6 +8,7 @@ class FALLBACK : public ::testing::Test {};
#endif
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_float16.h"
#include "src/fallback/general_intrinsic/gi_int.h"
namespace
megdnn
{
...
...
@@ -16,6 +17,9 @@ namespace test {
#define SIMD_LEN GI_SIMD_LEN_BYTE / sizeof(float)
#define SIMD_LEN_16 GI_SIMD_LEN_BYTE / sizeof(int16_t)
#define SIMD_LEN_8 GI_SIMD_LEN_BYTE / sizeof(int8_t)
#if defined(GI_SUPPORT_F16)
#define SIMD_LEN_F16 GI_SIMD_LEN_BYTE / sizeof(gi_float16_t)
#endif
template
<
typename
T
>
static
void
init
(
T
*
dst
,
const
std
::
vector
<
T
>&
value
,
const
size_t
simd_len
=
SIMD_LEN
)
{
...
...
@@ -975,6 +979,23 @@ TEST_F(FALLBACK, GiBroadcastFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiBroadcastFloat16
)
{
GI_FLOAT16_t
ret
;
gi_float16_t
b
=
3672.8932
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiBroadcastFloat16
(
b
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
b
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiBroadcastInt32
)
{
GI_INT32_t
ret
;
int32_t
b
=
20220420
;
...
...
@@ -1139,6 +1160,50 @@ TEST_F(FALLBACK, GiCastToFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiCastFloat16ToFloat32
)
{
GI_FLOAT16_t
src0
;
GI_FLOAT32_V2_t
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
10.34
,
32.543
,
0.03
,
4.76
,
89.43
,
19.32
,
0.0
,
78.41
};
s0
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiCastFloat16ToFloat32
(
src0
);
std
::
vector
<
float
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
((
float
)
s0
[
i
]);
}
assert_eq
((
float
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
TEST_F
(
FALLBACK
,
GiCastFloat32ToFloat16
)
{
GI_FLOAT32_t
src0
,
src1
;
GI_FLOAT16_t
ret
;
std
::
vector
<
float
>
s0
{
10.34
,
32.543
,
0.03
,
4.76
};
std
::
vector
<
float
>
s1
{
89.43
,
19.32
,
0.0
,
78.41
};
s0
.
resize
(
SIMD_LEN
);
s1
.
resize
(
SIMD_LEN
);
init
((
float
*
)
&
src0
,
s0
);
init
((
float
*
)
&
src1
,
s1
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiCastFloat32ToFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN
;
i
++
)
{
naive
.
push_back
((
gi_float16_t
)
s0
[
i
]);
}
for
(
size_t
i
=
0
;
i
<
SIMD_LEN
;
i
++
)
{
naive
.
push_back
((
gi_float16_t
)
s1
[
i
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiLoadBroadcastFloat32
)
{
GI_FLOAT32_t
ret
;
float
p
=
2022.0420
;
...
...
@@ -1154,6 +1219,23 @@ TEST_F(FALLBACK, GiLoadBroadcastFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiLoadBroadcastFloat16
)
{
GI_FLOAT16_t
ret
;
gi_float16_t
p
=
4327.3187
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiLoadBroadcastFloat16
(
&
p
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
p
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiZeroFloat32
)
{
GI_FLOAT32_t
ret
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
...
...
@@ -1170,6 +1252,24 @@ TEST_F(FALLBACK, GiZeroFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiZeroFloat16
)
{
GI_FLOAT16_t
ret
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
gi_float16_t
p
=
0.0
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiZeroFloat16
();
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
p
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiLoadFloat32
)
{
GI_FLOAT32_t
ret
;
std
::
vector
<
float
>
s0
{
2.3
f
,
4.7
f
,
-
1.4
f
,
1223.6
f
};
...
...
@@ -1186,6 +1286,25 @@ TEST_F(FALLBACK, GiLoadFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiLoadFloat16
)
{
GI_FLOAT16_t
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
2.3
,
4.7
,
-
1.4
,
1223.6
,
2346.896
,
1.23
,
-
908.32
,
3.2
};
s0
.
resize
(
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiLoadFloat16
(
s0
.
data
());
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiLoadFloat32V2
)
{
GI_FLOAT32_V2_t
ret
;
std
::
vector
<
float
>
s0
{
2.3
f
,
4.7
f
,
-
1.4
f
,
1223.6
f
,
1.1
f
,
4.0
f
,
99.7
f
,
1234.9
f
};
...
...
@@ -1245,6 +1364,32 @@ TEST_F(FALLBACK, GiMlaqFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMlaqFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
4.2
,
9.4
,
9.0
,
0.0
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
12.33
,
88.43
,
-
11.54
,
2.3
};
std
::
vector
<
gi_float16_t
>
s2
{
1.2
,
-
3.1
,
9.0
,
11.2
,
4.68
,
-
32.85
,
899.43
,
-
0.45
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
s2
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src2
,
s2
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMlaqFloat16
(
src0
,
src1
,
src2
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
+
(
s1
[
i
]
*
s2
[
i
]));
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiUzpqFloat32
)
{
GI_FLOAT32_t
src0
,
src1
;
GI_FLOAT32_V2_t
ret
;
...
...
@@ -1560,6 +1705,20 @@ TEST_F(FALLBACK, GiStoreFloat32) {
assert_eq
(
ret
.
data
(),
s0
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiStoreFloat16
)
{
GI_FLOAT16_t
src0
;
std
::
vector
<
gi_float16_t
>
s0
{
2.3
,
4.7
,
-
1.4
,
1223.6
,
2346.896
,
1.23
,
-
908.32
,
3.2
};
s0
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
std
::
vector
<
gi_float16_t
>
ret
{
0
};
ret
.
resize
(
SIMD_LEN_F16
);
GiStoreFloat16
(
ret
.
data
(),
src0
);
assert_eq
(
ret
.
data
(),
s0
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiStoreFloat32V2
)
{
GI_FLOAT32_V2_t
src0
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
,
-
1.1
f
,
-
2.2
f
,
-
3.5
f
,
-
4.9
};
...
...
@@ -1699,6 +1858,30 @@ TEST_F(FALLBACK, GiAddFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiAddFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
-
4.5
,
2.3
,
4.5
,
1.45
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
23.56
,
79.432
,
478.432
,
439.21
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiAddFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
+
s1
[
i
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiSubtractFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -1720,6 +1903,30 @@ TEST_F(FALLBACK, GiSubtractFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiSubtractFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
-
4.5
,
2.3
,
4.5
,
1.45
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
23.56
,
79.432
,
478.432
,
439.21
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiSubtractFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
-
s1
[
i
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMultiplyFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -1741,6 +1948,30 @@ TEST_F(FALLBACK, GiMultiplyFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMultiplyFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
-
4.5
,
2.3
,
4.5
,
1.45
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
23.56
,
79.432
,
478.432
,
439.21
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMultiplyFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
*
s1
[
i
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMultiplyScalerFloat32
)
{
GI_FLOAT32_t
src0
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -1761,6 +1992,28 @@ TEST_F(FALLBACK, GiMultiplyScalerFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMultiplyScalerFloat16
)
{
GI_FLOAT16_t
src0
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
3.3
,
1.12
,
2.75
,
6.23
};
s0
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
gi_float16_t
scalar
=
3.1415
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMultiplyScalerFloat16
(
src0
,
scalar
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
*
scalar
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMultiplyAddFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -1808,6 +2061,31 @@ TEST_F(FALLBACK, GiMultiplyAddScalarFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMultiplyAddScalarFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
3.3
,
1.12
,
2.75
,
6.23
};
std
::
vector
<
gi_float16_t
>
s1
{
3.54
,
34.2
,
7.652
,
4.9
,
2.154
,
5.432
,
4.783
,
4.326
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
gi_float16_t
scalar
=
3.1415
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMultiplyAddScalarFloat16
(
src1
,
src0
,
scalar
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s1
[
i
]
+
s0
[
i
]
*
scalar
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMultiplySubScalarFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -1831,6 +2109,31 @@ TEST_F(FALLBACK, GiMultiplySubScalarFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMultiplySubScalarFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
3.3
,
1.12
,
2.75
,
6.23
};
std
::
vector
<
gi_float16_t
>
s1
{
3.54
,
34.2
,
7.652
,
4.9
,
2.154
,
5.432
,
4.783
,
4.3
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
gi_float16_t
scalar
=
3.1415
;
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMultiplySubScalarFloat16
(
src0
,
src1
,
scalar
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
s0
[
i
]
-
s1
[
i
]
*
scalar
);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMultiplyAddLanXXFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -2169,6 +2472,28 @@ TEST_F(FALLBACK, GiMaximumFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMaximumFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
-
3.3
,
1.12
,
2.75
,
-
6.23
};
std
::
vector
<
gi_float16_t
>
s1
{
3.54
,
-
34.2
,
7.652
,
4.9
,
2.154
,
-
5.432
,
4.783
,
4.326
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMaximumFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
Max
(
s0
[
i
],
s1
[
i
]));
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMinimumFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
4.5
f
,
4.9
f
};
...
...
@@ -2189,6 +2514,28 @@ TEST_F(FALLBACK, GiMinimumFloat32) {
assert_eq
((
float
*
)
&
ret
,
naive
);
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiMinimumFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
-
3.3
,
1.12
,
2.75
,
-
6.23
};
std
::
vector
<
gi_float16_t
>
s1
{
3.54
,
-
34.2
,
7.652
,
4.9
,
2.154
,
-
5.432
,
4.783
,
4.326
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
force_memset_ret
((
void
*
)
&
ret
,
GI_SIMD_LEN_BYTE
);
ret
=
GiMinimumFloat16
(
src0
,
src1
);
std
::
vector
<
gi_float16_t
>
naive
;
for
(
size_t
i
=
0
;
i
<
SIMD_LEN_F16
;
i
++
)
{
naive
.
push_back
(
Min
(
s0
[
i
],
s1
[
i
]));
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
}
#endif
TEST_F
(
FALLBACK
,
GiMaxNanFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
4.5
f
,
NAN
};
...
...
@@ -2461,6 +2808,46 @@ TEST_F(FALLBACK, GiSimdFmaLane) {
#undef CB
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiSimdFmaLaneFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
4.2
,
9.4
,
9.0
,
0.0
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
12.33
,
88.43
,
-
11.54
,
2.3
};
std
::
vector
<
gi_float16_t
>
s2
{
1.2
,
-
3.1
,
9.0
,
11.2
,
4.68
,
-
32.85
,
899.43
,
-
0.45
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
s2
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src2
,
s2
,
SIMD_LEN_F16
);
std
::
vector
<
gi_float16_t
>
naive
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
auto
compare
=
[
&
](
const
size_t
n
)
{
for
(
size_t
i
=
0
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
gi_float16_t
);
i
++
)
{
naive
[
i
]
=
s0
[
i
]
+
(
s1
[
i
]
*
s2
[
n
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
};
#define CB(n) \
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \
ret = GiSimdFmaLaneFloat16(src0, src1, src2, n); \
compare(n);
CB
(
0
)
CB
(
1
)
CB
(
2
)
CB
(
3
)
CB
(
4
)
CB
(
5
)
CB
(
6
)
CB
(
7
)
#undef CB
}
#endif
TEST_F
(
FALLBACK
,
GiMlaqLowLaneFloat32
)
{
GI_FLOAT32_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
float
>
s0
{
1.1
f
,
2.2
f
,
3.5
f
,
4.9
f
};
...
...
@@ -2556,6 +2943,46 @@ TEST_F(FALLBACK, GiFmsqLaneQFloat32) {
#undef CB
}
#if defined(GI_SUPPORT_F16)
TEST_F
(
FALLBACK
,
GiFmsqLaneQFloat16
)
{
GI_FLOAT16_t
src0
,
src1
,
src2
,
ret
;
std
::
vector
<
gi_float16_t
>
s0
{
1.1
,
2.2
,
3.5
,
4.9
,
4.2
,
9.4
,
9.0
,
0.0
};
std
::
vector
<
gi_float16_t
>
s1
{
2312.1
,
345.244
,
3.59
,
-
12.8
,
12.33
,
88.43
,
-
11.54
,
2.3
};
std
::
vector
<
gi_float16_t
>
s2
{
1.2
,
-
3.1
,
9.0
,
11.2
,
4.68
,
-
32.85
,
899.43
,
-
0.45
};
s0
.
resize
(
SIMD_LEN_F16
);
s1
.
resize
(
SIMD_LEN_F16
);
s2
.
resize
(
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src0
,
s0
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src1
,
s1
,
SIMD_LEN_F16
);
init
((
gi_float16_t
*
)
&
src2
,
s2
,
SIMD_LEN_F16
);
std
::
vector
<
gi_float16_t
>
naive
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
auto
compare
=
[
&
](
const
size_t
n
)
{
for
(
size_t
i
=
0
;
i
<
GI_SIMD_LEN_BYTE
/
sizeof
(
gi_float16_t
);
i
++
)
{
naive
[
i
]
=
s0
[
i
]
-
(
s1
[
i
]
*
s2
[
n
]);
}
assert_eq
((
gi_float16_t
*
)
&
ret
,
naive
,
SIMD_LEN_F16
);
};
#define CB(n) \
force_memset_ret((void*)&ret, GI_SIMD_LEN_BYTE); \
ret = GiFmsqLaneQFloat16(src0, src1, src2, n); \
compare(n);
CB
(
0
)
CB
(
1
)
CB
(
2
)
CB
(
3
)
CB
(
4
)
CB
(
5
)
CB
(
6
)
CB
(
7
)
#undef CB
}
#endif
TEST_F
(
FALLBACK
,
GiBroadcastUint32
)
{
int32_t
src0
=
20220422
;
GI_UINT32_t
ret
;
...
...
toolchains/riscv64-rvv-linux-gnu.toolchain.cmake
浏览文件 @
45c0e40f
...
...
@@ -14,9 +14,9 @@ set(RISCV_TOOLCHAIN_ROOT
set
(
CMAKE_C_COMPILER
"
${
RISCV_TOOLCHAIN_ROOT
}
/bin/riscv64-unknown-linux-gnu-gcc"
)
set
(
CMAKE_CXX_COMPILER
"
${
RISCV_TOOLCHAIN_ROOT
}
/bin/riscv64-unknown-linux-gnu-g++"
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
-march=rv64gcv0p7 -mabi=lp64d"
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
-march=rv64gcv0p7
_zfh
-mabi=lp64d"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-march=rv64gcv0p7 -mabi=lp64d -Wno-error=attributes"
)
"
${
CMAKE_CXX_FLAGS
}
-march=rv64gcv0p7
_zfh
-mabi=lp64d -Wno-error=attributes"
)
set
(
CMAKE_FIND_ROOT_PATH
"
${
RISCV_TOOLCHAIN_ROOT
}
/riscv64-unknown-linux-gnu"
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录