MegEngine / MegEngine

Commit a017bed3
Authored on Mar 08, 2022 by Megvii Engine Team
fix(fallback): rename general intrinsic type and add more intrinsic
GitOrigin-RevId: 37409bae9a9a0e8c324f546e2ed985566cca18c7
Parent: fd6f8e58

Showing 4 changed files with 707 additions and 290 deletions (+707 −290)
dnn/src/fallback/general_intrinsic/gi_common.h   +44  −36
dnn/src/fallback/general_intrinsic/gi_float.h    +301 −128
dnn/src/fallback/general_intrinsic/gi_int.h      +321 −90
dnn/src/fallback/reduce/reducer.h                +41  −36
dnn/src/fallback/general_intrinsic/gi_common.h

@@ -82,29 +82,33 @@
 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
         defined(GI_FMA_INTRINSICS)
-typedef __m256 GI_FLOAT32;
-typedef __m256i GI_UINT8;
-typedef __m256i GI_INT8;
-typedef __m256i GI_INT16;
-typedef __m256i GI_INT32;
+typedef __m256 GI_FLOAT32_t;
+typedef __m256i GI_UINT8_t;
+typedef __m256i GI_INT8_t;
+typedef __m256i GI_INT16_t;
+typedef __m256i GI_INT32_t;
+typedef __m256i GI_UINT32_t;
 #elif defined(GI_NEON_INTRINSICS)
-typedef float32x4_t GI_FLOAT32;
-typedef uint8x16_t GI_UINT8;
-typedef int8x16_t GI_INT8;
-typedef int16x8_t GI_INT16;
-typedef int32x4_t GI_INT32;
+typedef float32x4_t GI_FLOAT32_t;
+typedef uint8x16_t GI_UINT8_t;
+typedef int8x16_t GI_INT8_t;
+typedef int16x8_t GI_INT16_t;
+typedef int32x4_t GI_INT32_t;
+typedef uint32x4_t GI_UINT32_t;
 #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
-typedef __m128 GI_FLOAT32;
-typedef __m128i GI_UINT8;
-typedef __m128i GI_INT8;
-typedef __m128i GI_INT16;
-typedef __m128i GI_INT32;
+typedef __m128 GI_FLOAT32_t;
+typedef __m128i GI_UINT8_t;
+typedef __m128i GI_INT8_t;
+typedef __m128i GI_INT16_t;
+typedef __m128i GI_INT32_t;
+typedef __m128i GI_UINT32_t;
 #else
-typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int8_t GI_INT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT16 __attribute__((vector_size(16)));
-typedef int32_t GI_INT32 __attribute__((vector_size(16)));
+typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
+typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
+typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
+typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
+typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
 #endif
 //! general intrinsic support dynamic length simd, if avx or avx2 the simd
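The rename adds a `_t` suffix to every vector typedef and introduces `GI_UINT32_t` on all four backends. A minimal sketch, assuming only GI helpers visible elsewhere in this diff, of how a kernel written against the renamed types stays source-portable across NEON, SSE, and the `vector_size` fallback (the kernel itself is hypothetical):

    /* Hypothetical element-wise kernel on top of the renamed GI types; the
     * same source compiles against the NEON, SSE2, or plain-vector backend. */
    #include "gi_float.h"

    void scale_inplace(float* dst, float coef, size_t n) {
        const size_t lanes = GI_SIMD_LEN_BYTE / sizeof(float); /* 4 on 128-bit SIMD */
        GI_FLOAT32_t vcoef = GiBroadcastFloat32(coef);
        size_t i = 0;
        for (; i + lanes <= n; i += lanes) {
            GI_FLOAT32_t v = GiLoadFloat32(dst + i);
            GiStoreFloat32(dst + i, GiMultiplyFloat32(v, vcoef));
        }
        for (; i < n; i++) /* scalar tail */
            dst[i] *= coef;
    }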
@@ -129,24 +133,31 @@
 #define Min(a, b) (a) < (b) ? (a) : (b)

 typedef struct {
-    GI_INT32 val[2];
-} GI_INT32_V2;
+    GI_INT32_t val[2];
+} GI_INT32_V2_t;

 typedef struct {
-    GI_INT32 val[4];
-} GI_INT32_V4;
+    GI_INT32_t val[4];
+} GI_INT32_V4_t;

 typedef struct {
-    GI_FLOAT32 val[2];
-} GI_FLOAT32_V2;
+    GI_FLOAT32_t val[2];
+} GI_FLOAT32_V2_t;

 typedef struct {
-    GI_FLOAT32 val[4];
-} GI_FLOAT32_V4;
+    GI_FLOAT32_t val[4];
+} GI_FLOAT32_V4_t;
+
+typedef struct {
+    GI_INT16_t val[2];
+} GI_INT16_V2_t;
+
+typedef struct {
+    GI_INT8_t val[2];
+} GI_INT8_V2_t;

 GI_FORCEINLINE
-GI_INT32
-GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
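The new `_V2_t`/`_V4_t` wrappers carry several registers through one call. A minimal sketch, assuming the `GiCvtFromFloat32V2ToInt8` helper changed later in this diff, of packing two float vectors for one int8 conversion (the wrapper function is hypothetical):

    /* Hypothetical use of a V2 wrapper: two float32 registers converted to one
     * int8 register in a single call (GiCvtFromFloat32V2ToInt8 is in gi_int.h). */
    GI_INT8_t pack_two(GI_FLOAT32_t lo, GI_FLOAT32_t hi) {
        GI_FLOAT32_V2_t pair;
        pair.val[0] = lo;
        pair.val[1] = hi;
        return GiCvtFromFloat32V2ToInt8(pair);
    }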
@@ -157,8 +168,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vorrq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -169,8 +179,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
+GI_INT32_t
+GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(vmvnq_s32(VectorNot), Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -181,8 +190,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return veorq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
dnn/src/fallback/general_intrinsic/gi_float.h

@@ -14,20 +14,51 @@
 #include "gi_common.h"

 GI_FORCEINLINE
-GI_INT32
-GiReinterpretAsInt32(GI_FLOAT32 In) {
+GI_INT32_t
+GiReinterpretAsInt32(GI_FLOAT32_t In) {
 #if defined(GI_NEON_INTRINSICS)
     return vreinterpretq_s32_f32(In);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castps_si128(In);
 #else
-    return GI_INT32(In);
+    return *(GI_INT32_t*)(&In);
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t
+GiReinterpretAsUint32(GI_FLOAT32_t In) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_u32_f32(In);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(In);
+#else
+    return *(GI_UINT32_t*)(&In);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiReintInt32ToFloat32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiReintUint32ToFloat32(GI_UINT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_u32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
 #endif
 }

 GI_FORCEINLINE
-GI_INT32
-GiRoundAsInt32(GI_FLOAT32 Vector) {
+GI_INT32_t
+GiRoundAsInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     return vcvtaq_s32_f32(Vector);
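The header now separates bit-preserving reinterprets (`GiReinterpretAs*`, `GiReint*To*`) from value conversions (`GiCastTo*`, `GiRoundAsInt32`). A minimal scalar sketch of the difference, using only standard C (the two functions are illustrative, not part of the library):

    #include <stdint.h>
    #include <string.h>

    int32_t reinterpret_lane(float x) { /* same 32 bits, new type (bit cast) */
        int32_t i;
        memcpy(&i, &x, sizeof(i));
        return i;                       /* 1.0f -> 0x3f800000 */
    }

    int32_t cast_lane(float x) {        /* numeric conversion, new bits */
        return (int32_t)x;              /* 1.0f -> 1 */
    }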
@@ -47,7 +78,7 @@
     return _mm_castps_si128(
             _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = (int32_t)round(Vector[i]);
     }
@@ -56,42 +87,43 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiCastToFloat32(GI_INT32 Vector) {
+GI_INT32_t
+GiCastToInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vcvtq_f32_s32(Vector);
+    return vcvtq_s32_f32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cvtepi32_ps(Vector);
+    return _mm_cvttps_epi32(Vector);
 #else
-    GI_FLOAT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
-        ret[i] = float(Vector[i]);
+    GI_INT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = (int32_t)(Vector[i]);
     }
     return ret;
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiReinterpretAsFloat32(GI_INT32 Vector) {
+GI_FLOAT32_t
+GiCastToFloat32(GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_s32(Vector);
+    return vcvtq_f32_s32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_castsi128_ps(Vector);
+    return _mm_cvtepi32_ps(Vector);
 #else
-    return GI_FLOAT32(Vector);
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = float(Vector[i]);
+    }
+    return ret;
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
+GI_FLOAT32_t
 GiBroadcastFloat32(float Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_ps(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Value;
     }
@@ -100,14 +132,13 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiBroadcastFloat32(const float* Value) {
+GI_FLOAT32_t
+GiLoadBroadcastFloat32(const float* Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_dup_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_load_ps1(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = *Value;
     }
@@ -116,8 +147,7 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
+GI_FLOAT32_t
 GiZeroFloat32(void) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(0.0f);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -128,14 +158,13 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
+GI_FLOAT32_t
 GiLoadFloat32(const float* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_f32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_ps(Buffer);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Buffer[i];
     }
@@ -144,7 +173,7 @@
 }

 GI_FORCEINLINE
-void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
+void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_f32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -156,33 +185,22 @@
 #endif
 }

-GI_FORCEINLINE
-void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
-#if defined(GI_NEON_INTRINSICS)
-    vst1q_f32(Buffer, Vector);
-#elif defined(GI_SSE2_INTRINSICS)
-    _mm_store_ps(Buffer, Vector);
-#else
-    GiStoreFloat32(Buffer, Vector);
-#endif
-}
-
 #if defined(GI_NEON_INTRINSICS)
-#define GISTORELANEFLOAT32(i)                                                       \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        vst1q_lane_f32(Buffer, Vector, i);                                          \
+#define GISTORELANEFLOAT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        vst1q_lane_f32(Buffer, Vector, i);                                            \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GISTORELANEFLOAT32(i)                                                          \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) {    \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) {  \
         _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GISTORELANEFLOAT32(i)                                                       \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        *Buffer = Vector[i];                                                        \
+#define GISTORELANEFLOAT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        *Buffer = Vector[i];                                                          \
     }
 #endif
@@ -194,20 +212,20 @@
 #undef GISTORELANEFLOAT32

 #if defined(GI_NEON_INTRINSICS)
-#define GIEXTRACTLANEFLOAT32(i)                                         \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return vgetq_lane_f32(Vector, i);                               \
+#define GIEXTRACTLANEFLOAT32(i)                                           \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return vgetq_lane_f32(Vector, i);                                 \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GIEXTRACTLANEFLOAT32(i)                                                        \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) {                \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) {              \
         return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GIEXTRACTLANEFLOAT32(i)                                         \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return Vector[i];                                               \
+#define GIEXTRACTLANEFLOAT32(i)                                           \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return Vector[i];                                                 \
     }
 #endif
@@ -218,8 +236,7 @@
 #undef GIEXTRACTLANEFLOAT32

 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip1q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -228,7 +245,7 @@
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpacklo_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[i];
         ret[2 * i + 1] = Vector2[i];
@@ -238,8 +255,7 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip2q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -248,7 +264,7 @@
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpackhi_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i];
         ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i];
@@ -258,8 +274,7 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -270,8 +285,7 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -282,8 +296,7 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -294,12 +307,11 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
+GI_FLOAT32_t
+GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_n_f32(Vector1, Scaler);
 #elif defined(GI_SSE2_INTRINSICS)
-    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
+    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
     return _mm_mul_ps(Vector1, Vector2);
 #else
     return Vector1 * Scaler;
@@ -307,10 +319,14 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiMultiplyAddFloat32(GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_f32(VectorSum, Vector1, Vector2);
+#else
     return vmlaq_f32(VectorSum, Vector1, Vector2);
+#endif
 #elif defined(GI_FMA3_INTRINSICS)
     return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
 #elif defined(GI_SSE2_INTRINSICS)
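`GiMultiplyAddFloat32` now prefers the fused `vfmaq_f32` when `__ARM_FEATURE_FMA` is available, which rounds once instead of twice. A minimal sketch, assuming the GI helpers in this header, of the accumulate pattern it enables (the `dot` function is hypothetical):

    /* Hypothetical dot-product accumulation built on GiMultiplyAddFloat32
     * (acc = acc + a * b per lane, fused on FMA-capable targets).
     * n is assumed to be a multiple of the lane count; tail handling omitted. */
    float dot(const float* a, const float* b, size_t n) {
        const size_t lanes = GI_SIMD_LEN_BYTE / sizeof(float);
        GI_FLOAT32_t acc = GiZeroFloat32();
        for (size_t i = 0; i + lanes <= n; i += lanes)
            acc = GiMultiplyAddFloat32(acc, GiLoadFloat32(a + i), GiLoadFloat32(b + i));
        return GiReduceAddFloat32(acc); /* horizontal sum, also in this header */
    }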
@@ -321,41 +337,75 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
+GI_FLOAT32_t
+GiMultiplySubFloat32(GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
-    return vmlaq_n_f32(VectorSum, Vector, Scalar);
+    return vmlsq_f32(VectorSum, Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
+#else
+    return VectorSum - Vector1 * Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiMultiplyAddScalarFloat32(GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
+#if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_n_f32(VectorSum, Vector, Scalar);
+#else
+    return vfmla_n_f32(VectorSum, Vector, Scalar);
+#endif
 #elif defined(GI_SSE2_INTRINSICS)
-    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
+    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
 #else
     return VectorSum + Vector * Scalar;
 #endif
 }

 #if defined(GI_NEON_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i)                                           \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                  \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {  \
-        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
+#if defined(__ARM_FEATURE_FMA)
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
+    }
+GIMULTIPLYADDLANFLOAT32(0)
+GIMULTIPLYADDLANFLOAT32(1)
+#undef GIMULTIPLYADDLANFLOAT32
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
+    }
+GIMULTIPLYADDLANFLOAT32(2)
+GIMULTIPLYADDLANFLOAT32(3)
+#else
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i);      \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
 #undef GIMULTIPLYADDLANFLOAT32
-#define GIMULTIPLYADDLANFLOAT32(i)                                                \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                       \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {       \
-        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
     }
 GIMULTIPLYADDLANFLOAT32(2)
 GIMULTIPLYADDLANFLOAT32(3)
+#endif
 #undef GIMULTIPLYADDLANFLOAT32
 #elif defined(GI_SSE2_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i)                                       \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(              \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32         \
-                    Vector2) {                                           \
-        return GiMultiplyAddScalarFloat32(                               \
-                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
+#define GIMULTIPLYADDLANFLOAT32(i)                                       \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(            \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t   \
+                    Vector2) {                                           \
+        return GiMultiplyAddScalarFloat32(                               \
+                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -363,10 +413,10 @@
 GIMULTIPLYADDLANFLOAT32(3)
 #undef GIMULTIPLYADDLANFLOAT32
 #else
-#define GIMULTIPLYADDLANFLOAT32(i)                                          \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return VectorSum + Vector1 * Vector2[i];                            \
+#define GIMULTIPLYADDLANFLOAT32(i)                                                \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32(                     \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return VectorSum + Vector1 * Vector2[i];                                  \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -376,8 +426,7 @@
 #endif

 GI_FORCEINLINE
-GI_FLOAT32
-GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vdivq_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -392,64 +441,129 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
-#if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
-#elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cmpgt_ps(Vector1, Vector2);
-#else
-    return Vector1 > Vector2;
-#endif
-}
+GI_FLOAT32_t
+GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON64_INTRINSICS)
+    return vrecpsq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
+    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
+#else
+    return (2.0f - Vector1 * Vector2);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiRecpeFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vrecpeq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
+    return _mm_div_ps(ones, Vector);
+#else
+    return 1 / Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiNegFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
+    return _mm_sub_ps(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
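`GiRecpeFloat32`/`GiRecpeSFloat32` mirror NEON's `vrecpe`/`vrecps` pair: the estimate plus the step term 2 − d·x lets callers refine 1/d by Newton–Raphson, x_{n+1} = x_n · (2 − d·x_n). A minimal sketch, assuming these two helpers plus `GiMultiplyFloat32`, of one refinement step (the wrapper is hypothetical):

    /* Hypothetical reciprocal with one Newton-Raphson step. On SSE2 the
     * estimate is already an exact division; on NEON the extra step refines
     * the coarse (roughly 8-bit) vrecpe estimate. */
    GI_FLOAT32_t reciprocal(GI_FLOAT32_t d) {
        GI_FLOAT32_t x = GiRecpeFloat32(d);              /* initial estimate */
        x = GiMultiplyFloat32(GiRecpeSFloat32(d, x), x); /* x * (2 - d*x)    */
        return x;
    }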
+GI_FORCEINLINE
+GI_UINT32_t
+GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcgtq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t
+GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcleq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t
+GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcltq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
 GI_FORCEINLINE
-GI_FLOAT32
-GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_and_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_or_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
+GI_FLOAT32_t
+GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_ps(VectorNot, Vector);
 #else
-    return GiReinterpretAsFloat32(GiAndNotInt32(
+    return GiReintInt32ToFloat32(GiAndNotInt32(
             GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_xor_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
+GI_FLOAT32_t
+GiBlendFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
     return GiOrFloat32(
             GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
 }
@@ -458,14 +572,54 @@
-#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
 GI_FORCEINLINE
-GI_FLOAT32
-GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiBSLFloat32(GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vbslq_f32(Selection, Vector1, Vector2);
+#else
+    return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_f32(Vector1, Vector2);
+#elif defined(GI_NEON32_INTRINSICS)
+    return _mm_max_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t max;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        max[i] = Max(Vector1[i], Vector2[i]);
+    }
+    return max;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vminq_f32(Vector1, Vector2);
+#elif defined(GI_NEON32_INTRINSICS)
+    return _mm_min_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t min;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        min[i] = Min(Vector1[i], Vector2[i]);
+    }
+    return min;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t
+GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmaxq_f32(Vector1, Vector2);
 #else
     //! _mm_max_ps does not follow the IEEE standard when input is NAN, so
     //! implement by C code
-    GI_FLOAT32 max;
+#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
+    GI_FLOAT32_t max;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         max[i] = MAX_NAN(Vector1[i], Vector2[i]);
     }
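`GiBSLFloat32` pairs the new `GI_UINT32_t` comparison masks with NEON's `vbslq_f32` bit-select (and falls back to the and/or blend elsewhere). A minimal sketch, assuming the helpers in this header, of the branchless per-lane select pattern (the wrapper is hypothetical):

    /* Hypothetical branchless select: keep a where a > b, else b. */
    GI_FLOAT32_t select_greater(GI_FLOAT32_t a, GI_FLOAT32_t b) {
        GI_UINT32_t mask = GiGreaterThanFloat32(a, b); /* all-ones lanes where a > b */
        return GiBSLFloat32(mask, a, b);               /* mask ? a : b, bitwise      */
    }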
@@ -474,14 +628,14 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t
+GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_f32(Vector1, Vector2);
 #else
     //! _mm_min_ps does not follow the IEEE standard when input is NAN, so
     //! implement by C code
-    GI_FLOAT32 min;
+#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
+    GI_FLOAT32_t min;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         min[i] = MIN_NAN(Vector1[i], Vector2[i]);
     }
@@ -490,15 +644,14 @@
 }

 GI_FORCEINLINE
-GI_FLOAT32
-GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
+GI_FLOAT32_t
+GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
     Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
     Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
     return Value;
 }

 GI_FORCEINLINE
-float GiReduceAddFloat32(GI_FLOAT32 Vector) {
+float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     Vector = vpaddq_f32(Vector, Vector);
     Vector = vpaddq_f32(Vector, Vector);
@@ -525,7 +678,7 @@
 }

 GI_FORCEINLINE
-float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
+float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     float32x2_t low = vget_low_f32(Vector);
     float32x2_t high = vget_high_f32(Vector);
@@ -550,7 +703,7 @@
 #define Min(a, b) (a) < (b) ? (a) : (b)

 GI_FORCEINLINE
-float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vmaxvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -560,9 +713,9 @@
     VectorLow = vpmax_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -575,7 +728,7 @@
 }

 GI_FORCEINLINE
-float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vminvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -585,9 +738,9 @@
     VectorLow = vpmin_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -599,4 +752,24 @@
 #endif
 }

+GI_FORCEINLINE
+GI_FLOAT32_t
+GiAbsFloat32(GI_FLOAT32_t Vector1) {
+#if defined(GI_NEON64_INTRINSICS)
+    return vabsq_f32(Vector1);
+#elif defined(GI_SSE2_INTRINSICS)
+    union {
+        unsigned int int_val;
+        float float_val;
+    } value;
+    value.int_val = 0x7fffffff;
+    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
+#else
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
+    }
+    return ret;
+#endif
+}
+
 // vim: syntax=cpp.doxygen
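The SSE2 branch of `GiAbsFloat32` computes |x| by AND-masking away the IEEE-754 sign bit (`0x7fffffff` keeps exponent and mantissa). A scalar sketch of the same trick in portable C:

    #include <string.h>

    /* |x| without a branch: clear bit 31, the IEEE-754 binary32 sign bit. */
    float abs_via_mask(float x) {
        unsigned int bits;
        memcpy(&bits, &x, sizeof(bits));
        bits &= 0x7fffffffu;
        memcpy(&x, &bits, sizeof(x));
        return x;
    }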
dnn/src/fallback/general_intrinsic/gi_int.h

@@ -14,14 +14,13 @@
 #include "gi_common.h"

 GI_FORCEINLINE
-GI_INT32
+GI_INT32_t
 GiBroadcastInt32(int32_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi32(Value);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Value;
     }
@@ -30,14 +29,28 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiBroadcastInt8(int8_t Value) {
+GI_UINT32_t
+GiBroadcastUint32(int32_t Value) {
+#if defined(GI_NEON_INTRINSICS)
+    return vdupq_n_u32(Value);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_set1_epi32(Value);
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Value;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiBroadcastInt8(int8_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s8(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi8(Value);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Value;
     }
@@ -46,14 +59,13 @@
 }

 GI_FORCEINLINE
-GI_INT32
+GI_INT32_t
 GiLoadInt32(const int32_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -62,14 +74,13 @@
 }

 GI_FORCEINLINE
-GI_INT8
+GI_INT8_t
 GiLoadInt8(const int8_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s8(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -78,7 +89,7 @@
 }

 GI_FORCEINLINE
-void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
+void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -90,8 +101,60 @@
 #endif
 }

+#if defined(GI_NEON_INTRINSICS)
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        vst1q_lane_s32(Buffer, Vector, i);                                          \
+    }
+#elif defined(GI_SSE2_INTRINSICS)
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector);                                \
+        _mm_store_ss(                                                               \
+                (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
+    }
+#else
+#define GISTORELANEINT32(i)                                                         \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        *Buffer = Vector[i];                                                        \
+    }
+#endif
+
+GISTORELANEINT32(0)
+GISTORELANEINT32(1)
+GISTORELANEINT32(2)
+GISTORELANEINT32(3)
+#undef GISTORELANEFLOAT32
+
+GI_FORCEINLINE
+GI_INT8_t
+GiReinterInt32ToInt8(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_s8_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return Vector;
+#else
+    return *(GI_INT8_t*)&Vector;
+#endif
+}
+
+GI_FORCEINLINE
+void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    vst1q_s16(Buffer, Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    _mm_storeu_si128((__m128i*)Buffer, Vector);
+#else
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
+        Buffer[i] = Vector[i];
+    }
+#endif
+}
+
 GI_FORCEINLINE
-void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s8(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -104,7 +167,7 @@
 }

 GI_FORCEINLINE
-void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_low_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -117,7 +180,7 @@
 }

 GI_FORCEINLINE
-void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_high_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -130,8 +193,47 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiNegInt32(GI_INT32_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT32_t zero = _mm_set1_epi32(0);
+    return _mm_sub_epi32(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiNegInt8(GI_INT8_t Vector) {
+#if defined(GI_NEON32_INTRINSICS)
+    return vnegq_s8(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT32_t zero = _mm_set1_epi8(0);
+    return _mm_sub_epi8(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t
+GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vtstq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
+    return _mm_cmpeq_epi32(tmp, _mm_setzero_si128());
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t
+GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
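Per the NEON branch (`vtstq_u32`) and the scalar fallback, `GiTestAndSetUint32` produces an all-ones lane wherever the two inputs share any set bit. A one-lane sketch of that contract in plain C:

    #include <stdint.h>

    /* Lane-wise semantics of GiTestAndSetUint32, per vtstq_u32 and the
     * fallback loop: all-ones where (a & b) has any bit set, else zero. */
    uint32_t test_lane(uint32_t a, uint32_t b) {
        return (a & b) ? 0xFFFFFFFFu : 0u;
    }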
@@ -142,8 +244,40 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t
+GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT16_t
+GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s16(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi16(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi8(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t
+GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -154,20 +288,82 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t
+GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi32(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi8(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t
+GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_mul_epi32(Vector1, Vector2);
+    GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
+    GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
+    return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
 #else
     return Vector1 * Vector2;
 #endif
 }

+//! in x86, there is no int multiply, so implement it naive
+GI_FORCEINLINE
+GI_INT8_t
+GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmulq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    int8_t v1[16], v2[16], res[16];
+    _mm_storeu_si128((__m128i*)v1, Vector1);
+    _mm_storeu_si128((__m128i*)v2, Vector2);
+    for (size_t id = 0; id < 16; id++) {
+        res[id] = v1[id] * v2[id];
+    }
+    return _mm_loadu_si128((__m128i*)res);
+#else
+    return Vector1 * Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t
+GiMultiplyAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmlaq_s32(Vector1, Vector2, Vector3);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3));
+#else
+    return Vector1 + Vector2 * Vector3;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmlaq_s8(Vector1, Vector2, Vector3);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3));
+#else
+    return Vector1 + Vector2 * Vector3;
+#endif
+}
+
 GI_FORCEINLINE
-GI_INT8
-GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t
+GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
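The new SSE2 path of `GiMultiplyInt32` multiplies via float, so it is only exact while operands and product stay within float's 24-bit mantissa. A quick standalone check of that caveat (the program is illustrative, not from the library):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int32_t a = 16777217, b = 1;      /* 2^24 + 1 is not representable as float */
        float prod = (float)a * (float)b; /* mirrors _mm_cvtepi32_ps + _mm_mul_ps   */
        printf("exact=%d via-float=%d\n", a * b, (int32_t)prod); /* 16777217 vs 16777216 */
        return 0;
    }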
@@ -178,8 +374,18 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_UINT32_t
+GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return veorq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_xor_si128(Vector1, Vector2);
+#else
+    return Vector1 ^ Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vorrq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
...
@@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
}
GI_FORCEINLINE
GI_INT8
GiAndNotInt8
(
GI_INT8
VectorNot
,
GI_INT8
Vector
)
{
GI_INT8_t
GiAndNotInt8
(
GI_INT8_t
VectorNot
,
GI_INT8_t
Vector
)
{
#if defined(GI_NEON_INTRINSICS)
return
vandq_s8
(
vmvnq_s8
(
VectorNot
),
Vector
);
#elif defined(GI_SSE2_INTRINSICS)
return
_mm_andnot_si128
(
VectorNot
,
Vector
);
#else
GI_INT8
Not
=
~
VectorNot
;
GI_INT8
_t
Not
=
~
VectorNot
;
return
(
Not
&
Vector
);
#endif
}
GI_FORCEINLINE
GI_INT8
GiXorInt8
(
GI_INT8
Vector1
,
GI_INT8
Vector2
)
{
GI_INT8_t
GiXorInt8
(
GI_INT8_t
Vector1
,
GI_INT8_t
Vector2
)
{
#if defined(GI_NEON_INTRINSICS)
return
veorq_s8
(
Vector1
,
Vector2
);
#elif defined(GI_SSE2_INTRINSICS)
...
...
@@ -214,47 +418,85 @@
 #endif
 }

+GI_FORCEINLINE
+GI_INT32_t
+GiShiftLeft23Int32(GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-#define GISHIFTLEFTINT32(i)                                          \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return vshlq_n_s32(Vector, i);                               \
-    }
+    return vshlq_n_s32(Vector, 23);
 #elif defined(GI_SSE2_INTRINSICS)
-#define GISHIFTLEFTINT32(i)                                          \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return _mm_slli_epi32(Vector, i);                            \
-    }
+    return _mm_slli_epi32(Vector, 23);
 #else
-#define GISHIFTLEFTINT32(i)                                          \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return Vector << i;                                          \
-    }
+    return Vector << 23;
 #endif
+}

-GISHIFTLEFTINT32(0)
-GISHIFTLEFTINT32(1)
-GISHIFTLEFTINT32(2)
-GISHIFTLEFTINT32(3)
-#undef GISHIFTLEFTINT32
+GI_FORCEINLINE
+GI_INT32_t
+GiShiftRight23Int32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vshrq_n_s32(Vector, 23);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_srai_epi32(Vector, 23);
+#else
+    return Vector >> 23;
+#endif
+}
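A hedged aside: 23 is the mantissa width of an IEEE-754 binary32, which suggests these fixed-shift helpers feed float exponent-manipulation tricks (for example a fast exp approximation); the commit does not say so explicitly. A standalone sketch of the underlying idea, building 2^n by shifting a biased exponent into bit 23:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: constructs the float 2^n for -126 <= n <= 127 by
     * placing the biased exponent at bit 23 of a binary32. */
    float pow2i(int n) {
        uint32_t bits = (uint32_t)(n + 127) << 23;
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }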
 GI_FORCEINLINE
-GI_INT32
-GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) {
+GI_INT32_t
+GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) {
     return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
 }

 GI_FORCEINLINE
-GI_INT8
-GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
+GI_INT8_t
+GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
     return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
 }

 GI_FORCEINLINE
-GI_INT32
-GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiAbsInt32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s32(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi32(Vector);
+#else
+    GI_INT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT16_t
+GiAbsInt16(GI_INT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s16(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi16(Vector);
+#else
+    GI_INT16_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t
+GiAbsInt8(GI_INT8_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s8(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi8(Vector);
+#else
+    GI_INT8_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t
+GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_s32(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -267,8 +509,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t
+GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_s32(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -281,14 +522,12 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
+GI_INT8_t
+GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
     return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
 }

 GI_FORCEINLINE
-GI_INT8
-GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t
+GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_s8(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -301,8 +540,7 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t
+GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_s8(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -315,8 +553,7 @@
 }

 GI_FORCEINLINE
-GI_INT16
-GiMoveHighLongInt8(GI_INT8 Vector) {
+GI_INT16_t
+GiMoveHighLongInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s8(vget_high_s8(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -330,7 +567,7 @@
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT16 ret;
+    GI_INT16_t ret;
     int8_t* data = (int8_t*)&Vector;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
     for (size_t i = 0; i < half_length; i++) {
@@ -341,8 +578,7 @@
 }

 GI_FORCEINLINE
-GI_INT16
-GiMoveLowLongInt8(GI_INT8 Vector) {
+GI_INT16_t
+GiMoveLowLongInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s8(vget_low_s8(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -356,7 +592,7 @@
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT16 ret;
+    GI_INT16_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
@@ -366,8 +602,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiMoveHighLongInt16(GI_INT16 Vector) {
+GI_INT32_t
+GiMoveHighLongInt16(GI_INT16_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s16(vget_high_s16(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -381,7 +616,7 @@
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[half_length + i];
@@ -391,8 +626,7 @@
 }

 GI_FORCEINLINE
-GI_INT32
-GiMoveLowLongInt16(GI_INT16 Vector) {
+GI_INT32_t
+GiMoveLowLongInt16(GI_INT16_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s16(vget_low_s16(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -406,7 +640,7 @@
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
@@ -416,7 +650,7 @@
 }

 GI_FORCEINLINE
-int32_t GiReduceAddInt8(GI_INT8 Vector) {
+int32_t GiReduceAddInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vaddlvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -461,7 +695,7 @@
 }

 GI_FORCEINLINE
-int8_t GiReduceMaxInt8(GI_INT8 Vector) {
+int8_t GiReduceMaxInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vmaxvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -509,7 +743,7 @@
 }

 GI_FORCEINLINE
-int8_t GiReduceMinInt8(GI_INT8 Vector) {
+int8_t GiReduceMinInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vminvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -562,8 +796,7 @@
 //! convert to the short type with the lower bits holding the real data; the
 //! high bits will repeat the lower bits
 GI_FORCEINLINE
-GI_INT8
-GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
+GI_INT8_t
+GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     int32x4_t vres0 = vcvtaq_s32_f32(src);
@@ -595,7 +828,7 @@
     __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16);
     return vepi8;
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     int length = GI_SIMD_LEN_BYTE / sizeof(float);
     for (int i = 0; i < length; i++) {
         int8_t data = Saturate(round(src[i]), -128, 127);
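Per the fallback branch, each lane of `GiCvtFromFloat32ToInt8` is rounded to the nearest integer and then clamped into int8 range. A one-lane sketch of that contract in plain C:

    #include <math.h>
    #include <stdint.h>

    /* Scalar meaning of one lane of GiCvtFromFloat32ToInt8, per the fallback
     * branch: round to nearest, then saturate into [-128, 127]. */
    int8_t cvt_lane(float x) {
        float r = roundf(x);
        if (r > 127.f) r = 127.f;
        if (r < -128.f) r = -128.f;
        return (int8_t)r;
    }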
@@ -609,8 +842,7 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
+GI_INT8_t
+GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
@@ -653,7 +885,7 @@
     __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
     return vepi8;
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     int length = GI_SIMD_LEN_BYTE / sizeof(float);
     for (int i = 0; i < 2 * length; i++) {
         ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
@@ -663,8 +895,7 @@
 }

 GI_FORCEINLINE
-GI_INT8
-GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
+GI_INT8_t
+GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
@@ -726,7 +957,7 @@
     __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
     return vepi8;
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     int length = GI_SIMD_LEN_BYTE / sizeof(float);
     for (int i = 0; i < 4 * length; i++) {
         ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
dnn/src/fallback/reduce/reducer.h

@@ -46,25 +46,25 @@
     using ctype = int8_t;
     static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t);

-    GI_INT32 res[4];
+    GI_INT32_t res[4];
     int32_t remain;
     int32_t cnt;
     float coef;
-    GI_FLOAT32 vcoef;
+    GI_FLOAT32_t vcoef;
     MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) {
         memset(res, 0, sizeof(res));
         vcoef = GiBroadcastFloat32(coef);
     }
     MeanReducer() = default;
     void feed(const int8_t* val) {
-        const GI_INT8 vval = GiLoadInt8(val);
-        const GI_INT16 vval_low = GiMoveLowLongInt8(vval);
-        const GI_INT16 vval_high = GiMoveHighLongInt8(vval);
+        const GI_INT8_t vval = GiLoadInt8(val);
+        const GI_INT16_t vval_low = GiMoveLowLongInt8(vval);
+        const GI_INT16_t vval_high = GiMoveHighLongInt8(vval);

-        const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low);
-        const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low);
-        const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high);
-        const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high);
+        const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low);
+        const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low);
+        const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high);
+        const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high);

         res[0] = GiAddInt32(res[0], vval_low_low);
         res[1] = GiAddInt32(res[1], vval_low_high);
@@ -74,11 +74,11 @@
     void feed_remain(const int8_t* val) { remain += *val; }
     void post(int8_t* dst) {
         for (int i = 0; i < 4; i += 2) {
-            GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
-            GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
+            GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef);
+            GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef);
             GiStoreLowInt8(
-                    dst, (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}})));
+                    dst,
+                    (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}})));
             dst += 8;
         }
     }
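The `feed` path above widens 16 int8 lanes through int16 into four int32 accumulator vectors (`res[0..3]`), so per-lane sums cannot overflow while many elements are averaged. A scalar picture of one `feed` step, with the quarter-of-vector indexing spelled out (the function is illustrative):

    #include <stdint.h>

    /* Scalar picture of one feed(): 16 int8 inputs land in four 4-lane int32
     * accumulators, quarter i/4, lane i%4 — matching low_low .. high_high. */
    void feed_scalar(int32_t res[4][4], const int8_t val[16]) {
        for (int i = 0; i < 16; i++)
            res[i / 4][i % 4] += val[i];
    }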
@@ -93,7 +93,7 @@
     using ctype = float;
     static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
     float result;
     float coef;
     MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) {
@@ -113,7 +113,7 @@
     using ctype = float;
     static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
     float remain;
     float coef;
     MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) {
@@ -140,30 +140,33 @@
     struct _mode##Reducer<dt_float32, float, float, true> {                 \
         using ctype = float;                                                \
        static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);  \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
         _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); }  \
         _mode##Reducer() = default;                                         \
         void feed(const float* val) {                                       \
             auto vval = GiLoadFloat32(val);                                 \
-            res = Gi##_Mode##imumFloat32(res, vval);                        \
+            res = Gi##_Mode##NanFloat32(res, vval);                         \
         }                                                                   \
         void feed_remain(const float* val) {                                \
             auto vval = GiBroadcastFloat32(*val);                           \
-            res = Gi##_Mode##imumFloat32(vval, res);                        \
+            res = Gi##_Mode##NanFloat32(vval, res);                         \
         }                                                                   \
-        void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \
+        void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); }  \
     }

 REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest());
 REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max());
 #undef REDUCER_MAX_MIN_C1

+#define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
+#define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
+
 #define REDUCER_MAX_MIN_C(_mode, _Mode, _init)                              \
     template <>                                                             \
     struct _mode##Reducer<dt_float32, float, float, false> {                \
         using ctype = float;                                                \
         static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
         float remain;                                                       \
         _mode##Reducer(DType, size_t) {                                     \
             res = GiBroadcastFloat32(_init);                                \
@@ -171,12 +174,12 @@
         }                                                                   \
         _mode##Reducer() = default;                                         \
         void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
-            res = Gi##_Mode##imumFloat32(res, vval);                        \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
+            res = Gi##_Mode##NanFloat32(res, vval);                         \
         }                                                                   \
         void feed_remain(const float* val) {                                \
             using namespace std;                                            \
-            remain = _mode(*val, remain);                                   \
+            remain = _Mode##_NAN(*val, remain);                             \
         }                                                                   \
         void post(float* dst) { GiStoreFloat32(dst, res); }                 \
         void post_remain(float* dst) { *dst = remain; }                     \
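The float max/min reducers switch from `Gi{Max,Min}imumFloat32` to the new `Gi{Max,Min}NanFloat32` helpers and, in the remain path, to the `Max_NAN`/`Min_NAN` macros, so a NaN anywhere in the input propagates to the result. A one-lane sketch of the semantics the macros define:

    #include <math.h>

    /* Per the Max_NAN macro: a NaN in a wins outright; a NaN in b also wins,
     * because (a > NaN) is false. Either way NaN propagates. */
    float max_nan(float a, float b) { return (isnan(a) || a > b) ? a : b; }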
@@ -185,21 +188,23 @@
 REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest());
 REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max());
 #undef REDUCER_MAX_MIN_C
+#undef Max_NAN
+#undef Min_NAN

 #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init)                              \
     template <>                                                              \
     struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> {                  \
         using ctype = int8_t;                                                \
         static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
-        GI_INT8 res;                                                         \
+        GI_INT8_t res;                                                       \
         _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); }      \
         _mode##Reducer() = default;                                          \
         void feed(const int8_t* val) {                                       \
-            GI_INT8 vval = GiLoadInt8(val);                                  \
+            GI_INT8_t vval = GiLoadInt8(val);                                \
             res = Gi##_Mode##imumInt8(vval, res);                            \
         }                                                                    \
         void feed_remain(const int8_t* val) {                                \
-            GI_INT8 vval = GiBroadcastInt8(*val);                            \
+            GI_INT8_t vval = GiBroadcastInt8(*val);                          \
             res = Gi##_Mode##imumInt8(res, vval);                            \
         }                                                                    \
         void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); }        \
@@ -214,7 +219,7 @@
     struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> {                 \
         using ctype = int8_t;                                                \
         static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \
-        GI_INT8 res;                                                         \
+        GI_INT8_t res;                                                       \
         int8_t remain;                                                       \
         _mode##Reducer(DType, size_t) {                                      \
             res = GiBroadcastInt8(_init);                                    \
@@ -222,7 +227,7 @@
         }                                                                    \
         _mode##Reducer() = default;                                          \
         void feed(const int8_t* val) {                                       \
-            GI_INT8 vval = GiLoadInt8(val);                                  \
+            GI_INT8_t vval = GiLoadInt8(val);                                \
             res = Gi##_Mode##imumInt8(res, vval);                            \
         }                                                                    \
         void feed_remain(const int8_t* val) {                                \
@@ -248,7 +253,7 @@
     struct _mode##Reducer<dt_float32, float, float, true> {                 \
         using ctype = float;                                                \
         static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
         float remain;                                                       \
         _mode##Reducer(DType, size_t) {                                     \
             res = GiBroadcastFloat32(_init);                                \
@@ -256,7 +261,7 @@
         }                                                                   \
         _mode##Reducer() = default;                                         \
         void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
             res = Gi##_Mode##Float32(vval, res);                            \
         }                                                                   \
         void feed_remain(const float* val) {                                \
@@ -280,7 +285,7 @@
     struct _mode##Reducer<dt_float32, float, float, false> {                \
         using ctype = float;                                                \
         static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
-        GI_FLOAT32 res;                                                     \
+        GI_FLOAT32_t res;                                                   \
         float remain;                                                       \
         _mode##Reducer(DType, size_t) {                                     \
             res = GiBroadcastFloat32(_init);                                \
@@ -288,7 +293,7 @@
         }                                                                   \
         _mode##Reducer() = default;                                         \
         void feed(const float* val) {                                       \
-            GI_FLOAT32 vval = GiLoadFloat32(val);                           \
+            GI_FLOAT32_t vval = GiLoadFloat32(val);                         \
             res = Gi##_Mode##Float32(vval, res);                            \
         }                                                                   \
         void feed_remain(const float* val) {                                \
@@ -313,7 +318,7 @@
     using ctype = float;
     static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
     float result;
     SumSqrReducer(DType, size_t cnt) : result(0.0f) {
         MEGDNN_MARK_USED_VAR(cnt);
@@ -321,7 +326,7 @@
     }
     SumSqrReducer() = default;
     void feed(const float* val) {
-        GI_FLOAT32 vval = GiLoadFloat32(val);
+        GI_FLOAT32_t vval = GiLoadFloat32(val);
         res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
     }
     void feed_remain(const float* val) {
@@ -338,7 +343,7 @@
     using ctype = float;
     static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float);

-    GI_FLOAT32 res;
+    GI_FLOAT32_t res;
     float remain;
     SumSqrReducer(DType, size_t cnt) : remain(0.0f) {
         MEGDNN_MARK_USED_VAR(cnt);
@@ -346,7 +351,7 @@
     }
     SumSqrReducer() = default;
    void feed(const float* val) {
-        GI_FLOAT32 vval = GiLoadFloat32(val);
+        GI_FLOAT32_t vval = GiLoadFloat32(val);
         res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res);
     }
     void feed_remain(const float* val) { remain += (*val) * (*val); }