Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
10f23778
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
10f23778
编写于
2月 17, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(fallback): add simd general intrinsic
GitOrigin-RevId: ad78ba689fe5a1a142964bc508a4da625705cdb8
上级
286051ed
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
1515 addition
and
0 deletion
+1515
-0
dnn/src/fallback/general_intrinsic/gi_common.h
dnn/src/fallback/general_intrinsic/gi_common.h
+186
-0
dnn/src/fallback/general_intrinsic/gi_float.h
dnn/src/fallback/general_intrinsic/gi_float.h
+596
-0
dnn/src/fallback/general_intrinsic/gi_int.h
dnn/src/fallback/general_intrinsic/gi_int.h
+733
-0
未找到文件。
dnn/src/fallback/general_intrinsic/gi_common.h
0 → 100644
浏览文件 @
10f23778
/**
 * \file dnn/src/fallback/general_intrinsic/gi_common.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include <math.h>
#include <stdint.h>

#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#else
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#define GI_TARGET_ARM
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#define GI_TARGET_X86
#endif
#endif

#ifdef _WIN32
//! GI stand for general intrinsic
#define GI_DECLSPEC_ALIGN(variable, alignment) DECLSPEC_ALIGN(alignment) variable
#else
#define GI_DECLSPEC_ALIGN(variable, alignment) \
    variable __attribute__((aligned(alignment)))
#endif

#if defined(_MSC_VER)
#define GI_FORCEINLINE __forceinline
#else
#define GI_FORCEINLINE __attribute__((always_inline)) inline
#endif

#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern "C"
#else
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden")))
#endif

//! pick the SIMD instruction family from the compiler's target macros
#if defined(GI_TARGET_ARM)
#define GI_NEON_INTRINSICS
#if defined(__aarch64__)
#define GI_NEON64_INTRINSICS
#else
#define GI_NEON32_INTRINSICS
#endif
#elif defined(GI_TARGET_X86)
//#if defined(__FMA__)
//#define GI_FMA_INTRINSICS
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX2__)
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX__)
//#define GI_AVX_INTRINSICS
#if defined(__SSE4_2__)
#define GI_SSE42_INTRINSICS
#define GI_SSE2_INTRINSICS
#elif defined(__SSE2__)
#define GI_SSE2_INTRINSICS
#endif
#endif

//! vector types: 256-bit on AVX, 128-bit on NEON/SSE, and a GCC
//! vector-extension fallback when no SIMD target is detected
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
typedef __m256 GI_FLOAT32;
typedef __m256i GI_UINT8;
typedef __m256i GI_INT8;
typedef __m256i GI_INT16;
typedef __m256i GI_INT32;
#elif defined(GI_NEON_INTRINSICS)
typedef float32x4_t GI_FLOAT32;
typedef uint8x16_t GI_UINT8;
typedef int8x16_t GI_INT8;
typedef int16x8_t GI_INT16;
typedef int32x4_t GI_INT32;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
typedef __m128 GI_FLOAT32;
typedef __m128i GI_UINT8;
typedef __m128i GI_INT8;
typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
#else
//! FIX: the 8-bit fallback vectors previously used 16-bit element types
//! (uint16_t/int16_t), which yields only 8 lanes in a 16-byte vector while
//! the per-lane fallback loops elsewhere index 16 lanes (out of bounds);
//! use real 8-bit element types so lane counts match GI_SIMD_LEN_BYTE.
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#endif

//! general intrinsic support dynamic length simd, if avx or avx2 the simd
//! length is 256
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
//! if neon and sse the simd length is 128
#define GI_SIMD_LEN 256
#define GI_SIMD_LEN_BYTE 32
#elif defined(GI_NEON_INTRINSICS) || defined(GI_SSE2_INTRINSICS) || \
        defined(GI_SSE42_INTRINSICS)
#define GI_SIMD_LEN 128
#define GI_SIMD_LEN_BYTE 16
#else
//! if no simd hardware support, the simd is implemented by C, default set to
//! 128
#define GI_SIMD_LEN 128
#define GI_SIMD_LEN_BYTE 16
#endif

//! multi-vector aggregates, mirroring NEON's int32x4x2_t-style types
typedef struct {
    GI_INT32 val[2];
} GI_INT32_V2;

typedef struct {
    GI_INT32 val[4];
} GI_INT32_V4;

typedef struct {
    GI_FLOAT32 val[2];
} GI_FLOAT32_V2;

typedef struct {
    GI_FLOAT32 val[4];
} GI_FLOAT32_V4;
//! GiAndInt32: lane-wise bitwise AND of two int32 vectors.
GI_FORCEINLINE
GI_INT32 GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

//! GiOrInt32: lane-wise bitwise OR of two int32 vectors.
GI_FORCEINLINE
GI_INT32 GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

//! GiAndNotInt32: computes (~VectorNot) & Vector per bit.
GI_FORCEINLINE
GI_INT32 GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    return (~VectorNot) & Vector;
#endif
}

//! GiXorInt32: lane-wise bitwise XOR of two int32 vectors.
GI_FORCEINLINE
GI_INT32 GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}
// vim: syntax=cpp.doxygen
dnn/src/fallback/general_intrinsic/gi_float.h
0 → 100644
浏览文件 @
10f23778
/**
* \file dnn/src/fallback/general_intrinsic/gi_float.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "gi_common.h"
//! GiReinterpretAsInt32: bit-cast a float32 vector to an int32 vector
//! (no value conversion, the raw bits are preserved).
GI_FORCEINLINE
GI_INT32 GiReinterpretAsInt32(GI_FLOAT32 In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return GI_INT32(In);
#endif
}
//! GiRoundAsInt32: round every float lane to the nearest integer
//! (ties rounded away from zero) and return the result as int32 lanes.
GI_FORCEINLINE
GI_INT32 GiRoundAsInt32(GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    return vcvtaq_s32_f32(Vector);
#else
    //! armv7 has no round-to-nearest-away convert: add +-0.5 depending on
    //! the sign, then truncate with vcvtq
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vzero), vfhalf, vfneg_half);
    return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
#endif
#elif defined(GI_SSE2_INTRINSICS)
    //! FIX: the original used _mm_blendv_ps and _mm_round_ps, which are
    //! SSE4.1 instructions and do not compile for a plain SSE2 target, and
    //! it returned _mm_castps_si128 of the rounded FLOAT vector (a bit-cast,
    //! not a conversion). Select the +-0.5 increment with and/andnot/or and
    //! convert with truncation via _mm_cvttps_epi32 (both SSE2).
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 mask = _mm_cmpge_ps(Vector, vfzero);
    __m128 vinc0 =
            _mm_or_ps(_mm_and_ps(mask, vfhalf), _mm_andnot_ps(mask, vfneg_half));
    return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0));
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)round(Vector[i]);
    }
    return ret;
#endif
}
//! GiCastToFloat32: value-convert each int32 lane to float.
GI_FORCEINLINE
GI_FLOAT32 GiCastToFloat32(GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vcvtq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cvtepi32_ps(Vector);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = float(Vector[i]);
    }
    return ret;
#endif
}

//! GiReinterpretAsFloat32: bit-cast an int32 vector to a float32 vector
//! (no value conversion).
GI_FORCEINLINE
GI_FLOAT32 GiReinterpretAsFloat32(GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return GI_FLOAT32(Vector);
#endif
}

//! GiBroadcastFloat32: fill every lane with the scalar Value.
GI_FORCEINLINE
GI_FLOAT32 GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_ps(Value);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! GiBroadcastFloat32: fill every lane with *Value. C++ overload of the
//! scalar version above (overloading means this header requires C++).
GI_FORCEINLINE
GI_FLOAT32 GiBroadcastFloat32(const float* Value) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_load_ps1(Value);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = *Value;
    }
    return ret;
#endif
}

//! GiZeroFloat32: return a vector with all lanes set to 0.0f.
GI_FORCEINLINE
GI_FLOAT32 GiZeroFloat32(void) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_setzero_ps();
#else
    return GiBroadcastFloat32(0.0f);
#endif
}
//! GiLoadFloat32: load GI_SIMD_LEN_BYTE bytes of floats from Buffer
//! (unaligned access is allowed on the NEON/SSE paths).
GI_FORCEINLINE
GI_FLOAT32 GiLoadFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_ps(Buffer);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}

//! GiStoreFloat32: store the whole vector to Buffer (unaligned allowed).
GI_FORCEINLINE
void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_ps(Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

//! GiStoreAlignedFloat32: store the whole vector to Buffer. On the SSE path
//! this uses _mm_store_ps, so Buffer must be 16-byte aligned there; the
//! NEON and fallback paths do not enforce the alignment.
GI_FORCEINLINE
void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_store_ps(Buffer, Vector);
#else
    GiStoreFloat32(Buffer, Vector);
#endif
}
//! GiStoreLane{0..3}Float32(Buffer, Vector): store the single lane i of
//! Vector to *Buffer.
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEFLOAT32(i)                                                    \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
        vst1q_lane_f32(Buffer, Vector, i);                                       \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEFLOAT32(i)                                                    \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
        _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GISTORELANEFLOAT32(i)                                                    \
    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
        *Buffer = Vector[i];                                                     \
    }
#endif

GISTORELANEFLOAT32(0)
GISTORELANEFLOAT32(1)
GISTORELANEFLOAT32(2)
GISTORELANEFLOAT32(3)
#undef GISTORELANEFLOAT32

//! GiExtractLane{0..3}Float32(Vector): return lane i of Vector as a float.
#if defined(GI_NEON_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i)                                          \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) {  \
        return vgetq_lane_f32(Vector, i);                                \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i)                                          \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) {  \
        return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
    }
#else
#define GIEXTRACTLANEFLOAT32(i)                                          \
    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) {  \
        return Vector[i];                                                \
    }
#endif

GIEXTRACTLANEFLOAT32(0)
GIEXTRACTLANEFLOAT32(1)
GIEXTRACTLANEFLOAT32(2)
GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32
//! GiInterleaveLowFloat32: interleave the low halves of two vectors,
//! producing {V1[0], V2[0], V1[1], V2[1]}.
GI_FORCEINLINE
GI_FLOAT32 GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    //! FIX: vzipq_f32 returns float32x4x2_t; the original declared the
    //! result as float32x2_t, which does not compile on armv7.
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[0];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpacklo_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[i];
        ret[2 * i + 1] = Vector2[i];
    }
    return ret;
#endif
}
//! GiInterleaveHighFloat32: interleave the high halves of two vectors,
//! producing {V1[2], V2[2], V1[3], V2[3]}.
GI_FORCEINLINE
GI_FLOAT32 GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    //! FIX: vzipq_f32 returns float32x4x2_t; the original declared the
    //! result as float32x2_t, which does not compile on armv7.
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[1];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpackhi_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        //! FIX: the high-half offset must be in lanes; the original used
        //! Vector1[GI_SIMD_LEN_BYTE / 2 + i] (= lane 8+), which is past
        //! the end of a 4-lane float vector.
        ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
        ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
    }
    return ret;
#endif
}
//! GiAddFloat32: lane-wise addition.
GI_FORCEINLINE
GI_FLOAT32 GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

//! GiSubtractFloat32: lane-wise subtraction (Vector1 - Vector2).
GI_FORCEINLINE
GI_FLOAT32 GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

//! GiMultiplyFloat32: lane-wise multiplication.
GI_FORCEINLINE
GI_FLOAT32 GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Vector2;
#endif
}

//! GiMultiplyScalerFloat32: multiply every lane of Vector1 by Scaler.
GI_FORCEINLINE
GI_FLOAT32 GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Scaler;
#endif
}

//! GiMultiplyAddVecFloat32: VectorSum + Vector1 * Vector2 per lane.
GI_FORCEINLINE
GI_FLOAT32 GiMultiplyAddVecFloat32(
        GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_FMA3_INTRINSICS)
    //! NOTE(review): GI_FMA3_INTRINSICS is not defined anywhere in
    //! gi_common.h, so this branch looks unreachable — confirm.
    return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
#else
    return Vector1 * Vector2 + VectorSum;
#endif
}

//! GiMultiplyAddScalarFloat32: VectorSum + Vector * Scalar per lane.
GI_FORCEINLINE
GI_FLOAT32 GiMultiplyAddScalarFloat32(
        GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_n_f32(VectorSum, Vector, Scalar);
#elif defined(GI_SSE2_INTRINSICS)
    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
#else
    return VectorSum + Vector * Scalar;
#endif
}
//! GiMultiplyAddLan{0..3}Float32(VectorSum, Vector1, Vector2):
//! VectorSum + Vector1 * Vector2[i] per lane, with i a fixed lane index.
#if defined(GI_NEON_INTRINSICS)
//! lanes 0/1 come from the low 64 bits of Vector2
#define GIMULTIPLYADDLANFLOAT32(i)                                          \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
//! lanes 2/3 come from the high 64 bits, so the immediate is i - 2
#define GIMULTIPLYADDLANFLOAT32(i)                                          \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
    }
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i)                                          \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return GiMultiplyAddScalarFloat32(                                  \
                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2));    \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
#define GIMULTIPLYADDLANFLOAT32(i)                                          \
    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32(                 \
            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
        return VectorSum + Vector1 * Vector2[i];                            \
    }
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#endif
//! GiDivideFloat32: lane-wise division Vector1 / Vector2.
GI_FORCEINLINE
GI_FLOAT32 GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    //! armv7 has no vector divide: use the reciprocal estimate with one
    //! Newton-Raphson refinement step, so the result is an approximation
    float32x4_t recp = vrecpeq_f32(Vector2);
    recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
    return vmulq_f32(Vector1, recp);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_div_ps(Vector1, Vector2);
#else
    return Vector1 / Vector2;
#endif
}
//! GiGreaterThanFloat32: lane-wise compare. Each result lane carries an
//! all-ones bit pattern (reinterpreted as float) where Vector1 > Vector2,
//! all-zero bits otherwise — intended for use as a blend mask.
GI_FORCEINLINE
GI_FLOAT32 GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cmpgt_ps(Vector1, Vector2);
#else
    return Vector1 > Vector2;
#endif
}

//! GiAndFloat32: bitwise AND of the raw bits of two float vectors.
GI_FORCEINLINE
GI_FLOAT32 GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_and_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

//! GiOrFloat32: bitwise OR of the raw bits of two float vectors.
GI_FORCEINLINE
GI_FLOAT32 GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_or_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

//! GiAndNotFloat32: (~VectorNot) & Vector on the raw bits.
GI_FORCEINLINE
GI_FLOAT32 GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_ps(VectorNot, Vector);
#else
    return GiReinterpretAsFloat32(GiAndNotInt32(
            GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}

//! GiXorFloat32: bitwise XOR of the raw bits of two float vectors.
GI_FORCEINLINE
GI_FLOAT32 GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_xor_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

//! GiBlendFloat32: per-bit select:
//! (Vector2 & Selection) | (Vector1 & ~Selection).
GI_FORCEINLINE
GI_FLOAT32 GiBlendFloat32(
        GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
    return GiOrFloat32(
            GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}
//! GiMaximumFloat32: lane-wise maximum of two vectors.
GI_FORCEINLINE
GI_FLOAT32 GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
#else
    //! the vector compare produces a per-lane mask used to blend in the
    //! larger lane
    return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
#endif
}

//! GiMinimumFloat32: lane-wise minimum of two vectors.
GI_FORCEINLINE
GI_FLOAT32 GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
#else
    return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
#endif
}

//! GiClampFloat32: clamp every lane of Value into [LowerRange, UpperRange].
GI_FORCEINLINE
GI_FLOAT32 GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
    Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
    Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
    return Value;
}
//! GiReduceAddFloat32: horizontal sum of all lanes, returned as a scalar.
GI_FORCEINLINE
float GiReduceAddFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    Vector = vpaddq_f32(Vector, Vector);
    Vector = vpaddq_f32(Vector, Vector);
    return vgetq_lane_f32(Vector, 0);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    //! NOTE(review): passing VectorHigh again looks odd, but lane 0 still
    //! receives the full 4-lane sum, and only lane 0 is read below.
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    //! fold high half into low half, then lane 1 into lane 0
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 0;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret += Vector[i];
    }
    return ret;
#endif
}

//! GiReduceMultiplyFloat32: horizontal product of all lanes.
GI_FORCEINLINE
float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    float32x2_t low = vget_low_f32(Vector);
    float32x2_t high = vget_high_f32(Vector);
    float32x2_t res = vmul_f32(low, high);
    return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    //! NOTE: on armv7 (no NEON64/SSE branch matches) this per-lane loop is
    //! used; GCC/Clang permit [] indexing on NEON vector types.
    float ret = 1;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret *= Vector[i];
    }
    return ret;
#endif
}
//! scalar helpers for the fallback reduce paths.
//! FIX: fully parenthesize the expansion; the original unparenthesized
//! ternary could bind to neighbouring operators at the use site
//! (e.g. `2 * Max(1, 3)` parsed as `(2 * 1) > 3 ? 1 : 3`).
#define Max(a, b) (((a) > (b)) ? (a) : (b))
#define Min(a, b) (((a) < (b)) ? (a) : (b))
//! GiReduceMaximumFloat32: horizontal maximum of all lanes.
GI_FORCEINLINE
float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    //! NOTE(review): second operand is VectorHigh again, but lane 0 still
    //! holds the maximum over all 4 lanes, which is what is returned.
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_VSX_INTRINSICS)
    //! NOTE(review): GI_VSX_INTRINSICS is never defined in gi_common.h, so
    //! this PowerPC branch appears to be dead code — confirm before use.
    Vector = GiMaximumFloat32(
            Vector, GI_FLOAT32(vec_splat((__vector long long)Vector, 1)));
    Vector = GiMaximumFloat32(Vector, vec_splat(Vector, 1));
    return Vector[0];
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMaximumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMaximumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = Max(ret, Vector[i]);
    }
    return ret;
#endif
}

//! GiReduceMinimumFloat32: horizontal minimum of all lanes.
GI_FORCEINLINE
float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    //! NOTE(review): same pairwise pattern as the maximum above; lane 0
    //! still ends up with the minimum of all 4 lanes.
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMinimumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMinimumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = Min(ret, Vector[i]);
    }
    return ret;
#endif
}
// vim: syntax=cpp.doxygen
dnn/src/fallback/general_intrinsic/gi_int.h
0 → 100644
浏览文件 @
10f23778
/**
 * \file dnn/src/fallback/general_intrinsic/gi_int.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "gi_common.h"
//! GiBroadcastInt32: fill every int32 lane with Value.
GI_FORCEINLINE
GI_INT32 GiBroadcastInt32(int32_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi32(Value);
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! GiBroadcastInt8: fill every int8 lane with Value.
GI_FORCEINLINE
GI_INT8 GiBroadcastInt8(int8_t Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_s8(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_epi8(Value);
#else
    GI_INT8 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

//! GiLoadInt32: unaligned load of GI_SIMD_LEN_BYTE bytes as int32 lanes.
GI_FORCEINLINE
GI_INT32 GiLoadInt32(const int32_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}

//! GiLoadInt8: unaligned load of GI_SIMD_LEN_BYTE bytes as int8 lanes.
GI_FORCEINLINE
GI_INT8 GiLoadInt8(const int8_t* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_s8(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_si128((const __m128i*)Buffer);
#else
    GI_INT8 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}
//! GiStoreInt32: unaligned store of the int32 vector to Buffer.
GI_FORCEINLINE
void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

//! GiStoreInt8: unaligned store of the int8 vector to Buffer.
GI_FORCEINLINE
void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_s8(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_si128((__m128i*)Buffer, Vector);
#else
    //! derive the lane count from GI_SIMD_LEN_BYTE instead of the original
    //! hard-coded 16, consistent with every other fallback loop (the value
    //! is identical for the current 128-bit configuration)
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}
//! GiStoreLowInt8: store the low half (GI_SIMD_LEN_BYTE / 2 bytes) of the
//! int8 vector to Buffer.
GI_FORCEINLINE
void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8(Buffer, vget_low_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storel_epi64((__m128i*)Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

//! GiStoreHihgInt8: store the high half of the int8 vector to Buffer.
//! NOTE(review): "Hihg" is a typo for "High", but the name is part of the
//! public interface of this header, so it is kept unchanged.
GI_FORCEINLINE
void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1_s8(Buffer, vget_high_s8(Vector));
#elif defined(GI_SSE2_INTRINSICS)
    //! move the high 64 bits into the low 64 bits, then store them
    _mm_storel_epi64((__m128i*)Buffer, _mm_unpackhi_epi64(Vector, Vector));
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        Buffer[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
    }
#endif
}
//! GiAddInt32: lane-wise int32 addition (wraps on overflow).
GI_FORCEINLINE
GI_INT32 GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_epi32(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

//! GiSubtractInt32: lane-wise int32 subtraction (Vector1 - Vector2).
GI_FORCEINLINE
GI_INT32 GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_epi32(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}
//! GiMultiplyInt32: lane-wise 32-bit multiply (low 32 bits of each product).
GI_FORCEINLINE
GI_INT32 GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    //! FIX: the original used _mm_mul_epi32, which is SSE4.1-only AND has
    //! the wrong semantics here: it computes 64-bit products of the even
    //! lanes only. _mm_mullo_epi32 is the lane-wise 32-bit multiply.
    return _mm_mullo_epi32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    //! SSE2 emulation: multiply even/odd lanes as u32 x u32 -> u64 and
    //! recombine the low 32 bits (identical to the signed low product)
    __m128i even = _mm_mul_epu32(Vector1, Vector2);
    __m128i odd = _mm_mul_epu32(
            _mm_srli_si128(Vector1, 4), _mm_srli_si128(Vector2, 4));
    return _mm_unpacklo_epi32(
            _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
            _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
#else
    return Vector1 * Vector2;
#endif
}
//! GiAndInt8: lane-wise bitwise AND of two int8 vectors.
GI_FORCEINLINE
GI_INT8 GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

//! GiOrInt8: lane-wise bitwise OR of two int8 vectors.
GI_FORCEINLINE
GI_INT8 GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

//! GiAndNotInt8: computes (~VectorNot) & Vector per bit.
GI_FORCEINLINE
GI_INT8 GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s8(vmvnq_s8(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    return (~VectorNot) & Vector;
#endif
}

//! GiXorInt8: lane-wise bitwise XOR of two int8 vectors.
GI_FORCEINLINE
GI_INT8 GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}

//! GiShiftLeft{0..3}Int32(Vector): shift every int32 lane left by the
//! fixed immediate i (only 0..3 are instantiated).
#if defined(GI_NEON_INTRINSICS)
#define GISHIFTLEFTINT32(i)                                        \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return vshlq_n_s32(Vector, i);                             \
    }
#elif defined(GI_SSE2_INTRINSICS)
#define GISHIFTLEFTINT32(i)                                        \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return _mm_slli_epi32(Vector, i);                          \
    }
#else
#define GISHIFTLEFTINT32(i)                                        \
    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
        return Vector << i;                                        \
    }
#endif

GISHIFTLEFTINT32(0)
GISHIFTLEFTINT32(1)
GISHIFTLEFTINT32(2)
GISHIFTLEFTINT32(3)
#undef GISHIFTLEFTINT32
//! GiBlendInt32: per-bit select: (Vector2 & Selection) | (Vector1 & ~Selection).
GI_FORCEINLINE
GI_INT32 GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) {
    return GiOrInt32(
            GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
}

//! GiBlendInt8: per-bit select for int8 vectors.
GI_FORCEINLINE
GI_INT8 GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}

//! GiMaximumInt32: lane-wise signed maximum.
GI_FORCEINLINE
GI_INT32 GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_max_epi32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    //! SSE2 has no epi32 max: blend with the compare mask instead
    return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2));
#else
    return GiBlendInt32(Vector2, Vector1, Vector1 > Vector2);
#endif
}

//! GiMinimumInt32: lane-wise signed minimum.
GI_FORCEINLINE
GI_INT32 GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s32(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_min_epi32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt32(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1));
#else
    return GiBlendInt32(Vector2, Vector1, Vector2 > Vector1);
#endif
}

//! GiBlendInt8x16: per-bit select for int8 vectors.
//! NOTE(review): identical to GiBlendInt8 above — looks like a duplicate
//! kept for naming symmetry; confirm before removing either.
GI_FORCEINLINE
GI_INT8 GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
    return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
}
//! GiMaximumInt8: lane-wise signed maximum of two int8 vectors.
GI_FORCEINLINE
GI_INT8 GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_max_epi8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    //! SSE2 has no signed epi8 max: blend with the compare mask instead
    return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector1, Vector2));
#else
    return GiBlendInt8(Vector2, Vector1, Vector1 > Vector2);
#endif
}

//! GiMinimumInt8: lane-wise signed minimum of two int8 vectors.
GI_FORCEINLINE
GI_INT8 GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_s8(Vector1, Vector2);
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_min_epi8(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return GiBlendInt8(Vector2, Vector1, _mm_cmpgt_epi8(Vector2, Vector1));
#else
    return GiBlendInt8(Vector2, Vector1, Vector2 > Vector1);
#endif
}
//! GiMoveHighLongInt8: sign-extend the high 8 int8 lanes to 8 int16 lanes.
GI_FORCEINLINE
GI_INT16 GiMoveHighLongInt8(GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_high_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
#elif defined(GI_SSE2_INTRINSICS)
    int16_t data[8];
    int8_t o_data[16];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 8; i++) {
        data[i] = o_data[8 + i];
    }
    //! FIX: the original returned _mm_loadu_si16(data), which loads only
    //! two bytes (and is not an SSE2 instruction); load the full 16-byte
    //! widened result instead.
    return _mm_loadu_si128((const __m128i*)data);
#else
    GI_INT16 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
    }
    return ret;
#endif
}
//! GiMoveLowLongInt8: sign-extend the low 8 int8 lanes to 8 int16 lanes.
GI_FORCEINLINE
GI_INT16 GiMoveLowLongInt8(GI_INT8 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s8(vget_low_s8(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi8_epi16(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    int16_t data[8];
    int8_t o_data[16];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 8; i++) {
        data[i] = o_data[i];
    }
    //! FIX: the original returned _mm_loadu_si16(data), which loads only
    //! two bytes (and is not an SSE2 instruction); load the full 16-byte
    //! widened result instead.
    return _mm_loadu_si128((const __m128i*)data);
#else
    GI_INT16 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); i++) {
        ret[i] = Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT32 GiMoveHighLongInt16(GI_INT16 Vector) {
    //! sign-extend the high 4 int16 lanes of Vector into 4 int32 lanes.
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_high_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi16_epi32(_mm_unpackhi_epi64(Vector, Vector));
#elif defined(GI_SSE2_INTRINSICS)
    int32_t data[4];
    int16_t o_data[8];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 4; i++) {
        data[i] = o_data[4 + i];
    }
    //! fix: load the whole 128-bit result; _mm_loadu_si32 only loads 32 bits
    return _mm_loadu_si128((const __m128i*)data);
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
        ret[i] = Vector[GI_SIMD_LEN_BYTE / 2 + i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT32 GiMoveLowLongInt16(GI_INT16 Vector) {
    //! sign-extend the low 4 int16 lanes of Vector into 4 int32 lanes.
#if defined(GI_NEON_INTRINSICS)
    return vmovl_s16(vget_low_s16(Vector));
#elif defined(GI_SSE42_INTRINSICS)
    return _mm_cvtepi16_epi32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    int32_t data[4];
    int16_t o_data[8];
    _mm_storeu_si128((__m128i*)o_data, Vector);
    for (int i = 0; i < 4; i++) {
        data[i] = o_data[i];
    }
    //! fix: load the whole 128-bit result; _mm_loadu_si32 only loads 32 bits
    return _mm_loadu_si128((const __m128i*)data);
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); i++) {
        ret[i] = Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
int16_t GiReduceAddInt8(GI_INT8 Vector) {
    //! sum all int8 lanes of Vector; the result fits in int16
    //! (|sum| <= 16 * 128).
#if defined(GI_NEON64_INTRINSICS)
    return vaddlvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    //! fix: vpaddlq_s16 returns int32x4_t, not a scalar int32_t
    int32x4_t sum = vpaddlq_s16(vpaddlq_s8(Vector));
    return (int16_t)(
            vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) +
            vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3));
#elif defined(GI_SSE42_INTRINSICS)
    //! widen 16 x s8 -> 2 x (8 x s16), add, widen to s32, add, then
    //! accumulate the four s32 lanes
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i sum_int16 = _mm_add_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(sum_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(sum_int16, sum_int16));
    __m128i sum = _mm_add_epi32(v0_int32, v1_int32);
    //! fix: accumulate in an integer, not a float
    int32_t ret = _mm_extract_epi32(sum, 0);
    ret += _mm_extract_epi32(sum, 1);
    ret += _mm_extract_epi32(sum, 2);
    ret += _mm_extract_epi32(sum, 3);
    return (int16_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
    //! convert each group of 4 int8 lanes to float and add; exact because
    //! every partial sum is far below 2^24
    __m64 low = GiGetLowInt8x16(Vector);
    __m64 high = GiGetHighInt8x16(Vector);
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 sum0 = _mm_add_ps(v0, v1);
    __m128 sum1 = _mm_add_ps(v2, v3);
    __m128 sum = _mm_add_ps(sum0, sum1);
    float ret0 = _mm_cvtss_f32(sum);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int16_t)(ret0 + ret1 + ret2 + ret3);
#else
    int32_t sum = 0;
    //! fix: iterate over all int8 lanes; the bound previously divided by
    //! sizeof(int32_t) and summed only 4 of the 16 lanes
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        sum += Vector[i];
    }
    return (int16_t)sum;
#endif
}
//! NOTE: arguments are evaluated more than once; do not pass expressions
//! with side effects. The whole expansion is parenthesized so the macros
//! compose safely inside larger expressions (e.g. `2 * Max(a, b)`).
#define Max(a, b) ((a) > (b) ? (a) : (b))
#define Min(a, b) ((a) < (b) ? (a) : (b))
GI_FORCEINLINE
int8_t GiReduceMaxInt8(GI_INT8 Vector) {
    //! horizontal maximum over all int8 lanes of Vector.
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    //! fix: a MAX reduction must use vpmax (the code used vpmin), and the
    //! 8 remaining lanes must be folded pairwise down to a single lane
    int8x8_t VectorLow = vget_low_s8(Vector);
    int8x8_t VectorHigh = vget_high_s8(Vector);
    int8x8_t m = vpmax_s8(VectorLow, VectorHigh);
    m = vpmax_s8(m, m);
    m = vpmax_s8(m, m);
    m = vpmax_s8(m, m);
    return vget_lane_s8(m, 0);
#elif defined(GI_SSE42_INTRINSICS)
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i max_int16 = _mm_max_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(max_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(max_int16, max_int16));
    __m128i max_v = _mm_max_epi32(v0_int32, v1_int32);
    int ret = _mm_extract_epi32(max_v, 0);
    ret = Max(_mm_extract_epi32(max_v, 1), ret);
    ret = Max(_mm_extract_epi32(max_v, 2), ret);
    ret = Max(_mm_extract_epi32(max_v, 3), ret);
    return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
    //! fix: combine the lane groups with _mm_max_ps; the previous code
    //! accidentally summed them with _mm_add_ps
    __m64 low = GiGetLowInt8x16(Vector);
    __m64 high = GiGetHighInt8x16(Vector);
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 max0 = _mm_max_ps(v0, v1);
    __m128 max1 = _mm_max_ps(v2, v3);
    __m128 max_v = _mm_max_ps(max0, max1);
    float ret0 = _mm_cvtss_f32(max_v);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(max_v, max_v, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(max_v, max_v, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(max_v, max_v, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int8_t)(Max(Max(ret0, ret1), Max(ret2, ret3)));
#else
    int8_t max = Vector[0];
    //! fix: iterate all int8 lanes (the bound previously used sizeof(int32_t))
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        max = Max(max, Vector[i]);
    }
    return max;
#endif
}
GI_FORCEINLINE
int8_t GiReduceMinInt8(GI_INT8 Vector) {
    //! horizontal minimum over all int8 lanes of Vector.
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_s8(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    //! fix: fold the 8 lanes left after the first vpmin pairwise down to a
    //! single lane; the code previously re-combined with VectorHigh
    int8x8_t VectorLow = vget_low_s8(Vector);
    int8x8_t VectorHigh = vget_high_s8(Vector);
    int8x8_t m = vpmin_s8(VectorLow, VectorHigh);
    m = vpmin_s8(m, m);
    m = vpmin_s8(m, m);
    m = vpmin_s8(m, m);
    return vget_lane_s8(m, 0);
#elif defined(GI_SSE42_INTRINSICS)
    __m128i v0 = _mm_cvtepi8_epi16(Vector);
    __m128i v1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(Vector, Vector));
    __m128i min_int16 = _mm_min_epi16(v0, v1);
    __m128i v0_int32 = _mm_cvtepi16_epi32(min_int16);
    __m128i v1_int32 = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(min_int16, min_int16));
    __m128i min_v = _mm_min_epi32(v0_int32, v1_int32);
    int ret = _mm_extract_epi32(min_v, 0);
    ret = Min(_mm_extract_epi32(min_v, 1), ret);
    ret = Min(_mm_extract_epi32(min_v, 2), ret);
    ret = Min(_mm_extract_epi32(min_v, 3), ret);
    return (int8_t)ret;
#elif defined(GI_SSE2_INTRINSICS)
    //! fix: combine the lane groups with _mm_min_ps; the previous code
    //! accidentally summed them with _mm_add_ps
    __m64 low = GiGetLowInt8x16(Vector);
    __m64 high = GiGetHighInt8x16(Vector);
    __m128 v0 = _mm_cvtpi8_ps(low);
    __m128 v1 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(low, low));
    __m128 v2 = _mm_cvtpi8_ps(high);
    __m128 v3 = _mm_cvtpi8_ps(_mm_unpackhi_pi32(high, high));
    __m128 min0 = _mm_min_ps(v0, v1);
    __m128 min1 = _mm_min_ps(v2, v3);
    __m128 min_v = _mm_min_ps(min0, min1);
    float ret0 = _mm_cvtss_f32(min_v);
    float ret1 = _mm_cvtss_f32(_mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(1, 1, 1, 1)));
    float ret2 = _mm_cvtss_f32(_mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(2, 2, 2, 2)));
    float ret3 = _mm_cvtss_f32(_mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 3, 3)));
    return (int8_t)(Min(Min(ret0, ret1), Min(ret2, ret3)));
#else
    int8_t min = Vector[0];
    //! fix: iterate all int8 lanes (the bound previously used sizeof(int32_t))
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
        min = Min(min, Vector[i]);
    }
    return min;
#endif
}
//! clamp x into [lower, upper]. NOTE: x is evaluated more than once; do not
//! pass expressions with side effects. The expansion is fully parenthesized
//! so it composes safely inside larger expressions.
#define Saturate(x, lower, upper) \
    ((x) > (upper) ? (upper) : ((x) >= (lower) ? (x) : (lower)))
//! convert to the int8 type: the lower bits are filled with the real data
//! and the high bits repeat the lower bits
GI_FORCEINLINE
GI_INT8 GiCvtFromFloat32ToInt8(GI_FLOAT32 src) {
    //! round each float lane to nearest (ties away from zero), saturate to
    //! [-128, 127], and replicate the 4 resulting int8 values across the
    //! whole 16-byte vector.
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(src);
    int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
    //! fix: mid_s16 is already int16x8_t, narrow it directly with vqmovn_s16
    //! and combine the int8x8_t halves with vcombine_s8 (not vcombine_s16)
    int8x8_t ret = vqmovn_s16(mid_s16);
    return vcombine_s8(ret, ret);
#else
    //! armv7 has no round-to-nearest convert: add +/-0.5 then truncate
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(src, vzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(src, vinc0));
    int16x8_t mid_s16 = vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres0));
    //! fix: same narrowing/type fix as the armv8 branch
    int8x8_t ret = vqmovn_s16(mid_s16);
    return vcombine_s8(ret, ret);
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 vfmin_int8 = _mm_set1_ps(-128.f);
    __m128 vfmax_int8 = _mm_set1_ps(127.f);
    //! emulate round-half-away-from-zero: add +/-0.5 then truncate
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(src, vfzero));
    __m128 vres0 = _mm_add_ps(src, vinc0);
    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    //! packing the same register twice replicates the 4 values
    __m128i vepi16 = _mm_packs_epi32(vepi32_0, vepi32_0);
    __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16);
    return vepi8;
#else
    GI_INT8 ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < length; i++) {
        int8_t data = Saturate(round(src[i]), -128, 127);
        ret[i] = data;
        ret[length + i] = data;
        ret[2 * length + i] = data;
        ret[3 * length + i] = data;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT8 GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) {
    //! round 8 float lanes (two GI_FLOAT32) to nearest (ties away from
    //! zero), saturate to [-128, 127], and replicate the 8 int8 results
    //! into both halves of the 16-byte vector.
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
    int32x4_t vres1 = vcvtaq_s32_f32(vsrc.val[1]);
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    return vcombine_s8(mid1, mid1);
#else
    //! armv7 has no round-to-nearest convert: add +/-0.5 then truncate
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vzero), vfhalf, vfneg_half);
    float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0));
    int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1));
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    return vcombine_s8(mid1, mid1);
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 vfmin_int8 = _mm_set1_ps(-128.f);
    __m128 vfmax_int8 = _mm_set1_ps(127.f);
    //! emulate round-half-away-from-zero: add +/-0.5 then truncate
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero));
    __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero));
    __m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0);
    __m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1);
    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
    vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8);
    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    __m128i vepi32_1 = _mm_cvtps_epi32(vres1);
    __m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1);
    //! packing the same register twice replicates the 8 values
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0);
    return vepi8;
#else
    GI_INT8 ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 2 * length; i++) {
        int8_t data = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
        ret[i] = data;
        //! fix: also fill the high half so it repeats the low half, matching
        //! the SIMD paths; the high half was previously left uninitialized
        ret[2 * length + i] = data;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_INT8 GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) {
    //! round 16 float lanes (four GI_FLOAT32) to nearest (ties away from
    //! zero), saturate to [-128, 127], and pack into 16 int8 lanes.
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]);
    //! fix: convert val[1]/val[2]/val[3]; all three previously read val[1]
    int32x4_t vres1 = vcvtaq_s32_f32(vsrc.val[1]);
    int32x4_t vres2 = vcvtaq_s32_f32(vsrc.val[2]);
    int32x4_t vres3 = vcvtaq_s32_f32(vsrc.val[3]);
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    int8x8_t mid2 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres2), vqmovn_s32(vres3)));
    return vcombine_s8(mid1, mid2);
#else
    //! armv7 has no round-to-nearest convert: add +/-0.5 then truncate
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(vsrc.val[0], vzero), vfhalf, vfneg_half);
    float32x4_t vinc1 = vbslq_f32(vcgeq_f32(vsrc.val[1], vzero), vfhalf, vfneg_half);
    float32x4_t vinc2 = vbslq_f32(vcgeq_f32(vsrc.val[2], vzero), vfhalf, vfneg_half);
    float32x4_t vinc3 = vbslq_f32(vcgeq_f32(vsrc.val[3], vzero), vfhalf, vfneg_half);
    int32x4_t vres0 = vcvtq_s32_f32(vaddq_f32(vsrc.val[0], vinc0));
    int32x4_t vres1 = vcvtq_s32_f32(vaddq_f32(vsrc.val[1], vinc1));
    int32x4_t vres2 = vcvtq_s32_f32(vaddq_f32(vsrc.val[2], vinc2));
    int32x4_t vres3 = vcvtq_s32_f32(vaddq_f32(vsrc.val[3], vinc3));
    int8x8_t mid1 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres0), vqmovn_s32(vres1)));
    int8x8_t mid2 = vqmovn_s16(vcombine_s16(vqmovn_s32(vres2), vqmovn_s32(vres3)));
    return vcombine_s8(mid1, mid2);
#endif
#elif defined(GI_SSE42_INTRINSICS)
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 vfmin_int8 = _mm_set1_ps(-128.f);
    __m128 vfmax_int8 = _mm_set1_ps(127.f);
    //! emulate round-half-away-from-zero: add +/-0.5 then truncate
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[0], vfzero));
    __m128 vinc1 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[1], vfzero));
    __m128 vinc2 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[2], vfzero));
    __m128 vinc3 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(vsrc.val[3], vfzero));
    __m128 vres0 = _mm_add_ps(vsrc.val[0], vinc0);
    __m128 vres1 = _mm_add_ps(vsrc.val[1], vinc1);
    __m128 vres2 = _mm_add_ps(vsrc.val[2], vinc2);
    __m128 vres3 = _mm_add_ps(vsrc.val[3], vinc3);
    vres0 = _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres1 = _mm_round_ps(vres1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres2 = _mm_round_ps(vres2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    //! fix: round vres3; the code previously rounded vres1 into vres3
    vres3 = _mm_round_ps(vres3, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    vres0 = _mm_min_ps(_mm_max_ps(vres0, vfmin_int8), vfmax_int8);
    vres1 = _mm_min_ps(_mm_max_ps(vres1, vfmin_int8), vfmax_int8);
    vres2 = _mm_min_ps(_mm_max_ps(vres2, vfmin_int8), vfmax_int8);
    vres3 = _mm_min_ps(_mm_max_ps(vres3, vfmin_int8), vfmax_int8);
    __m128i vepi32_0 = _mm_cvtps_epi32(vres0);
    __m128i vepi32_1 = _mm_cvtps_epi32(vres1);
    __m128i vepi32_2 = _mm_cvtps_epi32(vres2);
    __m128i vepi32_3 = _mm_cvtps_epi32(vres3);
    __m128i vepi16_0 = _mm_packs_epi32(vepi32_0, vepi32_1);
    __m128i vepi16_1 = _mm_packs_epi32(vepi32_2, vepi32_3);
    __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1);
    return vepi8;
#else
    GI_INT8 ret;
    int length = GI_SIMD_LEN_BYTE / sizeof(float);
    for (int i = 0; i < 4 * length; i++) {
        ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127);
    }
    return ret;
#endif
}
// vim: syntax=cpp.doxygen
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录