MegEngine
Commit 10f23778
Authored Feb 17, 2022 by Megvii Engine Team
feat(fallback): add simd general intrinsic
GitOrigin-RevId: ad78ba689fe5a1a142964bc508a4da625705cdb8
Parent: 286051ed
Showing 3 changed files with 1,515 additions and 0 deletions
dnn/src/fallback/general_intrinsic/gi_common.h    +186  -0
dnn/src/fallback/general_intrinsic/gi_float.h     +596  -0
dnn/src/fallback/general_intrinsic/gi_int.h       +733  -0
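The commit message describes a "general intrinsic" (GI) layer: one set of vector types and functions that compiles to NEON on ARM, SSE on x86, or plain C vector code as a fallback. What follows is not part of the commit, only a minimal sketch of how such a layer might be consumed; the kernel name add_arrays and the scalar-tail handling are illustrative assumptions, while the GI_* names all come from the diff below.

// Hypothetical usage sketch (not in the commit): portable element-wise add.
#include <stddef.h>
#include "dnn/src/fallback/general_intrinsic/gi_float.h"

void add_arrays(const float* a, const float* b, float* out, size_t n) {
    // Lane count follows whichever backend gi_common.h selected at build time.
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(float);
    size_t i = 0;
    for (; i + step <= n; i += step) {
        GI_FLOAT32 va = GiLoadFloat32(a + i);
        GI_FLOAT32 vb = GiLoadFloat32(b + i);
        GiStoreFloat32(out + i, GiAddFloat32(va, vb));
    }
    for (; i < n; ++i) {
        out[i] = a[i] + b[i];  // scalar tail
    }
}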
dnn/src/fallback/general_intrinsic/gi_common.h
0 → 100644
/**
* \file dnn/src/fallback/general_intrinsic/gi_common.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "math.h"
#include "stdint.h"
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#else
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#define GI_TARGET_ARM
#endif
#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>
#include <immintrin.h>
#define GI_TARGET_X86
#endif
#endif
#ifdef _WIN32
//! GI stand for general intrinsic
#define GI_DECLSPEC_ALIGN(variable, alignment) DECLSPEC_ALIGN(alignment) variable
#else
#define GI_DECLSPEC_ALIGN(variable, alignment) \
variable __attribute__((aligned(alignment)))
#endif
#if defined(_MSC_VER)
#define GI_FORCEINLINE __forceinline
#else
#define GI_FORCEINLINE __attribute__((always_inline)) inline
#endif
#if defined(_MSC_VER)
#define GI_INTERNAL_DATA extern "C"
#else
#define GI_INTERNAL_DATA extern "C" __attribute((visibility("hidden")))
#endif
#if defined(GI_TARGET_ARM)
#define GI_NEON_INTRINSICS
#if defined(__aarch64__)
#define GI_NEON64_INTRINSICS
#else
#define GI_NEON32_INTRINSICS
#endif
#elif defined(GI_TARGET_X86)
//#if defined(__FMA__)
//#define GI_FMA_INTRINSICS
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX2__)
//#define GI_AVX2_INTRINSICS
//#define GI_AVX_INTRINSICS
//#elif defined(__AVX__)
//#define GI_AVX_INTRINSICS
#if defined(__SSE4_2__)
#define GI_SSE42_INTRINSICS
#define GI_SSE2_INTRINSICS
#elif defined(__SSE2__)
#define GI_SSE2_INTRINSICS
#endif
#endif
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
defined(GI_FMA_INTRINSICS)
typedef __m256 GI_FLOAT32;
typedef __m256i GI_UINT8;
typedef __m256i GI_INT8;
typedef __m256i GI_INT16;
typedef __m256i GI_INT32;
#elif defined(GI_NEON_INTRINSICS)
typedef float32x4_t GI_FLOAT32;
typedef uint8x16_t GI_UINT8;
typedef int8x16_t GI_INT8;
typedef int16x8_t GI_INT16;
typedef int32x4_t GI_INT32;
#elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
typedef __m128 GI_FLOAT32;
typedef __m128i GI_UINT8;
typedef __m128i GI_INT8;
typedef __m128i GI_INT16;
typedef __m128i GI_INT32;
#else
typedef float GI_FLOAT32 __attribute__((vector_size(16)));
typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
typedef int8_t GI_INT8 __attribute__((vector_size(16)));
typedef int16_t GI_INT16 __attribute__((vector_size(16)));
typedef int32_t GI_INT32 __attribute__((vector_size(16)));
#endif
//! general intrinsic supports dynamic-length SIMD: with AVX or AVX2 the SIMD
//! length is 256 bits
#if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
        defined(GI_FMA_INTRINSICS)
#define GI_SIMD_LEN 256
#define GI_SIMD_LEN_BYTE 32
#elif defined(GI_NEON_INTRINSICS) || defined(GI_SSE2_INTRINSICS) || \
        defined(GI_SSE42_INTRINSICS)
//! with NEON or SSE the SIMD length is 128 bits
#define GI_SIMD_LEN 128
#define GI_SIMD_LEN_BYTE 16
#else
//! if there is no SIMD hardware support, the API is implemented in plain C;
//! the length defaults to 128 bits
#define GI_SIMD_LEN 128
#define GI_SIMD_LEN_BYTE 16
#endif
typedef struct {
    GI_INT32 val[2];
} GI_INT32_V2;

typedef struct {
    GI_INT32 val[4];
} GI_INT32_V4;

typedef struct {
    GI_FLOAT32 val[2];
} GI_FLOAT32_V2;

typedef struct {
    GI_FLOAT32 val[4];
} GI_FLOAT32_V4;
GI_FORCEINLINE
GI_INT32 GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_and_si128(Vector1, Vector2);
#else
    return Vector1 & Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32 GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vorrq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_or_si128(Vector1, Vector2);
#else
    return Vector1 | Vector2;
#endif
}

GI_FORCEINLINE
GI_INT32 GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_si128(VectorNot, Vector);
#else
    return (~VectorNot) & Vector;
#endif
}

GI_FORCEINLINE
GI_INT32 GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return veorq_s32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_xor_si128(Vector1, Vector2);
#else
    return Vector1 ^ Vector2;
#endif
}
// vim: syntax=cpp.doxygen
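Not part of the commit: the four bitwise helpers above exist mainly to build mask-select patterns. A minimal sketch follows, assuming a hypothetical helper name GiSelectInt32; gi_float.h below builds its GiBlendFloat32 from exactly this And/AndNot/Or combination.

// Hypothetical sketch: per-bit select from the helpers defined above.
GI_FORCEINLINE
GI_INT32 GiSelectInt32(GI_INT32 Mask, GI_INT32 IfSet, GI_INT32 IfClear) {
    // Bits set in Mask come from IfSet; cleared bits come from IfClear.
    return GiOrInt32(GiAndInt32(IfSet, Mask), GiAndNotInt32(Mask, IfClear));
}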
dnn/src/fallback/general_intrinsic/gi_float.h
0 → 100644
/**
* \file dnn/src/fallback/general_intrinsic/gi_float.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2022 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "gi_common.h"
GI_FORCEINLINE
GI_INT32 GiReinterpretAsInt32(GI_FLOAT32 In) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_s32_f32(In);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castps_si128(In);
#else
    return GI_INT32(In);
#endif
}
GI_FORCEINLINE
GI_INT32 GiRoundAsInt32(GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
#if __ARM_ARCH >= 8
    return vcvtaq_s32_f32(Vector);
#else
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vfhalf = vdupq_n_f32(0.5f);
    float32x4_t vfneg_half = vdupq_n_f32(-0.5f);
    float32x4_t vinc0 = vbslq_f32(vcgeq_f32(Vector, vzero), vfhalf, vfneg_half);
    return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
#endif
#elif defined(GI_SSE2_INTRINSICS)
    __m128 vfzero = _mm_set1_ps(0.f);
    __m128 vfhalf = _mm_set1_ps(0.5f);
    __m128 vfneg_half = _mm_set1_ps(-0.5f);
    __m128 vinc0 = _mm_blendv_ps(vfneg_half, vfhalf, _mm_cmpge_ps(Vector, vfzero));
    __m128 vres0 = _mm_add_ps(Vector, vinc0);
    return _mm_castps_si128(
            _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
#else
    GI_INT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = (int32_t)round(Vector[i]);
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiCastToFloat32(GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vcvtq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cvtepi32_ps(Vector);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
        ret[i] = float(Vector[i]);
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiReinterpretAsFloat32(GI_INT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_s32(Vector);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_castsi128_ps(Vector);
#else
    return GI_FLOAT32(Vector);
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiBroadcastFloat32(float Value) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_set1_ps(Value);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Value;
    }
    return ret;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiBroadcastFloat32(const float* Value) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_dup_f32(Value);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_load_ps1(Value);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = *Value;
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiZeroFloat32(void) {
#if defined(GI_NEON_INTRINSICS)
    return vdupq_n_f32(0.0f);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_setzero_ps();
#else
    return GiBroadcastFloat32(0.0f);
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiLoadFloat32(const float* Buffer) {
#if defined(GI_NEON_INTRINSICS)
    return vld1q_f32(Buffer);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_loadu_ps(Buffer);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret[i] = Buffer[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_storeu_ps(Buffer, Vector);
#else
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        Buffer[i] = Vector[i];
    }
#endif
}

GI_FORCEINLINE
void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
#if defined(GI_NEON_INTRINSICS)
    vst1q_f32(Buffer, Vector);
#elif defined(GI_SSE2_INTRINSICS)
    _mm_store_ps(Buffer, Vector);
#else
    GiStoreFloat32(Buffer, Vector);
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
vst1q_lane_f32(Buffer, Vector, i); \
}
#elif defined(GI_SSE2_INTRINSICS)
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
_mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
}
#else
#define GISTORELANEFLOAT32(i) \
GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
*Buffer = Vector[i]; \
}
#endif
GISTORELANEFLOAT32(0)
GISTORELANEFLOAT32(1)
GISTORELANEFLOAT32(2)
GISTORELANEFLOAT32(3)
#undef GISTORELANEFLOAT32
#if defined(GI_NEON_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
return vgetq_lane_f32(Vector, i); \
}
#elif defined(GI_SSE2_INTRINSICS)
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
}
#else
#define GIEXTRACTLANEFLOAT32(i) \
GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
return Vector[i]; \
}
#endif
GIEXTRACTLANEFLOAT32(0)
GIEXTRACTLANEFLOAT32(1)
GIEXTRACTLANEFLOAT32(2)
GIEXTRACTLANEFLOAT32(3)
#undef GIEXTRACTLANEFLOAT32
GI_FORCEINLINE
GI_FLOAT32 GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip1q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[0];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpacklo_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[i];
        ret[2 * i + 1] = Vector2[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vzip2q_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
    return zipped.val[1];
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_unpackhi_ps(Vector1, Vector2);
#else
    GI_FLOAT32 ret;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
        ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
        ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vaddq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(Vector1, Vector2);
#else
    return Vector1 + Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vsubq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_sub_ps(Vector1, Vector2);
#else
    return Vector1 - Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Vector2;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
#if defined(GI_NEON_INTRINSICS)
    return vmulq_n_f32(Vector1, Scaler);
#elif defined(GI_SSE2_INTRINSICS)
    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
    return _mm_mul_ps(Vector1, Vector2);
#else
    return Vector1 * Scaler;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiMultiplyAddVecFloat32(
        GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_f32(VectorSum, Vector1, Vector2);
#elif defined(GI_FMA3_INTRINSICS)
    return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
#else
    return Vector1 * Vector2 + VectorSum;
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiMultiplyAddScalarFloat32(
        GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
#if defined(GI_NEON_INTRINSICS)
    return vmlaq_n_f32(VectorSum, Vector, Scalar);
#elif defined(GI_SSE2_INTRINSICS)
    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
#else
    return VectorSum + Vector * Scalar;
#endif
}
#if defined(GI_NEON_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
#undef GIMULTIPLYADDLANFLOAT32
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
}
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#elif defined(GI_SSE2_INTRINSICS)
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return GiMultiplyAddScalarFloat32( \
VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#else
#define GIMULTIPLYADDLANFLOAT32(i) \
GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
return VectorSum + Vector1 * Vector2[i]; \
}
GIMULTIPLYADDLANFLOAT32(0)
GIMULTIPLYADDLANFLOAT32(1)
GIMULTIPLYADDLANFLOAT32(2)
GIMULTIPLYADDLANFLOAT32(3)
#undef GIMULTIPLYADDLANFLOAT32
#endif
GI_FORCEINLINE
GI_FLOAT32 GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON64_INTRINSICS)
    return vdivq_f32(Vector1, Vector2);
#elif defined(GI_NEON32_INTRINSICS)
    float32x4_t recp = vrecpeq_f32(Vector2);
    recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
    return vmulq_f32(Vector1, recp);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_div_ps(Vector1, Vector2);
#else
    return Vector1 / Vector2;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_cmpgt_ps(Vector1, Vector2);
#else
    return Vector1 > Vector2;
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_and_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_or_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_andnot_ps(VectorNot, Vector);
#else
    return GiReinterpretAsFloat32(GiAndNotInt32(
            GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_SSE2_INTRINSICS)
    return _mm_xor_ps(Vector1, Vector2);
#else
    return GiReinterpretAsFloat32(
            GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
#endif
}
GI_FORCEINLINE
GI_FLOAT32 GiBlendFloat32(
        GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
    return GiOrFloat32(
            GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
}

GI_FORCEINLINE
GI_FLOAT32 GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vmaxq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_max_ps(Vector1, Vector2);
#else
    return GiBlendFloat32(Vector2, Vector1, Vector1 > Vector2);
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
#if defined(GI_NEON_INTRINSICS)
    return vminq_f32(Vector1, Vector2);
#elif defined(GI_SSE2_INTRINSICS)
    return _mm_min_ps(Vector1, Vector2);
#else
    return GiBlendFloat32(Vector2, Vector1, Vector2 > Vector1);
#endif
}

GI_FORCEINLINE
GI_FLOAT32 GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
    Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
    Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
    return Value;
}
GI_FORCEINLINE
float GiReduceAddFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    Vector = vpaddq_f32(Vector, Vector);
    Vector = vpaddq_f32(Vector, Vector);
    return vgetq_lane_f32(Vector, 0);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiAddFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 0;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret += Vector[i];
    }
    return ret;
#endif
}
GI_FORCEINLINE
float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    float32x2_t low = vget_low_f32(Vector);
    float32x2_t high = vget_high_f32(Vector);
    float32x2_t res = vmul_f32(low, high);
    return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMultiplyFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = 1;
    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret *= Vector[i];
    }
    return ret;
#endif
}
#define Max(a, b) (a) > (b) ? (a) : (b)
#define Min(a, b) (a) < (b) ? (a) : (b)
GI_FORCEINLINE
float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vmaxvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_VSX_INTRINSICS)
    Vector = GiMaximumFloat32(
            Vector, GI_FLOAT32(vec_splat((__vector long long)Vector, 1)));
    Vector = GiMaximumFloat32(Vector, vec_splat(Vector, 1));
    return Vector[0];
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMaximumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMaximumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = Max(ret, Vector[i]);
    }
    return ret;
#endif
}
GI_FORCEINLINE
float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
#if defined(GI_NEON64_INTRINSICS)
    return vminvq_f32(Vector);
#elif defined(GI_NEON32_INTRINSICS)
    float32x2_t VectorLow = vget_low_f32(Vector);
    float32x2_t VectorHigh = vget_high_f32(Vector);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    VectorLow = vpmin_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
#elif defined(GI_SSE2_INTRINSICS)
    Vector = GiMinimumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
    Vector = GiMinimumFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
    return GiExtractLane0Float32(Vector);
#else
    float ret = Vector[0];
    for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
        ret = Min(ret, Vector[i]);
    }
    return ret;
#endif
}
// vim: syntax=cpp.doxygen
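Not part of the commit: a dot-product sketch built only from functions defined in gi_float.h above, assuming for brevity that n is a multiple of the vector width. It shows how the multiply-add and reduce helpers compose into a complete kernel.

// Hypothetical sketch: reduction built on the GI float API.
#include <stddef.h>

static inline float dot_product(const float* a, const float* b, size_t n) {
    const size_t step = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32 acc = GiZeroFloat32();
    for (size_t i = 0; i < n; i += step) {
        // acc += a[i..] * b[i..]; uses FMA where the backend provides it.
        acc = GiMultiplyAddVecFloat32(acc, GiLoadFloat32(a + i), GiLoadFloat32(b + i));
    }
    return GiReduceAddFloat32(acc);  // horizontal sum across lanes
}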
dnn/src/fallback/general_intrinsic/gi_int.h
0 → 100644
This diff is collapsed (733 additions, not shown in this view).