PaddlePaddle / Paddle

Commit ff7d2464 (unverified)
Authored by wangzhen38 on Jun 21, 2022; committed via GitHub on Jun 21, 2022.

cpplint fix 3 (#43679)

* cpplint fix 3
* cpplint fix 3
* cpplint fix 3
* cpplint fix 3

Parent: 7307e955
Showing 1 changed file with 160 additions and 125 deletions.

paddle/phi/kernels/funcs/detail/avx_mathfun.h  (+160, -125)
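Most of the diff below applies one mechanical change: the packed constants declared via _PS256_CONST / _PI32_CONST256 were previously loaded through C-style casts such as *(v8sf *)_ps256_1, which cpplint's readability/casting check flags, and each occurrence becomes *reinterpret_cast<const v8sf *>(_ps256_1) (or const v8si * / const v4si * for the integer tables). A minimal sketch of the pattern, outside the actual file (the _ps256_example table and load_example() helper are hypothetical illustrations, not part of the commit):

#include <immintrin.h>

typedef __m256 v8sf;  // same typedef avx_mathfun.h uses for a vector of 8 floats (AVX)

// Hypothetical 32-byte-aligned constant table, standing in for the _ps256_* arrays.
alignas(32) static const float _ps256_example[8] = {1.f, 1.f, 1.f, 1.f,
                                                    1.f, 1.f, 1.f, 1.f};

static inline v8sf load_example() {
  // Before this commit (C-style cast, flagged by cpplint readability/casting):
  //   v8sf one = *(v8sf *)_ps256_example;
  // After this commit (explicit cast to pointer-to-const, then dereference):
  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_example);
  return one;
}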
...
...
@@ -41,7 +41,7 @@
(this is the zlib license)
*/
#pragma once
#include "paddle/fluid/platform/cpu_info.h"
/* __m128 is ugly to write */
...
...
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
     return (ret); \
   }
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
...
...
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
     return (ret); \
   }
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
...
...
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
   // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   v8sf e = _mm256_cvtepi32_ps(imm0);
   e = _mm256_add_ps(e, one);
...
...
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
   } else { x = x - 1.0; }
 */
   // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(
+      x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
...
...
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
   v8sf z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
   y = _mm256_mul_ps(y, x);
   y = _mm256_mul_ps(y, z);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
   y = _mm256_add_ps(y, tmp);
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
   x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
...
...
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   /* how to perform a floorf with SSE: just below */
   // imm0 = _mm256_cvttps_epi32(fx);
...
...
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
   z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
...
...
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
...
...
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) { // any x
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
   /*
     Here we start a series of integer operations, which are in the
...
...
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) { // any x
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask
      there is one polynom for 0 <= x <= Pi/4
...
...
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) { // any x
      Both branches will be computed.
   */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
...
...
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
...
...
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
...
...
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) { // any x
 #endif
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
   /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
...
...
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
...
...
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
...
...
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   sign_bit_sin = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in imm2 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   imm4 = imm2;
   /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
   /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
   // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
...
...
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   imm4_1 = imm2_1;
   imm4_2 = imm2_2;
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
...
...
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
...
...
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   x = _mm256_add_ps(x, xmm3);
 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 = _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 = _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm4_1 = _mm_slli_epi32(imm4_1, 29);
   imm4_2 = _mm_slli_epi32(imm4_2, 29);
...
...
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
   v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
...
...