BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit ff7d2464 (unverified)
Authored on Jun 21, 2022 by wangzhen38; committed via GitHub on Jun 21, 2022
Parent: 7307e955

cpplint fix 3 (#43679)

* cpplint fix 3
* cpplint fix 3
* cpplint fix 3
* cpplint fix 3
Showing 1 changed file with 160 additions and 125 deletions.

paddle/phi/kernels/funcs/detail/avx_mathfun.h  (+160, -125)
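Note: every hunk below applies the same two mechanical fixes. Old-style C casts used to reload the aligned constant tables (e.g. *(v8sf *)_ps256_1) become *reinterpret_cast<const v8sf *>(...), and the commented-out #warning lines gain a space after the slashes. A minimal sketch of the cast change, assuming v8sf is a typedef for __m256 and using a hypothetical kOnes256 table in place of _ps256_1:

// Minimal sketch of the cast rewrite applied throughout avx_mathfun.h.
// Assumptions: v8sf is a typedef for __m256, and kOnes256 is a hypothetical
// 32-byte-aligned table of eight floats standing in for _ps256_1.
#include <immintrin.h>

typedef __m256 v8sf;  // mirrors the "__m128 is ugly to write" convention

alignas(32) static const float kOnes256[8] = {1.f, 1.f, 1.f, 1.f,
                                              1.f, 1.f, 1.f, 1.f};

v8sf load_one_old_style() {
  // Before: old-style C cast, flagged by cpplint (readability/casting).
  return *(v8sf *)kOnes256;
}

v8sf load_one_new_style() {
  // After: explicit reinterpret_cast, and const-qualified as well.
  return *reinterpret_cast<const v8sf *>(kOnes256);
}

The reinterpret_cast form is what cpplint's readability/casting check asks for, and the added const documents that the constant tables are only ever read.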
@@ -41,7 +41,7 @@
 (this is the zlib license)
 */
+#pragma once
 #include "paddle/fluid/platform/cpu_info.h"
 /* __m128 is ugly to write */
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
     return (ret);                 \
   }
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
     return (ret);                 \
   }
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
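These two hunks only touch the commented-out #warning lines, but the surrounding AVX2_BITOP_USING_SSE2 / AVX2_INTOP_USING_SSE2 macros are what let the rest of the file run on AVX-only CPUs: they synthesize avx2_mm256_* helpers from SSE2. A hedged sketch of the shape such a helper takes (an illustration of the split-into-128-bit-halves idea, not the macro's exact expansion):

// Hedged sketch: emulating a 256-bit AVX2 integer op with two SSE2 ops when
// only AVX is available. The real AVX2_INTOP_USING_SSE2 macro generates
// functions of roughly this shape; the body below is illustrative only.
#include <immintrin.h>

typedef __m256i v8si;  // assumed typedef, by analogy with v8sf

static inline v8si avx2_mm256_and_si256_emulated(v8si a, v8si b) {
  __m128i a_lo = _mm256_castsi256_si128(a);       // lower 128 bits
  __m128i a_hi = _mm256_extractf128_si256(a, 1);  // upper 128 bits
  __m128i b_lo = _mm256_castsi256_si128(b);
  __m128i b_hi = _mm256_extractf128_si256(b, 1);
  __m128i r_lo = _mm_and_si128(a_lo, b_lo);       // SSE2 op on each half
  __m128i r_hi = _mm_and_si128(a_hi, b_hi);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}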
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
   // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   v8sf e = _mm256_cvtepi32_ps(imm0);
   e = _mm256_add_ps(e, one);
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
   } else { x = x - 1.0; }
 */
   // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
   v8sf z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
   y = _mm256_mul_ps(y, x);
   y = _mm256_mul_ps(y, z);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
   y = _mm256_add_ps(y, tmp);
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
   x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
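The block of changed lines above is a Horner-style evaluation of the cephes log polynomial: start from p0, alternately multiply by x and add the next coefficient, then scale by x and by z = x*x. A scalar sketch of the same accumulation (coefficients passed in as placeholders; the real values live in the _ps256_cephes_log_p* tables):

// Scalar sketch of the accumulation pattern the vector code above performs.
// The coefficient values are placeholders; the real _ps256_cephes_log_p*
// constants are defined elsewhere in avx_mathfun.h.
#include <cstddef>

float log_poly_sketch(float x, const float p[9]) {
  float y = p[0];
  for (std::size_t i = 1; i < 9; ++i) {
    y = y * x + p[i];  // one _mm256_mul_ps + _mm256_add_ps pair per step
  }
  float z = x * x;
  y = y * x;  // trailing multiply by x, as in the vector code
  y = y * z;  // and by z = x * x
  return y;
}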
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   /* how to perform a floorf with SSE: just below */
   // imm0 = _mm256_cvttps_epi32(fx);
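The hunk header shows the constants come from _PS256_CONST(...) declarations, so each rewritten cast simply reloads an aligned table of eight identical floats as a v8sf. For comparison, the same load can be written without any pointer cast via _mm256_load_ps; the commit keeps the reinterpret_cast form, presumably to stay a minimal, behavior-preserving lint fix. kLog2e256 below is a hypothetical stand-in for _ps256_cephes_LOG2EF:

// The same constant-table load, written without any pointer cast.
// kLog2e256 stands in for _ps256_cephes_LOG2EF (eight copies of log2(e));
// the actual macro expansion in avx_mathfun.h may differ.
#include <immintrin.h>

alignas(32) static const float kLog2e256[8] = {
    1.44269504f, 1.44269504f, 1.44269504f, 1.44269504f,
    1.44269504f, 1.44269504f, 1.44269504f, 1.44269504f};

__m256 scale_by_log2e(__m256 x) {
  return _mm256_mul_ps(x, _mm256_load_ps(kLog2e256));  // fx = x * log2(e)
}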
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
   z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
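The "build 2^n" step above is the usual IEEE-754 exponent trick: add the bias 0x7f to the integer n, shift it into the exponent field (left by 23), and reinterpret the bits as a float. A scalar sketch of the same idea:

// Scalar version of the "build 2^n" trick used above: place (n + 127) in
// the exponent field of an IEEE-754 single and reinterpret the bits.
#include <cstdint>
#include <cstring>

float pow2_int(int n) {  // assumes -126 <= n <= 127 (normal range)
  std::uint32_t bits = static_cast<std::uint32_t>(n + 0x7f) << 23;
  float result;
  std::memcpy(&result, &bits, sizeof(result));  // bit-level reinterpretation
  return result;
}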
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) { // any x
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
   /*
     Here we start a series of integer operations, which are in the
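The prologue of sin256_ps shown above clears the sign bit to get |x|, keeps the sign separately, and multiplies by 4/pi to pick the octant. A scalar sketch with explicit bit masks (the mask values are what _ps256_sign_mask / _ps256_inv_sign_mask presumably hold in every lane):

// Scalar sketch of the sin256_ps prologue: split off the sign bit, take the
// absolute value, and scale by 4/pi (the cephes_FOPI constant).
#include <cstdint>
#include <cstring>

void sin_prologue_sketch(float x, float *abs_x, std::uint32_t *sign_bit,
                         float *y) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  *sign_bit = bits & 0x80000000u;   // role of _ps256_sign_mask
  bits &= 0x7fffffffu;              // role of _ps256_inv_sign_mask
  std::memcpy(abs_x, &bits, sizeof(*abs_x));
  *y = *abs_x * 1.27323954473516f;  // 4/pi, as in cephes_FOPI
}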
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) { // any x
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask
      there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) { // any x
      Both branches will be computed.
   */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
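The quoted comment gives the range-reduction formula x = ((x - y * DP1) - y * DP2) - y * DP3: pi/4 is split into three pieces of decreasing magnitude so the subtraction loses far less precision than a single x - y * (pi/4). A scalar sketch using the classic cephes splits (illustrative values; the actual _ps256_minus_cephes_DP* tables store the negated constants):

// Scalar form of the "Extended precision modular arithmetic" pass quoted in
// the comment above. DP1 + DP2 + DP3 approximate pi/4; the values here are
// the traditional cephes splits and are given only for illustration.
float range_reduce_sketch(float x, float y) {
  const float DP1 = 0.78515625f;
  const float DP2 = 2.4187564849853515625e-4f;
  const float DP3 = 3.77489497744594108e-8f;
  x = x - y * DP1;  // remove the coarse multiple of pi/4
  x = x - y * DP2;  // then the next correction term
  x = x - y * DP3;  // then the finest one
  return x;
}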
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) { // any x
 #endif
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
   /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm0_1 = _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   sign_bit_sin = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in imm2 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   imm4 = imm2;
   /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
   /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
   // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   imm4_1 = imm2_1;
   imm4_2 = imm2_2;
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   x = _mm256_add_ps(x, xmm3);
 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_1 = _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 = _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm4_1 = _mm_slli_epi32(imm4_1, 29);
   imm4_2 = _mm_slli_epi32(imm4_2, 29);
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
   v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
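For context on how these routines are consumed: each one maps eight packed floats to eight results. A hedged usage sketch, assuming v8sf is __m256 (per the "__m128 is ugly to write" comment), that the header above is on the include path, and that the file is built with AVX enabled:

// Hedged usage sketch: apply the vectorized expf from the header to a
// buffer, eight floats at a time. n is assumed to be a multiple of 8.
#include <immintrin.h>

#include "paddle/phi/kernels/funcs/detail/avx_mathfun.h"

void exp_inplace(float *data, int n) {
  for (int i = 0; i < n; i += 8) {
    v8sf v = _mm256_loadu_ps(data + i);  // unaligned load of 8 floats
    v = exp256_ps(v);                    // exp256_ps as declared in the diff
    _mm256_storeu_ps(data + i, v);
  }
}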