Unverified commit ff7d2464, authored by wangzhen38, committed by GitHub

cpplint fix 3 (#43679)

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3
Parent 7307e955
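The pattern this patch applies throughout avx_mathfun.h is the cpplint-driven replacement of C-style pointer casts with `reinterpret_cast<const T *>`. A minimal sketch of the before/after shape, using an illustrative constant table (`kOnes` and `AddOne` are assumptions for illustration, not names from the header):

```cpp
#include <immintrin.h>

typedef __m256 v8sf;  // same shorthand the header uses for a vector of 8 floats

alignas(32) static const float kOnes[8] = {1, 1, 1, 1, 1, 1, 1, 1};

v8sf AddOne(v8sf x) {
  // Before: v8sf one = *(v8sf *)kOnes;   // C-style cast, flagged by cpplint
  // After: the named cast states the reinterpretation (and constness) explicitly.
  v8sf one = *reinterpret_cast<const v8sf *>(kOnes);
  return _mm256_add_ps(x, one);
}
```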
@@ -41,7 +41,7 @@
   (this is the zlib license)
 */
 #pragma once
 #include "paddle/fluid/platform/cpu_info.h"
 /* __m128 is ugly to write */
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
    return (ret); \
  }
-// #warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
    return (ret); \
  }
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
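For context on the `// #warning` lines above: when `__AVX2__` is not defined, the header's `AVX2_BITOP_USING_SSE2` / `AVX2_INTOP_USING_SSE2` macros emulate 256-bit integer ops by running the SSE2 instruction on the two 128-bit halves. A rough sketch of that idea (function name and structure are mine, not the macro's):

```cpp
#include <immintrin.h>

// Emulate a 256-bit 32-bit-lane right shift with SSE2: split, shift, rejoin.
// Shift count fixed at 23, the amount log256_ps uses to reach the exponent field.
static inline __m256i srli23_using_sse2(__m256i x) {
  __m128i lo = _mm256_castsi256_si128(x);       // lower 128 bits
  __m128i hi = _mm256_extractf128_si256(x, 1);  // upper 128 bits
  lo = _mm_srli_epi32(lo, 23);
  hi = _mm_srli_epi32(hi, 23);
  __m256i out = _mm256_castsi128_si256(lo);
  return _mm256_insertf128_si256(out, hi, 1);
}
```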
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
*/
v8sf log256_ps(v8sf x) {
  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
  // can be done with AVX2
  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
  /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
  v8sf e = _mm256_cvtepi32_ps(imm0);
  e = _mm256_add_ps(e, one);
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
    } else { x = x - 1.0; }
  */
  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(
+      x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
  v8sf z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
  y = _mm256_mul_ps(y, x);
  y = _mm256_mul_ps(y, z);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
  y = _mm256_add_ps(y, tmp);
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  y = _mm256_sub_ps(y, tmp);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
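The mul/add ladder patched in this hunk is Horner evaluation of the Cephes log() polynomial, p(x) = (((p0*x + p1)*x + p2)*x + ...) + p8. A scalar sketch of the same scheme, with generic coefficients rather than the header's `_ps256_cephes_log_p*` tables:

```cpp
// Horner's scheme: one multiply and one add per coefficient, highest degree first.
static inline float HornerEval(const float *coef, int n, float x) {
  float y = coef[0];
  for (int i = 1; i < n; ++i) {
    y = y * x + coef[i];
  }
  return y;
}
```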
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
  /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  /* how to perform a floorf with SSE: just below */
  // imm0 = _mm256_cvttps_epi32(fx);
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);
  z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
  imm0 = avx2_mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
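The `/* build 2^n */` block above constructs 2^n directly from IEEE-754 bits: add the exponent bias 0x7f to n, shift it into the exponent field at bit 23, and reinterpret the result as a float. A scalar sketch of the same trick (the helper name is illustrative, not from the header):

```cpp
#include <cstdint>
#include <cstring>

static inline float Pow2n(int32_t n) {  // valid for -126 <= n <= 127
  uint32_t bits = static_cast<uint32_t>(n + 0x7f) << 23;  // biased exponent field
  float out;
  std::memcpy(&out, &bits, sizeof(out));  // bit-reinterpret without aliasing UB
  return out;
}
// Pow2n(3) == 8.0f, Pow2n(-1) == 0.5f
```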
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) { // any x
  sign_bit = x;
  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
  /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit,
+                           *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
  /*
    Here we start a series of integer operations, which are in the
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) { // any x
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
  y = _mm256_cvtepi32_ps(imm2);
  /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) { // any x
     Both branches will be computed.
  */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);
  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) { // any x
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
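The "magic pass" in this hunk is Cody-Waite style range reduction: pi/4 is stored as three floats DP1 + DP2 + DP3 (negated in the header) so that x - y*pi/4 is accumulated in steps whose products stay nearly exactly representable. A scalar sketch using the classic Cephes constants, shown as an illustration rather than a quote from this file:

```cpp
// Subtract y * pi/4 from x in three pieces (constants already negated).
static inline float ReduceQuarterPi(float x, float y) {
  const float kDP1 = -0.78515625f;
  const float kDP2 = -2.4187564849853515625e-4f;
  const float kDP3 = -3.77489497744594108e-8f;
  x += y * kDP1;
  x += y * kDP2;
  x += y * kDP3;
  return x;
}
```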
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) { // any x
  x = _mm256_add_ps(x, xmm3);
  /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
  v8sf z = _mm256_mul_ps(x, x);
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) { // any x
#endif
  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
#ifdef __AVX2__
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
  y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 =
+      avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
  /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 =
+      _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 =
+      _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);
  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) { // any x
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) { // any x
  x = _mm256_add_ps(x, xmm3);
  /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
  v8sf z = _mm256_mul_ps(x, x);
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  sign_bit_sin = x;
  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(
+      sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;
  /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
  // v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  imm4_1 = imm2_1;
  imm4_2 = imm2_2;
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);
  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  x = _mm256_add_ps(x, xmm3);
#ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 =
+      avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
  imm4 = avx2_mm256_slli_epi32(imm4, 29);
#else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 =
+      _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 =
+      _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
......