BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit ff7d2464 (unverified)
Authored on Jun 21, 2022 by wangzhen38; committed via GitHub on Jun 21, 2022
Parent: 7307e955

cpplint fix 3 (#43679)

* cpplint fix 3
* cpplint fix 3
* cpplint fix 3
* cpplint fix 3
Showing 1 changed file with 160 additions and 125 deletions.

paddle/phi/kernels/funcs/detail/avx_mathfun.h  (+160, -125)
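Note: every hunk below applies the same two mechanical fixes. Old-style C casts used to reload the aligned constant tables (e.g. *(v8sf *)_ps256_1) become *reinterpret_cast<const v8sf *>(...), and the commented-out #warning lines gain a space after the slashes. A minimal sketch of the cast change, assuming v8sf is a typedef for __m256 and using a hypothetical kOnes256 table in place of _ps256_1:

// Minimal sketch of the cast rewrite applied throughout avx_mathfun.h.
// Assumptions: v8sf is a typedef for __m256, and kOnes256 is a hypothetical
// 32-byte-aligned table of eight floats standing in for _ps256_1.
#include <immintrin.h>

typedef __m256 v8sf;  // mirrors the "__m128 is ugly to write" convention

alignas(32) static const float kOnes256[8] = {1.f, 1.f, 1.f, 1.f,
                                              1.f, 1.f, 1.f, 1.f};

v8sf load_one_old_style() {
  // Before: old-style C cast, flagged by cpplint (readability/casting).
  return *(v8sf *)kOnes256;
}

v8sf load_one_new_style() {
  // After: explicit reinterpret_cast, and const-qualified as well.
  return *reinterpret_cast<const v8sf *>(kOnes256);
}

The reinterpret_cast form is what cpplint's readability/casting check asks for, and the added const documents that the constant tables are only ever read.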
@@ -41,7 +41,7 @@
 (this is the zlib license)
 */
+#pragma once
 #include "paddle/fluid/platform/cpu_info.h"
 /* __m128 is ugly to write */
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
     return (ret);                 \
   }
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
     return (ret);                 \
   }
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
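These two hunks only touch the commented-out #warning lines, but the surrounding AVX2_BITOP_USING_SSE2 / AVX2_INTOP_USING_SSE2 macros are what let the rest of the file run on AVX-only CPUs: they synthesize avx2_mm256_* helpers from SSE2. A hedged sketch of the shape such a helper takes (an illustration of the split-into-128-bit-halves idea, not the macro's exact expansion):

// Hedged sketch: emulating a 256-bit AVX2 integer op with two SSE2 ops when
// only AVX is available. The real AVX2_INTOP_USING_SSE2 macro generates
// functions of roughly this shape; the body below is illustrative only.
#include <immintrin.h>

typedef __m256i v8si;  // assumed typedef, by analogy with v8sf

static inline v8si avx2_mm256_and_si256_emulated(v8si a, v8si b) {
  __m128i a_lo = _mm256_castsi256_si128(a);       // lower 128 bits
  __m128i a_hi = _mm256_extractf128_si256(a, 1);  // upper 128 bits
  __m128i b_lo = _mm256_castsi256_si128(b);
  __m128i b_hi = _mm256_extractf128_si256(b, 1);
  __m128i r_lo = _mm_and_si128(a_lo, b_lo);       // SSE2 op on each half
  __m128i r_hi = _mm_and_si128(a_hi, b_hi);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}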
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
   // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   v8sf e = _mm256_cvtepi32_ps(imm0);
   e = _mm256_add_ps(e, one);
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
   } else { x = x - 1.0; }
 */
   // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
   v8sf z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
   y = _mm256_mul_ps(y, x);
   y = _mm256_mul_ps(y, z);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
   y = _mm256_add_ps(y, tmp);
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
   x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
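The block of changed lines above is a Horner-style evaluation of the cephes log polynomial: start from p0, alternately multiply by x and add the next coefficient, then scale by x and by z = x*x. A scalar sketch of the same accumulation (coefficients passed in as placeholders; the real values live in the _ps256_cephes_log_p* tables):

// Scalar sketch of the accumulation pattern the vector code above performs.
// The coefficient values are placeholders; the real _ps256_cephes_log_p*
// constants are defined elsewhere in avx_mathfun.h.
#include <cstddef>

float log_poly_sketch(float x, const float p[9]) {
  float y = p[0];
  for (std::size_t i = 1; i < 9; ++i) {
    y = y * x + p[i];  // one _mm256_mul_ps + _mm256_add_ps pair per step
  }
  float z = x * x;
  y = y * x;  // trailing multiply by x, as in the vector code
  y = y * z;  // and by z = x * x
  return y;
}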
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   /* how to perform a floorf with SSE: just below */
   // imm0 = _mm256_cvttps_epi32(fx);
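The hunk header shows the constants come from _PS256_CONST(...) declarations, so each rewritten cast simply reloads an aligned table of eight identical floats as a v8sf. For comparison, the same load can be written without any pointer cast via _mm256_load_ps; the commit keeps the reinterpret_cast form, presumably to stay a minimal, behavior-preserving lint fix. kLog2e256 below is a hypothetical stand-in for _ps256_cephes_LOG2EF:

// The same constant-table load, written without any pointer cast.
// kLog2e256 stands in for _ps256_cephes_LOG2EF (eight copies of log2(e));
// the actual macro expansion in avx_mathfun.h may differ.
#include <immintrin.h>

alignas(32) static const float kLog2e256[8] = {
    1.44269504f, 1.44269504f, 1.44269504f, 1.44269504f,
    1.44269504f, 1.44269504f, 1.44269504f, 1.44269504f};

__m256 scale_by_log2e(__m256 x) {
  return _mm256_mul_ps(x, _mm256_load_ps(kLog2e256));  // fx = x * log2(e)
}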
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z = _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
   z = _mm256_mul_ps(x, x);
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0, *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
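The "build 2^n" step above is the usual IEEE-754 exponent trick: add the bias 0x7f to the integer n, shift it into the exponent field (left by 23), and reinterpret the bits as a float. A scalar sketch of the same idea:

// Scalar version of the "build 2^n" trick used above: place (n + 127) in
// the exponent field of an IEEE-754 single and reinterpret the bits.
#include <cstdint>
#include <cstring>

float pow2_int(int n) {  // assumes -126 <= n <= 127 (normal range)
  std::uint32_t bits = static_cast<std::uint32_t>(n + 0x7f) << 23;
  float result;
  std::memcpy(&result, &bits, sizeof(result));  // bit-level reinterpretation
  return result;
}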
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) { // any x
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
   /*
     Here we start a series of integer operations, which are in the
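The prologue of sin256_ps shown above clears the sign bit to get |x|, keeps the sign separately, and multiplies by 4/pi to pick the octant. A scalar sketch with explicit bit masks (the mask values are what _ps256_sign_mask / _ps256_inv_sign_mask presumably hold in every lane):

// Scalar sketch of the sin256_ps prologue: split off the sign bit, take the
// absolute value, and scale by 4/pi (the cephes_FOPI constant).
#include <cstdint>
#include <cstring>

void sin_prologue_sketch(float x, float *abs_x, std::uint32_t *sign_bit,
                         float *y) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  *sign_bit = bits & 0x80000000u;   // role of _ps256_sign_mask
  bits &= 0x7fffffffu;              // role of _ps256_inv_sign_mask
  std::memcpy(abs_x, &bits, sizeof(*abs_x));
  *y = *abs_x * 1.27323954473516f;  // 4/pi, as in cephes_FOPI
}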
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) { // any x
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask
      there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) { // any x
      Both branches will be computed.
   */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
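The quoted comment gives the range-reduction formula x = ((x - y * DP1) - y * DP2) - y * DP3: pi/4 is split into three pieces of decreasing magnitude so the subtraction loses far less precision than a single x - y * (pi/4). A scalar sketch using the classic cephes splits (illustrative values; the actual _ps256_minus_cephes_DP* tables store the negated constants):

// Scalar form of the "Extended precision modular arithmetic" pass quoted in
// the comment above. DP1 + DP2 + DP3 approximate pi/4; the values here are
// the traditional cephes splits and are given only for illustration.
float range_reduce_sketch(float x, float y) {
  const float DP1 = 0.78515625f;
  const float DP2 = 2.4187564849853515625e-4f;
  const float DP3 = 3.77489497744594108e-8f;
  x = x - y * DP1;  // remove the coarse multiple of pi/4
  x = x - y * DP2;  // then the next correction term
  x = x - y * DP3;  // then the finest one
  return x;
}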
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) { // any x
 #endif
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
   /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm0_1 = _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) { // any x
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) { // any x
   x = _mm256_add_ps(x, xmm3);
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   sign_bit_sin = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 #ifdef __AVX2__
   /* store the integer part of y in imm2 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
   imm4 = imm2;
   /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
   /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_0));
   // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   imm4_1 = imm2_1;
   imm4_2 = imm2_2;
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   x = _mm256_add_ps(x, xmm3);
 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_1 = _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 = _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
   imm4_1 = _mm_slli_epi32(imm4_1, 29);
   imm4_2 = _mm_slli_epi32(imm4_2, 29);
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   /* Evaluate the first polynom (0 <= x <= Pi/4) */
   v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
   /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
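For context on how these routines are consumed: each one maps eight packed floats to eight results. A hedged usage sketch, assuming v8sf is __m256 (per the "__m128 is ugly to write" comment), that the header above is on the include path, and that the file is built with AVX enabled:

// Hedged usage sketch: apply the vectorized expf from the header to a
// buffer, eight floats at a time. n is assumed to be a multiple of 8.
#include <immintrin.h>

#include "paddle/phi/kernels/funcs/detail/avx_mathfun.h"

void exp_inplace(float *data, int n) {
  for (int i = 0; i < n; i += 8) {
    v8sf v = _mm256_loadu_ps(data + i);  // unaligned load of 8 floats
    v = exp256_ps(v);                    // exp256_ps as declared in the diff
    _mm256_storeu_ps(data + i, v);
  }
}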