add avx support for windows

test=develop

add avx support for windows
test=develop
01dd9061 · peizhilin · 363bf8a4 · 01dd9061 · 01dd9061 · 01dd9061
9 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,8 +131,6 @@ if (APPLE OR WIN32)
 endif()
 if (WIN32)
-    set(WITH_AVX OFF CACHE STRING
-            "Disable AVX when compiling for Windows" FORCE)
    set(WITH_DSO OFF CACHE STRING
            "Disable DSO when compiling for Windows" FORCE)
    set(WITH_MKL OFF CACHE STRING

--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"

--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,14 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 namespace paddle {
 namespace operators {
 namespace math {

--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -14,10 +14,8 @@ limitations under the License. */
 #ifdef __AVX__
-#include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-// TODO(qingqing) refine this dependence
+#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
-#include "paddle/legacy/cuda/src/avx_mathfun.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/math/detail/avx_mathfun.h
+++ b/paddle/fluid/operators/math/detail/avx_mathfun.h
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -113,26 +113,27 @@ void VXXJitCode::generate() {
  ret();
 }
-const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
-                                          REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(1.f),
-                                          REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(2.f),
-                                          REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(0.5f),
-                                          REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(EXP_HIG),
-                                          REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(EXP_LOW),
-                                          REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
-                                          REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
-                                          REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
-                                          REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
-                                          REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
-const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
-int g_tmp_mem[16] ALIGN32 = {0};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
 bool VActJitCode::init(int d, operand_type type) {
  // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256

--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
-#define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341

--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 namespace paddle {
 namespace operators {
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
          /* AVX instructions.*/                                               \
          __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
          __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
-          __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);        \
+          __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0);     \
-          __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);        \
+          __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1);     \
          lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
          hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
          lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \

--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -13,9 +13,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 namespace paddle {
 namespace operators {
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
      if (rest_ != 0) {                                                        \
        j = offset + this->num_ - block;                                       \
        tmp = _mm256_loadu_ps((const float*)x + j);                            \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
        sum = _mm256_add_ps(sum, tmp);                                         \
      }                                                                        \
      hi = _mm256_extractf128_ps(sum, 1);                                      \
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
        j = offset + this->num_ - block;                                       \
        tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
        tmp = _mm256_mul_ps(tmp, tmp);                                         \
-        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+        tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
        sum = _mm256_add_ps(sum, tmp);                                         \
      }                                                                        \
      hi = _mm256_extractf128_ps(sum, 1);                                      \