Commit 01dd9061 authored by peizhilin

add avx support for windows

test=develop
Parent 363bf8a4
@@ -131,8 +131,6 @@ if (APPLE OR WIN32)
 endif()
 
 if (WIN32)
-    set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when compiling for Windows" FORCE)
     set(WITH_DSO OFF CACHE STRING
         "Disable DSO when compiling for Windows" FORCE)
     set(WITH_MKL OFF CACHE STRING
......
@@ -18,9 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
......
...@@ -15,14 +15,10 @@ limitations under the License. */ ...@@ -15,14 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h> #include <math.h>
#include <string> #include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
......
@@ -14,10 +14,8 @@ limitations under the License. */
 #ifdef __AVX__
-#include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-// TODO(qingqing) refine this dependence
-#include "paddle/legacy/cuda/src/avx_mathfun.h"
+#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
......
This diff is collapsed.
@@ -113,26 +113,27 @@ void VXXJitCode::generate() {
   ret();
 }
 
-const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
-                                          REPEAT_8TIMES(2.f),
-                                          REPEAT_8TIMES(0.5f),
-                                          REPEAT_8TIMES(EXP_HIG),
-                                          REPEAT_8TIMES(EXP_LOW),
-                                          REPEAT_8TIMES(CEPHES_LOG2EF),
-                                          REPEAT_8TIMES(CEPHES_EXP_C1),
-                                          REPEAT_8TIMES(CEPHES_EXP_C2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P0),
-                                          REPEAT_8TIMES(CEPHES_EXP_P1),
-                                          REPEAT_8TIMES(CEPHES_EXP_P2),
-                                          REPEAT_8TIMES(CEPHES_EXP_P3),
-                                          REPEAT_8TIMES(CEPHES_EXP_P4),
-                                          REPEAT_8TIMES(CEPHES_EXP_P5),
-                                          REPEAT_8TIMES(EXP_MAX_INPUT),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
-                                          REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
+const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = {
+    REPEAT_8TIMES(1.f),
+    REPEAT_8TIMES(2.f),
+    REPEAT_8TIMES(0.5f),
+    REPEAT_8TIMES(EXP_HIG),
+    REPEAT_8TIMES(EXP_LOW),
+    REPEAT_8TIMES(CEPHES_LOG2EF),
+    REPEAT_8TIMES(CEPHES_EXP_C1),
+    REPEAT_8TIMES(CEPHES_EXP_C2),
+    REPEAT_8TIMES(CEPHES_EXP_P0),
+    REPEAT_8TIMES(CEPHES_EXP_P1),
+    REPEAT_8TIMES(CEPHES_EXP_P2),
+    REPEAT_8TIMES(CEPHES_EXP_P3),
+    REPEAT_8TIMES(CEPHES_EXP_P4),
+    REPEAT_8TIMES(CEPHES_EXP_P5),
+    REPEAT_8TIMES(EXP_MAX_INPUT),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX),
+    REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)};
 
-const int exp_int_0x7f[] ALIGN32 = {REPEAT_8TIMES(0x7f)};
-int g_tmp_mem[16] ALIGN32 = {0};
+const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)};
+int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0};
 
 bool VActJitCode::init(int d, operand_type type) {
   // TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
......
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
 extern const int exp_int_0x7f[];
 extern int g_tmp_mem[];
 
-#define ALIGN32 __attribute__((aligned(32)))
 #define EXP_HIG 88.3762626647949f
 #define EXP_LOW -88.3762626647949f
 #define CEPHES_LOG2EF 1.44269504088896341
......
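Note on the ALIGN32 change above: the single trailing ALIGN32 attribute is replaced by an ALIGN32_BEG / ALIGN32_END pair because MSVC's alignment specifier has to appear before the declarator, while the GCC/Clang attribute that ALIGN32 expanded to is written after it. The definitions of the new macros are not part of the hunks shown here; a minimal sketch of how such a pair could be defined, assuming a _MSC_VER / else split, is:

// Hypothetical definitions (not shown in this diff): alignment macro pair that
// works with both MSVC (prefix position) and GCC/Clang (suffix position).
#if defined(_MSC_VER)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif

// Usage mirroring the hunks above: 32-byte aligned storage suitable for AVX loads.
const float ALIGN32_BEG eight_ones[8] ALIGN32_END = {1.f, 1.f, 1.f, 1.f,
                                                     1.f, 1.f, 1.f, 1.f};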
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
     /* AVX instructions.*/                                              \
     __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);              \
     __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);              \
-    __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);       \
-    __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);       \
+    __m128i lo_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 0);    \
+    __m128i hi_mask = _mm256_extractf128_si256(*(__m256i*)&mask, 1);    \
     lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                     \
     hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                     \
     lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                \
......
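Note on the two changed lines above: the direct C-style cast from __m256 to __m256i is a GCC/Clang vector extension that MSVC rejects, so the mask is reinterpreted through a pointer instead. A small sketch of the same reinterpretation using the dedicated cast intrinsic, which is the compiler-portable spelling; the helper name and the assumption that mask comes from a float compare are illustrative, not from this diff:

#include <immintrin.h>

// Extract the low 128-bit integer lane of a float-compare mask.
// _mm256_castps_si256 reinterprets the bits without emitting an instruction,
// which is what *(__m256i*)&mask achieves in the hunk above.
static inline __m128i low_mask_lane(__m256 mask) {
  __m256i imask = _mm256_castps_si256(mask);
  return _mm256_extractf128_si256(imask, 0);
}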
@@ -13,9 +13,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
     if (rest_ != 0) {                                                        \
       j = offset + this->num_ - block;                                       \
       tmp = _mm256_loadu_ps((const float*)x + j);                            \
-      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
       sum = _mm256_add_ps(sum, tmp);                                         \
     }                                                                        \
     hi = _mm256_extractf128_ps(sum, 1);                                      \
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
       j = offset + this->num_ - block;                                       \
       tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec);   \
       tmp = _mm256_mul_ps(tmp, tmp);                                         \
-      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, (__m256)mask_vec);    \
+      tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp, *(__m256*)&mask_vec); \
       sum = _mm256_add_ps(sum, tmp);                                         \
     }                                                                        \
     hi = _mm256_extractf128_ps(sum, 1);                                      \
......
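Same pattern here, in the opposite direction: an integer mask vector is reinterpreted as a float vector so it can drive _mm256_blendv_ps. A sketch under the assumption (the surrounding kernel is not shown in this diff) that mask_vec is an __m256i:

#include <immintrin.h>

// Zero out the lanes not selected by mask_vec, then accumulate into sum.
// _mm256_castsi256_ps is the portable equivalent of *(__m256*)&mask_vec above.
static inline __m256 masked_accumulate(__m256 sum, __m256 tmp, __m256i mask_vec) {
  __m256 fmask = _mm256_castsi256_ps(mask_vec);
  return _mm256_add_ps(sum, _mm256_blendv_ps(_mm256_setzero_ps(), tmp, fmask));
}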