From e3bb98eb38f8938ee3a0f8b07d8f486aca6ccfe3 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 24 Aug 2018 00:01:09 +0800
Subject: [PATCH] optimize relu with avx and avx512

---
 paddle/fluid/operators/math/cpu_vec.h | 83 ++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 6d8acbe539..e74e84055a 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -31,6 +31,13 @@ namespace math {
 #define SIGMOID_THRESHOLD_MIN -40.0
 #define SIGMOID_THRESHOLD_MAX 13.0
 
+#define AVX_FLOAT_BLOCK 8
+#define AVX_DOUBLE_BLOCK 4
+#define AVX2_FLOAT_BLOCK 8
+#define AVX2_DOUBLE_BLOCK 4
+#define AVX512_FLOAT_BLOCK 16
+#define AVX512_DOUBLE_BLOCK 8
+
 template <typename T>
 inline void vec_exp(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
@@ -88,24 +95,82 @@ inline void vec_relu(const int n, const T* x, T* y) {
   }
 }
 
+template <>
+inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
+                                                float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    return;
+  }
+
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 zeros = _mm256_setzero_ps();
+  __m256 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm256_loadu_ps(x + i);    \
+  tmp = _mm256_max_ps(tmp, zeros); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+
+#else
+  vec_relu<float, platform::jit::isa_any>(n, x, y);
+#endif
+}
+
 template <>
 inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
                                                  float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
-  }
+  vec_relu<float, platform::jit::avx>(n, x, y);
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx512_common>(const int n, const float* x,
-                                                          float* y) {
-  // TODO(TJ): complete me
-  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] > 0 ? x[i] : 0;
+inline void vec_relu<float, platform::jit::avx512_common>(const int n,
+                                                          const float* x,
+                                                          float* y) {
+#ifdef __AVX512F__
+  // test me
+  constexpr int block = AVX512_FLOAT_BLOCK;
+  if (n < block) {
+    vec_relu<float, platform::jit::avx2>(n, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m512 zeros = _mm512_setzero_ps();
+  __m512 tmp;
+#define MOVE_ONE_STEP              \
+  tmp = _mm512_loadu_ps(x + i);    \
+  tmp = _mm512_max_ps(tmp, zeros); \
+  _mm512_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
   }
+  if (rest == 0) {
+    return;
+  }
+  i = n - block;
+  MOVE_ONE_STEP;
+#undef MOVE_ONE_STEP
+#else
+  vec_relu<float, platform::jit::avx2>(n, x, y);
+#endif
 }
 
+// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
+
 template <typename T, platform::jit::cpu_isa_t isa>
 class VecActivations {
  public:
-- 
GitLab