From 7a4924cd44a47f3562d62c01d0c40e84ca78540e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 24 Aug 2018 11:46:59 +0800 Subject: [PATCH] further optimize sigmoid with avx and avx512 --- paddle/fluid/operators/math/cpu_vec.h | 116 ++++++++++++++++++++ paddle/fluid/operators/math/cpu_vec_test.cc | 6 +- 2 files changed, 119 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index a2e2b5a7fe..52f072eb0e 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,6 +77,122 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } } +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { // can use larger threshold if necessary + vec_sigmoid(n, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(zeros, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest != 0) { + i = n - block; + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + + vec_exp(n, y, y); + + __m256 ones = _mm256_set1_ps(1.0f); +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(y + i); \ + tmp = _mm256_add_ps(ones, tmp); \ + tmp = _mm256_div_ps(ones, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { +#ifdef __AVX512F__ + constexpr int block = AVX512_FLOAT_BLOCK; + if (n < block) { + vec_sigmoid(n, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m512 max = _mm512_set1_ps(SIGMOID_THRESHOLD_MAX); + __m512 min = _mm512_set1_ps(SIGMOID_THRESHOLD_MIN); + __m512 zeros = _mm512_setzero_ps(); + __m512 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm512_loadu_ps(x + i); \ + tmp = _mm512_max_ps(tmp, min); \ + tmp = _mm512_min_ps(tmp, max); \ + tmp = _mm512_sub_ps(zeros, tmp); \ + _mm512_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest != 0) { + i = n - block; + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + + vec_exp(n, y, y); + + __m512 ones = _mm512_set1_ps(1.0f); +#define MOVE_ONE_STEP \ + tmp = _mm512_loadu_ps(y + i); \ + tmp = _mm512_add_ps(ones, tmp); \ + tmp = _mm512_div_ps(ones, tmp); \ + _mm512_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + template inline void vec_tanh(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 0888e44fa6..8b0e9c086a 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,7 +104,7 @@ void TestAndBench(const int n, std::function tgt, TEST(CpuVecTest, sigmoid) { namespace jit = paddle::platform::jit; using namespace paddle::operators::math; // NOLINT - for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) { + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); @@ -117,7 +117,7 @@ TEST(CpuVecTest, sigmoid) { TEST(CpuVecTest, tanh) { namespace jit = paddle::platform::jit; using namespace paddle::operators::math; // NOLINT - for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) { + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); @@ -130,7 +130,7 @@ TEST(CpuVecTest, tanh) { TEST(CpuVecTest, relu) { namespace jit = paddle::platform::jit; using namespace paddle::operators::math; // NOLINT - for (auto sz : {1, 2, 15, 16, 32, 128, 200, 512}) { + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); -- GitLab