From 3c8b651187e569dd22b7dbe2a0e7cff436c4ee88 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Sat, 29 Sep 2018 20:46:44 +0800
Subject: [PATCH] add vsigmoid avx implementations and unit test

---
 paddle/fluid/operators/math/jit_kernel_exp.cc | 106 ++++++++++++++++++
 .../fluid/operators/math/jit_kernel_test.cc   |  67 +++++++++++
 2 files changed, 173 insertions(+)
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index 0c736cd2d..99527d022 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -132,6 +132,111 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
   std::shared_ptr<const VExpKernel<T>> vexp_;
 };
 
+// In-place on one __m256: v = 1 / (1 + detail::Exp(0 - clamp(v, lo, hi))).
+#define INTRI_SIGMOID(v, lo, hi)               \
+  v = _mm256_min_ps(_mm256_max_ps(v, lo), hi); \
+  v = _mm256_sub_ps(_mm256_setzero_ps(), v);   \
+  v = detail::Exp(v);                          \
+  v = _mm256_add_ps(_mm256_set1_ps(1.0f), v);  \
+  v = _mm256_div_ps(_mm256_set1_ps(1.0f), v)
+
+#define INTRI8_FLOAT(isa) /* kEQ8: a single 8-float AVX block */ \
+  template <>                                                    \
+  void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(            \
+      const int n, const float* x, float* y) const {             \
+    const __m256 upper = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);  \
+    const __m256 lower = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);  \
+    __m256 vec = _mm256_loadu_ps(x); /* n is not consulted */    \
+    INTRI_SIGMOID(vec, lower, upper);                            \
+    _mm256_storeu_ps(y, vec);                                    \
+  }
+
+#define INTRI16_FLOAT(isa) /* kEQ16: two 8-float AVX blocks */   \
+  template <>                                                    \
+  void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(           \
+      const int n, const float* x, float* y) const {             \
+    const __m256 upper = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);  \
+    const __m256 lower = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);  \
+    __m256 front = _mm256_loadu_ps(x); /* elements [0, 8) */     \
+    __m256 back = _mm256_loadu_ps(x + 8); /* elements [8, 16) */ \
+    INTRI_SIGMOID(front, lower, upper);                          \
+    INTRI_SIGMOID(back, lower, upper);                           \
+    _mm256_storeu_ps(y, front);                                  \
+    _mm256_storeu_ps(y + 8, back);                               \
+  }
+
+#define INTRI_GT8LT16_FLOAT(isa) /* one AVX block + scalar tail */ \
+  template <>                                                      \
+  void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(          \
+      const int n, const float* x, float* y) const {               \
+    const __m256 upper = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);    \
+    const __m256 lower = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);    \
+    __m256 head = _mm256_loadu_ps(x);                              \
+    INTRI_SIGMOID(head, lower, upper);                             \
+    _mm256_storeu_ps(y, head);                                     \
+    const float lo = SIGMOID_THRESHOLD_MIN;                        \
+    const float hi = SIGMOID_THRESHOLD_MAX;                        \
+    float* const tail = y + AVX_FLOAT_BLOCK;                       \
+    for (int i = AVX_FLOAT_BLOCK; i < n; ++i) {                    \
+      const float xi = x[i]; /* y[i] = 0 - clamp(x[i], lo, hi) */  \
+      y[i] = 0.f - ((xi < lo) ? lo : ((xi > hi) ? hi : xi));       \
+    }                                                              \
+    vexp_->Compute(n - AVX_FLOAT_BLOCK, tail, tail);               \
+    for (int i = AVX_FLOAT_BLOCK; i < n; ++i) {                    \
+      y[i] = 1.f / (1.f + y[i]);                                   \
+    }                                                              \
+  }
+
+#define INTRI_GT16_FLOAT(isa) /* AVX blocks + scalar tail */       \
+  template <>                                                      \
+  void VSigmoidKernelImpl<float, isa, kGT16>::Compute(             \
+      const int n, const float* x, float* y) const {               \
+    const __m256 upper = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);    \
+    const __m256 lower = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);    \
+    const int rest = n % AVX_FLOAT_BLOCK;                          \
+    const int end = n - rest;                                      \
+    for (int i = 0; i < end; i += AVX_FLOAT_BLOCK) {               \
+      __m256 blk = _mm256_loadu_ps(x + i);                         \
+      INTRI_SIGMOID(blk, lower, upper);                            \
+      _mm256_storeu_ps(y + i, blk);                                \
+    }                                                              \
+    if (rest == 0) return; /* no tail: nothing left to do */       \
+    /* Tail: scalar clamp/negate, vexp_, scalar reciprocal. */     \
+    const float lo = SIGMOID_THRESHOLD_MIN;                        \
+    const float hi = SIGMOID_THRESHOLD_MAX;                        \
+    for (int i = end; i < n; ++i) {                                \
+      const float xi = x[i];                                       \
+      y[i] = 0.f - ((xi < lo) ? lo : ((xi > hi) ? hi : xi));       \
+    }                                                              \
+    vexp_->Compute(rest, y + end, y + end);                        \
+    for (int i = end; i < n; ++i) y[i] = 1.f / (1.f + y[i]);       \
+  }
+
+#ifdef __AVX__  // specializations are emitted only for ISAs the build targets
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+INTRI_GT8LT16_FLOAT(jit::avx);
+INTRI_GT16_FLOAT(jit::avx);
+#endif  // __AVX__
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+INTRI_GT8LT16_FLOAT(jit::avx2);
+INTRI_GT16_FLOAT(jit::avx2);
+#endif  // __AVX2__
+#ifdef __AVX512F__  // TODO(TJ): eq16 test and complete avx512
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+INTRI_GT8LT16_FLOAT(jit::avx512f);
+INTRI_GT16_FLOAT(jit::avx512f);
+#endif  // __AVX512F__
+
+#undef INTRI_SIGMOID  // keep every helper macro local to this section
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+
 #define JITKERNEL_NEW_ACT_IMPL(ker, dtype, isa, k) \
   p = std::dynamic_pointer_cast<ker<dtype>>(       \
       std::make_shared<ker##Impl<dtype, isa, k>>(d))
@@ -140,6 +245,7 @@ REGISTER_JITKERNEL_ARGS(vsigmoid, VSigmoidKernel, JITKERNEL_DECLARE,
                         JITKERNEL_KEY, JITKERNEL_NEW_ACT_IMPL);
 
 #undef JITKERNEL_NEW_ACT_IMPL
+
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 2495712cb..3db9a0b5e 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -104,6 +104,73 @@ TEST(JitKernel, vexp) {
   }
 }
 
+inline float _sigmoid(float x) {  // scalar reference for a single element
+  const float lo = SIGMOID_THRESHOLD_MIN;
+  const float hi = SIGMOID_THRESHOLD_MAX;
+  const float clipped = (x < lo) ? lo : ((x > hi) ? hi : x);
+  return 1.f / (1.f + std::exp(-clipped));
+}
+
+// Scalar baseline for the vsigmoid test: writes _sigmoid(x[k]) to y[k]
+// for every k in [0, n).
+void vsigmoid_ref(const int n, const float* x, float* y) {
+  for (int k = 0; k < n; ++k) y[k] = _sigmoid(x[k]);
+}
+
+// Baseline built on a VExpKernel: scalar clamp/negate into y, one in-place
+// vexp->Compute over y, then a scalar 1 / (1 + y[k]) pass.
+void vsigmoid_better(
+    const std::shared_ptr<
+        const paddle::operators::math::jitkernel::VExpKernel<float>>& vexp,
+    const int n, const float* x, float* y) {
+  const float lo = SIGMOID_THRESHOLD_MIN;
+  const float hi = SIGMOID_THRESHOLD_MAX;
+  for (int k = 0; k < n; ++k) {
+    const float xk = x[k];
+    y[k] = 0.f - ((xk < lo) ? lo : ((xk > hi) ? hi : xk));
+  }
+  vexp->Compute(n, y, y);
+  for (int k = 0; k < n; ++k) y[k] = 1.f / (1.f + y[k]);
+}
+
+TEST(JitKernel, vsigmoid) {  // JIT kernel and vexp-based path vs. scalar ref
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 128}) {
+    std::vector<float> x(d), zref(d), ztgt(d), zbetter(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VSigmoidKernel<float>>(d);
+    const auto& vexp =
+        jit::KernelPool::Instance().template Get<jit::VExpKernel<float>>(d);
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    // Separate buffer so vsigmoid_ref below cannot overwrite this result.
+    auto tmkls = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vsigmoid_better(vexp, d, x_data, zbetter.data());
+    }
+    auto tmkle = GetCurrentUS();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vsigmoid_ref(d, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(d, x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+      EXPECT_NEAR(zbetter[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
 void vscal_ref(const int n, const float a, const float* x, float* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = a * x[i];
-- 
GitLab