From d10a9df7b86d2bef1e144dcf6f6bc12891ad11ba Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Sat, 29 Sep 2018 22:42:31 +0800
Subject: [PATCH] add vaddbias and unit test

---
 paddle/fluid/operators/math/jit_kernel.h      |  6 +++
 .../fluid/operators/math/jit_kernel_blas.cc   | 52 +++++++++++++++++++
 paddle/fluid/operators/math/jit_kernel_exp.cc |  9 ++--
 .../fluid/operators/math/jit_kernel_test.cc   | 39 +++++++++++++-
 4 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 24cf2aaf0ba..32944ae82c8 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -82,6 +82,12 @@ class VScalKernel : public Kernel {
   virtual void Compute(const int n, const T a, T *x) const = 0;
 };
 
+template <typename T>
+class VAddBiasKernel : public Kernel {
+ public:
+  virtual void Compute(const int n, const T a, const T *x, T *y) const = 0;
+};
+
 template <typename T>
 class VExpKernel : public Kernel {
  public:
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index 30761c0430d..d0ee97a43c6 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -216,9 +216,61 @@ INTRI8_INPLACE_FLOAT(jit::avx512f);
 #undef MKL_FLOAT
 #undef MKL_DOUBLE
 
+/* VAddBias JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VAddBiasKernelImpl : public VAddBiasKernel<T> {
+ public:
+  void Compute(const int n, const T a, const T* x, T* y) const override {
+    for (int i = 0; i < n; ++i) {
+      y[i] = x[i] + a;
+    }
+  }
+};
+
+#define INTRI8_FLOAT(isa)                                            \
+  template <>                                                        \
+  void VAddBiasKernelImpl<float, isa, kEQ8>::Compute(                \
+      const int n, const float a, const float* x, float* y) const {  \
+    __m256 tmp = _mm256_loadu_ps(x);                                 \
+    tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a));                     \
+    _mm256_storeu_ps(y, tmp);                                        \
+  }
+
+#define INTRI16_FLOAT(isa)                                           \
+  template <>                                                        \
+  void VAddBiasKernelImpl<float, isa, kEQ16>::Compute(               \
+      const int n, const float a, const float* x, float* y) const {  \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                            \
+    tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));                   \
+    tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));                   \
+    _mm256_storeu_ps(y, tmp0);                                       \
+    _mm256_storeu_ps(y + 8, tmp1);                                   \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+
 REGISTER_JITKERNEL(vmul, VMulKernel);
 REGISTER_JITKERNEL(vadd, VAddKernel);
 REGISTER_JITKERNEL(vscal, VScalKernel);
+REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
 
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index 99527d02244..0717c2aeebf 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -221,16 +221,15 @@ INTRI_GT16_FLOAT(jit::avx);
 #ifdef __AVX2__
 INTRI8_FLOAT(jit::avx2);
 INTRI16_FLOAT(jit::avx2);
-INTRI_GT8LT16_FLOAT(jit::avx2);
-INTRI_GT16_FLOAT(jit::avx2);
+// INTRI_GT8LT16_FLOAT(jit::avx2);
+// INTRI_GT16_FLOAT(jit::avx2);
 #endif
 #ifdef __AVX512F__
 INTRI8_FLOAT(jit::avx512f);
 INTRI16_FLOAT(jit::avx512f);
-INTRI_GT8LT16_FLOAT(jit::avx512f);
-INTRI_GT16_FLOAT(jit::avx512f);
+// INTRI_GT8LT16_FLOAT(jit::avx512f);
+// INTRI_GT16_FLOAT(jit::avx512f);
 #endif
-// TODO(TJ): eq16 test and complete avx512
 
 #undef INTRI8_FLOAT
 #undef INTRI16_FLOAT
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 3db9a0b5eb6..7c417871412 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -48,6 +48,43 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
   }
 }
 
+void vaddbias_ref(const int n, const float a, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+TEST(JitKernel, vaddbias) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
+    const float a = 2.f;
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vaddbias_ref(d, a, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(d, a, x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
 void vexp_ref(const int n, const float* x, float* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = std::exp(x[i]);
@@ -135,7 +172,7 @@ void vsigmoid_better(
 
 TEST(JitKernel, vsigmoid) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 128}) {
+  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector<float> x(d);
    std::vector<float> zref(d), ztgt(d);
     RandomVec<float>(d, x.data(), -2.f, 2.f);
-- 
GitLab
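
Usage sketch (not part of the patch): callers obtain the new kernel from the
KernelPool keyed by the vector length, the same way the unit test above does.
The function and variable names below (AddBiasExample, n, bias, x, y) are
illustrative only, not part of the library.

  #include "paddle/fluid/operators/math/jit_kernel.h"

  namespace jit = paddle::operators::math::jitkernel;

  // Computes y[i] = x[i] + bias for i in [0, n).
  void AddBiasExample(const int n, const float bias, const float* x, float* y) {
    // Look up (and cache) the kernel built for this length; for n == 8 or
    // n == 16 this can resolve to the AVX/AVX2/AVX512F paths added in this
    // patch, otherwise the generic scalar loop is used.
    const auto& ker =
        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(n);
    ker->Compute(n, bias, x, y);
  }

Only the 8- and 16-wide blocks get intrinsic specializations in this change;
other lengths run the generic scalar implementation, per the TODO left in
jit_kernel_blas.cc about completing the avx512 variants.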