From 084893a9a9d8fed901f0d19630bf021137fba235 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 27 Sep 2018 15:00:00 +0800
Subject: [PATCH] add vadd kernel

---
 paddle/fluid/operators/math/jit_kernel.cc     |  46 ++++---
 paddle/fluid/operators/math/jit_kernel.h      |   9 ++
 .../fluid/operators/math/jit_kernel_blas.cc   | 114 +++++++++++++++---
 .../fluid/operators/math/jit_kernel_test.cc   |  23 ++--
 4 files changed, 148 insertions(+), 44 deletions(-)
diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc
index 4fd1d179427..8859c0f7d8f 100644
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/jit_kernel.h"
+#include <iostream>
 #include <string>
 
 namespace paddle {
@@ -27,29 +28,35 @@ KernelPool& KernelPool::Instance() {
   return g_jit_kernels;
 }
 
-template <>
-const std::shared_ptr<VMulKernel<float>> KernelPool::Get<VMulKernel<float>>(
-    int d) {
-  std::string key = "f" + std::to_string(d);
+const std::shared_ptr<Kernel> KernelPool::Get(const std::string& key) const {
   if (kers_.find(key) == kers_.end()) {
-    auto p = std::make_shared<VMulKernel<float>>(d);
-    kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});
-    return p;
+    return nullptr;
   }
-  return std::dynamic_pointer_cast<VMulKernel<float>>(kers_.at(key));
+  return kers_.at(key);
 }
 
-template <>
-const std::shared_ptr<VMulKernel<double>> KernelPool::Get<VMulKernel<double>>(
-    int d) {
-  std::string key = "d" + std::to_string(d);
-  if (kers_.find(key) == kers_.end()) {
-    auto p = std::make_shared<VMulKernel<double>>(d);
-    kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});
-    return p;
+#define DEFINE_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key)        \
+  template <>                                                              \
+  const std::shared_ptr<ker_class<ker_dtype>>                              \
+  KernelPool::Get<ker_class<ker_dtype>>(int d) {                           \
+    std::string key = #ker_key #dtype_key + std::to_string(d);             \
+    if (kers_.find(key) == kers_.end()) {                                  \
+      auto p = std::make_shared<ker_class<ker_dtype>>(d);                  \
+      kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)});           \
+      return p;                                                            \
+    }                                                                      \
+    return std::dynamic_pointer_cast<ker_class<ker_dtype>>(kers_.at(key)); \
   }
-  return std::dynamic_pointer_cast<VMulKernel<double>>(kers_.at(key));
-}
+
+#define REGISTER_BLAS_JITKERNEL(ker_key, ker_class) \
+  DEFINE_WITH_DTYPE(ker_key, ker_class, float, f);  \
+  DEFINE_WITH_DTYPE(ker_key, ker_class, double, d)
+
+REGISTER_BLAS_JITKERNEL(vmul, VMulKernel);
+REGISTER_BLAS_JITKERNEL(vadd, VAddKernel);
+
+#undef REGISTER_BLAS_JITKERNEL
+#undef DEFINE_WITH_DTYPE
 
 template <>
 const std::shared_ptr<LSTMKernel<float>>
@@ -57,7 +64,8 @@ KernelPool::Get<LSTMKernel<float>, int, const std::string&, const std::string&,
                 const std::string&>(int d, const std::string& act_gate,
                                     const std::string& act_cand,
                                     const std::string& act_cell) {
-  std::string key = "f" + std::to_string(d) + act_gate + act_cand + act_cell;
+  std::string key =
+      "lstmf" + std::to_string(d) + act_gate + act_cand + act_cell;
   if (kers_.find(key) == kers_.end()) {
     auto p =
         std::make_shared<LSTMKernel<float>>(d, act_gate, act_cand, act_cell);
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 3849d29040b..610f6714041 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -54,6 +54,8 @@ class KernelPool {
   template <typename Ker, typename... ARGS>
   const std::shared_ptr<Ker> Get(ARGS... args);
 
+  const std::shared_ptr<Kernel> Get(const std::string &key) const;
+
  private:
   KernelPool() = default;
   std::unordered_map<std::string, std::shared_ptr<Kernel>> kers_;
@@ -68,6 +70,13 @@ class VMulKernel : public Kernel {
   void (*Compute)(const int n, const T *, const T *, T *);
 };
 
+template <typename T>
+class VAddKernel : public Kernel {
+ public:
+  explicit VAddKernel(int n);
+  void (*Compute)(const int n, const T *, const T *, T *);
+};
+
 template <typename T>
 class LSTMKernel : public Kernel {
  public:
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index 29394e31893..4ce60ffc043 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -74,15 +74,22 @@ namespace jit = platform::jit;
   FOR_EACH_ALL_BLOCK(macro_, jit::avx)     \
   FOR_EACH_ALL_BLOCK(macro_, jit::any)
 
-/* VMUL JitKernel */
-#define VMUL_ANY                \
-  for (int i = 0; i < n; ++i) { \
-    z[i] = x[i] * y[i];         \
+#define BIND_KERNEL_WITH_DTYPE(ker_class, ker_func, ker_dtype) \
+  template <>                                                  \
+  ker_class<ker_dtype>::ker_class(int d) {                     \
+    SEARCH_ISA_BLOCK(ker_func, ker_dtype);                     \
   }
 
+#define BIND_KERNEL(ker_class, ker_func)              \
+  BIND_KERNEL_WITH_DTYPE(ker_class, ker_func, float); \
+  BIND_KERNEL_WITH_DTYPE(ker_class, ker_func, double)
+
+/* VMUL JitKernel */
 template <typename T, platform::jit::cpu_isa_t isa, jit_block>
 static void VMulCompute(const int n, const T* x, const T* y, T* z) {
-  VMUL_ANY
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
 }
 
 #ifdef PADDLE_USE_MKLML
@@ -107,6 +114,8 @@ FOR_EACH_ISA_ALL_BLOCK(VMUL_MKL_DOUBLE)
 /// lt8
 #ifdef PADDLE_USE_MKLML
 VMUL_MKL_FLOAT(jit::avx, kLT8)
+VMUL_MKL_FLOAT(jit::avx2, kLT8)
+VMUL_MKL_FLOAT(jit::avx512f, kLT8)
 #endif
 
 /// eq8
@@ -143,20 +152,93 @@ VMUL_MKL_FLOAT(jit::avx2, kEQ16)
 VMUL_MKL_FLOAT(jit::avx512f, kEQ16)
 #endif
 
-#define USE_VMUL_KERNEL(T, func)     \
-  template <>                        \
-  VMulKernel<T>::VMulKernel(int d) { \
-    SEARCH_ISA_BLOCK(func, T);       \
-  }
-
-USE_VMUL_KERNEL(float, VMulCompute);
-USE_VMUL_KERNEL(double, VMulCompute);
-
-#undef VMUL_ANY
 #undef VMUL_INTRI8_FLOAT
 #undef VMUL_MKL_FLOAT
 #undef VMUL_MKL_DOUBLE
-#undef USE_VMUL_KERNEL
+
+/* VADD */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+static void VAddCompute(const int n, const T* x, const T* y, T* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] + y[i];
+  }
+}
+
+#ifdef PADDLE_USE_MKLML
+#define VADD_MKL_FLOAT(isa, block)                                 \
+  template <>                                                      \
+  void VAddCompute<float, isa, block>(const int n, const float* x, \
+                                      const float* y, float* z) {  \
+    platform::dynload::vsAdd(n, x, y, z);                          \
+  }
+
+#define VADD_MKL_DOUBLE(isa, block)                                  \
+  template <>                                                        \
+  void VAddCompute<double, isa, block>(const int n, const double* x, \
+                                       const double* y, float* z) {  \
+    platform::dynload::vdAdd(n, x, y, z);                            \
+  }
+
+FOR_EACH_ISA_COMMON_BLOCK(VADD_MKL_FLOAT)
+FOR_EACH_ISA_ALL_BLOCK(VADD_MKL_DOUBLE)
+#endif
+
+/// lt8
+#ifdef PADDLE_USE_MKLML
+VADD_MKL_FLOAT(jit::avx, kLT8)
+VADD_MKL_FLOAT(jit::avx2, kLT8)
+VADD_MKL_FLOAT(jit::avx512f, kLT8)
+#endif
+
+/// eq8
+#define VADD_INTRI8_FLOAT(isa)                                    \
+  template <>                                                     \
+  void VAddCompute<float, isa, kEQ8>(const int n, const float* x, \
+                                     const float* y, float* z) {  \
+    __m256 tmpx, tmpy;                                            \
+    tmpx = _mm256_loadu_ps(x);                                    \
+    tmpy = _mm256_loadu_ps(y);                                    \
+    tmpx = _mm256_add_ps(tmpx, tmpy);                             \
+    _mm256_storeu_ps(z, tmpx);                                    \
+  }
+
+// mkl > avx > for, ">" means better
+#ifdef PADDLE_USE_MKLML
+VADD_MKL_FLOAT(jit::avx, kEQ8)
+#elif defined __AVX__
+VADD_INTRI8_FLOAT(jit::avx)
+#endif
+// avx2 > mkl > for
+#ifdef __AVX2__
+VADD_INTRI8_FLOAT(jit::avx2)
+#elif defined PADDLE_USE_MKLML
+VADD_MKL_FLOAT(jit::avx2, kEQ8)
+#endif
+// TODO(TJ): test and complete avx512
+
+/// eq16
+#ifdef PADDLE_USE_MKLML
+// TODO(TJ): test and complete me
+VADD_MKL_FLOAT(jit::avx, kEQ16)
+VADD_MKL_FLOAT(jit::avx2, kEQ16)
+VADD_MKL_FLOAT(jit::avx512f, kEQ16)
+#endif
+
+#undef VADD_INTRI8_FLOAT
+#undef VADD_MKL_FLOAT
+#undef VADD_MKL_DOUBLE
+
+BIND_KERNEL(VMulKernel, VMulCompute);
+BIND_KERNEL(VAddKernel, VAddCompute);
+
+#undef BIND_KERNEL
+#undef BIND_KERNEL_WITH_DTYPE
+#undef FOR_EACH_ISA_ALL_BLOCK
+#undef FOR_EACH_ALL_BLOCK
+#undef FOR_EACH_ISA_COMMON_BLOCK
+#undef FOR_EACH_COMMON_BLOCK
+#undef SEARCH_ISA_BLOCK
+#undef SEARCH_BLOCK
 
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 041234442d3..6b250291018 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -23,25 +23,30 @@ TEST(JitKernel, pool) {
   namespace jit = paddle::operators::math::jitkernel;
   const int frame_size = 4;
   std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh";
-  const auto& p1 =
+  const auto& plstm1 =
       jit::KernelPool::Instance()
           .template Get<jit::LSTMKernel<float>, int, const std::string&,
                         const std::string&, const std::string&>(
               frame_size, act_gate, act_cand, act_cell);
-  const auto& p2 =
+  const auto& plstm2 =
       jit::KernelPool::Instance()
           .template Get<jit::LSTMKernel<float>, int, const std::string&,
                         const std::string&, const std::string&>(
               frame_size, act_gate, act_cand, act_cell);
-  EXPECT_EQ(p1, p2);
+  EXPECT_EQ(plstm1, plstm2);
 
-  const auto& p3 =
+  const auto& pvmul_f =
       jit::KernelPool::Instance().template Get<jit::VMulKernel<float>>(4);
-  EXPECT_TRUE(std::dynamic_pointer_cast<jit::Kernel>(p2) !=
-              std::dynamic_pointer_cast<jit::Kernel>(p3));
+  EXPECT_TRUE(std::dynamic_pointer_cast<jit::Kernel>(plstm2) !=
+              std::dynamic_pointer_cast<jit::Kernel>(pvmul_f));
 
-  const auto& p4 =
+  const auto& pvmul_d =
       jit::KernelPool::Instance().template Get<jit::VMulKernel<double>>(4);
-  EXPECT_TRUE(std::dynamic_pointer_cast<jit::Kernel>(p3) !=
-              std::dynamic_pointer_cast<jit::Kernel>(p4));
+  EXPECT_TRUE(std::dynamic_pointer_cast<jit::Kernel>(pvmul_f) !=
+              std::dynamic_pointer_cast<jit::Kernel>(pvmul_d));
+
+  const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4");
+  EXPECT_TRUE(pvmul_f == pvmul_from_key);
+  const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5");
+  EXPECT_TRUE(pvmul_from_key2 == nullptr);
 }
-- 
GitLab