From eeff268a6c0f9ed75344189e347d5956d38a4b9e Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 26 Sep 2018 17:37:31 +0800
Subject: [PATCH] clean and refine kernels

---
 paddle/fluid/operators/math/CMakeLists.txt    |   2 +-
 paddle/fluid/operators/math/jit_kernel.cc     | 165 ------------------
 paddle/fluid/operators/math/jit_kernel.h      |   2 -
 .../fluid/operators/math/jit_kernel_blas.cc   | 164 +++++++++++++++++
 paddle/fluid/operators/math/jit_kernel_impl.h |  27 ---
 .../fluid/operators/math/jit_kernel_lstm.cc   |  76 ++++++++
 6 files changed, 241 insertions(+), 195 deletions(-)
 create mode 100644 paddle/fluid/operators/math/jit_kernel_blas.cc
 delete mode 100644 paddle/fluid/operators/math/jit_kernel_impl.h
 create mode 100644 paddle/fluid/operators/math/jit_kernel_lstm.cc

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 4678b008d79..9763d14d54a 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -76,5 +76,5 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-cc_library(jit_kernel SRCS jit_kernel.cc DEPS cpu_info cblas)
+cc_library(jit_kernel SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_lstm.cc DEPS cpu_info cblas)
 cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc
index 71b1ffc6670..4fd1d179427 100644
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -13,17 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <functional>
 #include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -36,115 +26,6 @@ KernelPool& KernelPool::Instance() {
   static KernelPool g_jit_kernels;
   return g_jit_kernels;
 }
-#define SEARCH_BLOCK(src, t, isa)                             \
-  if (d < AVX_FLOAT_BLOCK) {                                  \
-    Compute = src<t, isa, kLT8>;                              \
-  } else if (d == AVX_FLOAT_BLOCK) {                          \
-    Compute = src<t, isa, kEQ8>;                              \
-  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
-    Compute = src<t, isa, kGT8LT16>;                          \
-  } else if (d == AVX512_FLOAT_BLOCK) {                       \
-    Compute = src<t, isa, kEQ16>;                             \
-  } else {                                                    \
-    Compute = src<t, isa, kGT16>;                             \
-  }
-
-#define SEARCH_ISA_BLOCK(src, t)        \
-  if (jit::MayIUse(jit::avx512f)) {     \
-    SEARCH_BLOCK(src, t, jit::avx512f); \
-  } else if (jit::MayIUse(jit::avx2)) { \
-    SEARCH_BLOCK(src, t, jit::avx2);    \
-  } else if (jit::MayIUse(jit::avx)) {  \
-    SEARCH_BLOCK(src, t, jit::avx);     \
-  } else {                              \
-    SEARCH_BLOCK(src, t, jit::isa_any); \
-  }
-
-// do not include lt8, eq8, eq16
-#define FOR_EACH_COMMON_BLOCK(macro_, isa) \
-  macro_(isa, kGT8LT16) macro_(isa, kGT16)
-
-#define FOR_EACH_ISA_COMMON_BLOCK(macro_) \
-  FOR_EACH_BLOCK(macro_, jit::avx512f)    \
-  FOR_EACH_BLOCK(macro_, jit::avx2)       \
-  FOR_EACH_BLOCK(macro_, jit::avx)        \
-  FOR_EACH_BLOCK(macro_, jit::any)
-
-#define VMUL_ANY                \
-  for (int i = 0; i < n; ++i) { \
-    z[i] = x[i] * y[i];         \
-  }
-
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
-static void VMulCompute(const int n, const T* x, const T* y, T* z) {
-  VMUL_ANY
-}
-
-#ifdef PADDLE_USE_MKLML
-#define DEFINE_VMUL_COMPUTE_FLOAT(isa, block)                      \
-  template <>                                                      \
-  void VMulCompute<float, isa, block>(const int n, const float* x, \
-                                      const float* y, float* z) {  \
-    platform::dynload::vsMul(n, x, y, z);                          \
-  }
-
-#define DEFINE_VMUL_COMPUTE_DOUBLE(isa, block)                       \
-  template <>                                                        \
-  void VMulCompute<double, isa, block>(const int n, const double* x, \
-                                       const double* y, float* z) {  \
-    platform::dynload::vdMul(n, x, y, z);                            \
-  }
-
-FOR_EACH_ISA_COMMON_BLOCK(DEFINE_VMUL_COMPUTE_FLOAT)
-FOR_EACH_ISA_COMMON_BLOCK(DEFINE_VMUL_COMPUTE_DOUBLE)
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kLT8)
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kEQ16)
-#endif
-
-// mkl > avx > for, ">" means better
-#ifdef PADDLE_USE_MKLML
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kEQ8)
-#elif defined __AVX__
-template <>
-void VMulCompute<float, jit::avx, kEQ8>(const int n, const float* x,
-                                        const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_mul_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#endif
-
-// avx2 > mkl > for
-#ifdef __AVX2__
-template <>
-void VMulCompute<float, jit::avx2, kEQ8>(const int n, const float* x,
                                         const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_mul_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#elif defined PADDLE_USE_MKLML
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx2, kEQ8)
-#endif
-// TODO(TJ): test and complete avx512
-
-#undef DEFINE_VMUL_COMPUTE_FLOAT
-#undef DEFINE_VMUL_COMPUTE_DOUBLE
-#undef VMUL_ANY
-
-template <>
-VMulKernel<float>::VMulKernel(int d) {
-  SEARCH_ISA_BLOCK(VMulCompute, float);
-}
-
-template <>
-VMulKernel<double>::VMulKernel(int d) {
-  SEARCH_ISA_BLOCK(VMulCompute, double);
-}
 
 template <>
 const std::shared_ptr<VMulKernel<float>> KernelPool::Get<VMulKernel<float>>(
@@ -170,52 +51,6 @@ const std::shared_ptr<VMulKernel<float>> KernelPool::Get<VMulKernel<float>>(
   return std::dynamic_pointer_cast<VMulKernel<float>>(kers_.at(key));
 }
 
-template <>
-LSTMKernel<float>::LSTMKernel(int d, const std::string& act_gate_str,
-                              const std::string& act_cand_str,
-                              const std::string& act_cell_str)
-    : Kernel(), d_(d) {
-  d2_ = d * 2;
-  d3_ = d * 3;
-  if (platform::jit::MayIUse(platform::jit::avx512f)) {
-    math::VecActivations<float, platform::jit::avx512f> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  } else if (platform::jit::MayIUse(platform::jit::avx2)) {
-    math::VecActivations<float, platform::jit::avx2> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  } else if (platform::jit::MayIUse(platform::jit::avx)) {
-    math::VecActivations<float, platform::jit::avx> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-    // ComputeCtHt = [&](float*gates,const float*ct_1,float*ct, float*ht) {
-    //   // gates: W_ch, W_ih, W_fh, W_oh
-    //   act_gate(d3_, gates + d_, gates + d_);

-    //   /* C_t = C_t-1 * fgated + cand_gated * igated */
-    //   act_cand(d_, gates, gates);
-    //   blas.VMUL(d_, gates, gates + d_, gates + d_);
-    //   blas.VMUL(d_, ct_1, gates + d2_, gates + d2_);
-    //   blas.VADD(d_, gates + d_, gates + d2_, ct);

-    //   /* H_t = act_cell(C_t) * ogated */
-    //   act_cell(d_, ct, gates + d2_);
-    //   blas.VMUL(d_, gates + d2_, gates + d3_, ht)
-    //   GET_Ct(ct_1, gates, ct);
-    //   GET_Ht(ct, gates, ht);
-    // };
-  } else {
-    math::VecActivations<float, platform::jit::isa_any> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  }
-}
-
 template <>
 const std::shared_ptr<LSTMKernel<float>>
 KernelPool::Get<LSTMKernel<float>, int, const std::string&, const std::string&,
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index 6005ea76f41..3849d29040b 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -87,5 +87,3 @@ class LSTMKernel : public Kernel {
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
-
-#include "paddle/fluid/operators/math/jit_kernel_impl.h"
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
new file mode 100644
index 00000000000..29394e31893
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -0,0 +1,164 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <string>
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+#define SEARCH_BLOCK(src, t, isa)                             \
+  if (d < AVX_FLOAT_BLOCK) {                                  \
+    Compute = src<t, isa, kLT8>;                              \
+  } else if (d == AVX_FLOAT_BLOCK) {                          \
+    Compute = src<t, isa, kEQ8>;                              \
+  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
+    Compute = src<t, isa, kGT8LT16>;                          \
+  } else if (d == AVX512_FLOAT_BLOCK) {                       \
+    Compute = src<t, isa, kEQ16>;                             \
+  } else {                                                    \
+    Compute = src<t, isa, kGT16>;                             \
+  }
+
+#define SEARCH_ISA_BLOCK(src, t)        \
+  if (jit::MayIUse(jit::avx512f)) {     \
+    SEARCH_BLOCK(src, t, jit::avx512f); \
+  } else if (jit::MayIUse(jit::avx2)) { \
+    SEARCH_BLOCK(src, t, jit::avx2);    \
+  } else if (jit::MayIUse(jit::avx)) {  \
+    SEARCH_BLOCK(src, t, jit::avx);     \
+  } else {                              \
+    SEARCH_BLOCK(src, t, jit::isa_any); \
+  }
+
+// do not include lt8, eq8, eq16
+#define FOR_EACH_COMMON_BLOCK(macro_, isa) \
+  macro_(isa, kGT8LT16) macro_(isa, kGT16)
+
+#define FOR_EACH_ISA_COMMON_BLOCK(macro_)     \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx512f) \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx2)    \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx)     \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::isa_any)
+
+#define FOR_EACH_ALL_BLOCK(macro_, isa)                                        \
+  macro_(isa, kLT8) macro_(isa, kEQ8) macro_(isa, kGT8LT16) macro_(isa, kEQ16) \
+      macro_(isa, kGT16)
+
+#define FOR_EACH_ISA_ALL_BLOCK(macro_)     \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx512f) \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx2)    \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx)     \
+  FOR_EACH_ALL_BLOCK(macro_, jit::isa_any)
+
+/* VMUL JitKernel */
+#define VMUL_ANY                \
+  for (int i = 0; i < n; ++i) { \
+    z[i] = x[i] * y[i];         \
+  }
+
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+static void VMulCompute(const int n, const T* x, const T* y, T* z) {
+  VMUL_ANY
+}
+
+#ifdef PADDLE_WITH_MKLML
+#define VMUL_MKL_FLOAT(isa, block)                                 \
+  template <>                                                      \
+  void VMulCompute<float, isa, block>(const int n, const float* x, \
+                                      const float* y, float* z) {  \
+    platform::dynload::vsMul(n, x, y, z);                          \
+  }
+
+#define VMUL_MKL_DOUBLE(isa, block)                                  \
+  template <>                                                        \
+  void VMulCompute<double, isa, block>(const int n, const double* x, \
+                                       const double* y, double* z) { \
+    platform::dynload::vdMul(n, x, y, z);                            \
+  }
+
+FOR_EACH_ISA_COMMON_BLOCK(VMUL_MKL_FLOAT)
+FOR_EACH_ISA_ALL_BLOCK(VMUL_MKL_DOUBLE)
+#endif
+
+/// lt8
+#ifdef PADDLE_WITH_MKLML
+VMUL_MKL_FLOAT(jit::avx, kLT8)
+#endif
+
+/// eq8
+#define VMUL_INTRI8_FLOAT(isa)                                    \
+  template <>                                                     \
+  void VMulCompute<float, isa, kEQ8>(const int n, const float* x, \
+                                     const float* y, float* z) {  \
+    __m256 tmpx, tmpy;                                            \
+    tmpx = _mm256_loadu_ps(x);                                    \
+    tmpy = _mm256_loadu_ps(y);                                    \
+    tmpx = _mm256_mul_ps(tmpx, tmpy);                             \
+    _mm256_storeu_ps(z, tmpx);                                    \
+  }
+
+// mkl > avx > for, ">" means better
+#ifdef PADDLE_WITH_MKLML
+VMUL_MKL_FLOAT(jit::avx, kEQ8)
+#elif defined __AVX__
+VMUL_INTRI8_FLOAT(jit::avx)
+#endif
+// avx2 > mkl > for
+#ifdef __AVX2__
+VMUL_INTRI8_FLOAT(jit::avx2)
+#elif defined PADDLE_WITH_MKLML
+VMUL_MKL_FLOAT(jit::avx2, kEQ8)
+#endif
+// TODO(TJ): test and complete avx512
+
+/// eq16
+#ifdef PADDLE_WITH_MKLML
+// TODO(TJ): test and complete me
+VMUL_MKL_FLOAT(jit::avx, kEQ16)
+VMUL_MKL_FLOAT(jit::avx2, kEQ16)
+VMUL_MKL_FLOAT(jit::avx512f, kEQ16)
+#endif
+
+#define USE_VMUL_KERNEL(T, func)     \
+  template <>                        \
+  VMulKernel<T>::VMulKernel(int d) { \
+    SEARCH_ISA_BLOCK(func, T);       \
+  }
+
+USE_VMUL_KERNEL(float, VMulCompute);
+USE_VMUL_KERNEL(double, VMulCompute);
+
+#undef VMUL_ANY
+#undef VMUL_INTRI8_FLOAT
+#undef VMUL_MKL_FLOAT
+#undef VMUL_MKL_DOUBLE
+#undef USE_VMUL_KERNEL
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h
deleted file mode 100644
index 46fef31ff03..00000000000
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <functional>
-#include <memory>
-#include <string>
-#include "paddle/fluid/platform/cpu_info.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_lstm.cc
new file mode 100644
index 00000000000..895784a4fa6
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <functional>
+#include <string>
+#include "paddle/fluid/operators/math/cpu_vec.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+template <>
+LSTMKernel<float>::LSTMKernel(int d, const std::string& act_gate_str,
+                              const std::string& act_cand_str,
+                              const std::string& act_cell_str)
+    : Kernel(), d_(d) {
+  d2_ = d * 2;
+  d3_ = d * 3;
+  if (platform::jit::MayIUse(platform::jit::avx512f)) {
+    math::VecActivations<float, platform::jit::avx512f> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  } else if (platform::jit::MayIUse(platform::jit::avx2)) {
+    math::VecActivations<float, platform::jit::avx2> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  } else if (platform::jit::MayIUse(platform::jit::avx)) {
+    math::VecActivations<float, platform::jit::avx> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+    // ComputeCtHt = [&](float*gates,const float*ct_1,float*ct, float*ht) {
+    //   // gates: W_ch, W_ih, W_fh, W_oh
+    //   act_gate(d3_, gates + d_, gates + d_);
+
+    //   /* C_t = C_t-1 * fgated + cand_gated * igated */
+    //   act_cand(d_, gates, gates);
+    //   blas.VMUL(d_, gates, gates + d_, gates + d_);
+    //   blas.VMUL(d_, ct_1, gates + d2_, gates + d2_);
+    //   blas.VADD(d_, gates + d_, gates + d2_, ct);
+
+    //   /* H_t = act_cell(C_t) * ogated */
+    //   act_cell(d_, ct, gates + d2_);
+    //   blas.VMUL(d_, gates + d2_, gates + d3_, ht);
+    //   GET_Ct(ct_1, gates, ct);
+    //   GET_Ht(ct, gates, ht);
+    // };
+  } else {
+    math::VecActivations<float, platform::jit::isa_any> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  }
+}
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
-- 
GitLab
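
For reference, below is a minimal sketch of how the kernels touched by this patch are meant to be consumed, mirroring the KernelPool::Get<VMulKernel<float>> interface visible in the hunks above. The call site itself (the VMulUsageSketch function name, the jitk alias, and the test data) is illustrative and not part of the patch:

#include "paddle/fluid/operators/math/jit_kernel.h"
#include <vector>

namespace jitk = paddle::operators::math::jitkernel;

void VMulUsageSketch() {
  const int d = 8;  // on an AVX machine, d == 8 selects the kEQ8 code path
  std::vector<float> x(d, 2.f), y(d, 3.f), z(d);
  // The pool caches one kernel instance per (kernel type, size) key, so the
  // ISA/block dispatch done by SEARCH_ISA_BLOCK runs once at construction,
  // and repeated Get calls with the same d return the same shared instance.
  const auto ker = jitk::KernelPool::Instance().Get<jitk::VMulKernel<float>>(d);
  // Compute is the function pointer selected at construction time.
  ker->Compute(d, x.data(), y.data(), z.data());  // z[i] = x[i] * y[i]
}

Resolving the ISA and block size once, when the kernel is constructed, is what keeps the dispatch macros above out of the per-call hot path.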