clean and refine kernels

eeff268a · tensor-tang · dee5d35c · eeff268a · eeff268a · eeff268a
6 changed files
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -76,5 +76,5 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-cc_library(jit_kernel SRCS jit_kernel.cc DEPS cpu_info cblas)
+cc_library(jit_kernel SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_lstm.cc DEPS cpu_info cblas)
 cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -13,17 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/jit_kernel.h"
-#include <functional>
 #include <string>
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 namespace paddle {
 namespace operators {
@@ -36,115 +26,6 @@ KernelPool& KernelPool::Instance() {
  static KernelPool g_jit_kernels;
  return g_jit_kernels;
 }
-#define SEARCH_BLOCK(src, t, isa)                             \
-  if (d < AVX_FLOAT_BLOCK) {                                  \
-    Compute = src<t, isa, kLT8>;                              \
-  } else if (d == AVX_FLOAT_BLOCK) {                          \
-    Compute = src<t, isa, kEQ8>;                              \
-  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
-    Compute = src<t, isa, kGT8LT16>;                          \
-  } else if (d == AVX512_FLOAT_BLOCK) {                       \
-    Compute = src<t, isa, kEQ16>;                             \
-  } else {                                                    \
-    Compute = src<t, isa, kGT16>;                             \
-  }
-#define SEARCH_ISA_BLOCK(src, t)        \
-  if (jit::MayIUse(jit::avx512f)) {     \
-    SEARCH_BLOCK(src, t, jit::avx512f); \
-  } else if (jit::MayIUse(jit::avx2)) { \
-    SEARCH_BLOCK(src, t, jit::avx2);    \
-  } else if (jit::MayIUse(jit::avx)) {  \
-    SEARCH_BLOCK(src, t, jit::avx);     \
-  } else {                              \
-    SEARCH_BLOCK(src, t, jit::isa_any); \
-  }
-// do not include lt8, eq8, eq16
-#define FOR_EACH_COMMON_BLOCK(macro_, isa) \
-  macro_(isa, kGT8LT16) macro_(isa, kGT16)
-#define FOR_EACH_ISA_COMMON_BLOCK(macro_) \
-  FOR_EACH_BLOCK(macro_, jit::avx512f)    \
-  FOR_EACH_BLOCK(macro_, jit::avx2)       \
-  FOR_EACH_BLOCK(macro_, jit::avx)        \
-  FOR_EACH_BLOCK(macro_, jit::any)
-#define VMUL_ANY                \
-  for (int i = 0; i < n; ++i) { \
-    z[i] = x[i] * y[i];         \
-  }
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
-static void VMulCompute(const int n, const T* x, const T* y, T* z) {
-  VMUL_ANY
-}
-#ifdef PADDLE_USE_MKLML
-#define DEFINE_VMUL_COMPUTE_FLOAT(isa, block)                      \
-  template <>                                                      \
-  void VMulCompute<float, isa, block>(const int n, const float* x, \
-                                      const float* y, float* z) {  \
-    platform::dynload::vsMul(n, x, y, z);                          \
-  }
-#define DEFINE_VMUL_COMPUTE_DOUBLE(isa, block)                       \
-  template <>                                                        \
-  void VMulCompute<double, isa, block>(const int n, const double* x, \
-                                       const double* y, float* z) {  \
-    platform::dynload::vdMul(n, x, y, z);                            \
-  }
-FOR_EACH_ISA_COMMON_BLOCK(DEFINE_VMUL_COMPUTE_FLOAT)
-FOR_EACH_ISA_COMMON_BLOCK(DEFINE_VMUL_COMPUTE_DOUBLE)
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kLT8)
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kEQ16)
-#endif
-// mkl > avx > for, ">" means better
-#ifdef PADDLE_USE_MKLML
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx, kEQ8)
-#elif defined __AVX__
-template <>
-void VMulCompute<float, jit::avx, kEQ8>(const int n, const float* x,
-                                        const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_mul_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#endif
-// avx2 > mkl > for
-#ifdef __AVX2__
-template <>
-void VMulCompute<float, jit::avx2, kEQ8>(const int n, const float* x,
-                                         const float* y, float* z) {
-  __m256 tmpx, tmpy;
-  tmpx = _mm256_loadu_ps(x);
-  tmpy = _mm256_loadu_ps(y);
-  tmpx = _mm256_mul_ps(tmpx, tmpy);
-  _mm256_storeu_ps(z, tmpx);
-}
-#elif defined PADDLE_USE_MKLML
-DEFINE_VMUL_COMPUTE_FLOAT(jit::avx2, kEQ8)
-#endif
-// TODO(TJ): test and complete avx512
-#undef DEFINE_VMUL_COMPUTE_FLOAT
-#undef DEFINE_VMUL_COMPUTE_DOUBLE
-#undef VMUL_ANY
-template <>
-VMulKernel<float>::VMulKernel(int d) {
-  SEARCH_ISA_BLOCK(VMulCompute, float);
-}
-template <>
-VMulKernel<double>::VMulKernel(int d) {
-  SEARCH_ISA_BLOCK(VMulCompute, double);
-}
 template <>
 const std::shared_ptr<VMulKernel<float>> KernelPool::Get<VMulKernel<float>>(
@@ -170,52 +51,6 @@ const std::shared_ptr<VMulKernel<double>> KernelPool::Get<VMulKernel<double>>(
  return std::dynamic_pointer_cast<VMulKernel<double>>(kers_.at(key));
 }
-template <>
-LSTMKernel<float>::LSTMKernel(int d, const std::string& act_gate_str,
-                              const std::string& act_cand_str,
-                              const std::string& act_cell_str)
-    : Kernel(), d_(d) {
-  d2_ = d * 2;
-  d3_ = d * 3;
-  if (platform::jit::MayIUse(platform::jit::avx512f)) {
-    math::VecActivations<float, platform::jit::avx512f> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  } else if (platform::jit::MayIUse(platform::jit::avx2)) {
-    math::VecActivations<float, platform::jit::avx2> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  } else if (platform::jit::MayIUse(platform::jit::avx)) {
-    math::VecActivations<float, platform::jit::avx> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-    //   ComputeCtHt = [&](float*gates,const float*ct_1,float*ct, float*ht) {
-    // // gates: W_ch, W_ih, W_fh, W_oh
-    // act_gate(d3_, gates + d_, gates + d_);
-    // /* C_t = C_t-1 * fgated + cand_gated * igated */
-    // act_cand(d_, gates, gates);
-    // blas.VMUL(d_, gates, gates + d_, gates + d_);
-    // blas.VMUL(d_, ct_1, gates + d2_, gates + d2_);
-    // blas.VADD(d_, gates + d_, gates + d2_, ct);
-    // /* H_t = act_cell(C_t) * ogated */
-    // act_cell(d_, ct, gates + d2_);
-    // blas.VMUL(d_, gates + d2_, gates + d3_, ht)
-    // GET_Ct(ct_1, gates, ct);
-    // GET_Ht(ct, gates, ht);
-    //   };
-  } else {
-    math::VecActivations<float, platform::jit::isa_any> act_functor;
-    act_gate_ = act_functor(act_gate_str);
-    act_cell_ = act_functor(act_cell_str);
-    act_cand_ = act_functor(act_cand_str);
-  }
-}
 template <>
 const std::shared_ptr<LSTMKernel<float>>
 KernelPool::Get<LSTMKernel<float>, int, const std::string&, const std::string&,

--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -87,5 +87,3 @@ class LSTMKernel : public Kernel {
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
-#include "paddle/fluid/operators/math/jit_kernel_impl.h"
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <string>
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace jit = platform::jit;
+// Assigns `Compute` to the specialization src<t, isa, block> whose block tag
+// corresponds to the size `d`, by comparing `d` against AVX_FLOAT_BLOCK and
+// AVX512_FLOAT_BLOCK. Both `d` and `Compute` must be in scope at the
+// expansion site (the macro does not declare them).
+#define SEARCH_BLOCK(src, t, isa)                             \
+  if (d < AVX_FLOAT_BLOCK) {                                  \
+    Compute = src<t, isa, kLT8>;                              \
+  } else if (d == AVX_FLOAT_BLOCK) {                          \
+    Compute = src<t, isa, kEQ8>;                              \
+  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
+    Compute = src<t, isa, kGT8LT16>;                          \
+  } else if (d == AVX512_FLOAT_BLOCK) {                       \
+    Compute = src<t, isa, kEQ16>;                             \
+  } else {                                                    \
+    Compute = src<t, isa, kGT16>;                             \
+  }
+// Runtime ISA dispatch: tries avx512f, then avx2, then avx via jit::MayIUse
+// and expands SEARCH_BLOCK for the first one that is usable; when none is,
+// falls back to jit::isa_any.
+#define SEARCH_ISA_BLOCK(src, t)        \
+  if (jit::MayIUse(jit::avx512f)) {     \
+    SEARCH_BLOCK(src, t, jit::avx512f); \
+  } else if (jit::MayIUse(jit::avx2)) { \
+    SEARCH_BLOCK(src, t, jit::avx2);    \
+  } else if (jit::MayIUse(jit::avx)) {  \
+    SEARCH_BLOCK(src, t, jit::avx);     \
+  } else {                              \
+    SEARCH_BLOCK(src, t, jit::isa_any); \
+  }
+// Iteration helpers: each one expands `macro_(isa, block)` once per listed
+// combination, so a specialization-defining macro can be stamped out in bulk.
+//
+// COMMON blocks do not include lt8, eq8, eq16 (those are handled one by one).
+#define FOR_EACH_COMMON_BLOCK(macro_, isa) \
+  macro_(isa, kGT8LT16) macro_(isa, kGT16)
+// The fallback ISA is spelled jit::isa_any, matching SEARCH_ISA_BLOCK; the
+// specializations generated here must use the same tag that the dispatch
+// macro selects, otherwise they would never be picked up.
+#define FOR_EACH_ISA_COMMON_BLOCK(macro_)     \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx512f) \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx2)    \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::avx)     \
+  FOR_EACH_COMMON_BLOCK(macro_, jit::isa_any)
+// ALL blocks: every block tag that SEARCH_BLOCK can select.
+#define FOR_EACH_ALL_BLOCK(macro_, isa)                                        \
+  macro_(isa, kLT8) macro_(isa, kEQ8) macro_(isa, kGT8LT16) macro_(isa, kEQ16) \
+      macro_(isa, kGT16)
+#define FOR_EACH_ISA_ALL_BLOCK(macro_)     \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx512f) \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx2)    \
+  FOR_EACH_ALL_BLOCK(macro_, jit::avx)     \
+  FOR_EACH_ALL_BLOCK(macro_, jit::isa_any)
+/* VMUL JitKernel */
+// Plain element-wise product over indices [0, n): z[i] = x[i] * y[i].
+#define VMUL_ANY                \
+  for (int i = 0; i < n; ++i) { \
+    z[i] = x[i] * y[i];         \
+  }
+// Primary template used for every (T, isa, block) combination that has no
+// explicit specialization. The isa and block parameters only serve as
+// dispatch tags here; the body is the scalar loop above.
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+static void VMulCompute(const int n, const T* x, const T* y, T* z) {
+  VMUL_ANY
+}
+// MKL-backed specializations of VMulCompute. Each macro defines one explicit
+// specialization for the given (isa, block) pair that forwards to the MKL
+// vector-multiply routine (vsMul for float, vdMul for double).
+// NOTE(review): the mklml header at the top of this file is included under
+// PADDLE_WITH_MKLML, while this section is guarded by PADDLE_USE_MKLML --
+// confirm that both macros are intended and defined together.
+#ifdef PADDLE_USE_MKLML
+#define VMUL_MKL_FLOAT(isa, block)                                 \
+  template <>                                                      \
+  void VMulCompute<float, isa, block>(const int n, const float* x, \
+                                      const float* y, float* z) {  \
+    platform::dynload::vsMul(n, x, y, z);                          \
+  }
+// The output pointer must be double* so that the signature matches the
+// primary template (T* z with T = double) and the argument type of vdMul.
+#define VMUL_MKL_DOUBLE(isa, block)                                  \
+  template <>                                                        \
+  void VMulCompute<double, isa, block>(const int n, const double* x, \
+                                       const double* y, double* z) { \
+    platform::dynload::vdMul(n, x, y, z);                            \
+  }
+// float: only the common blocks here; lt8/eq8/eq16 are chosen individually
+// further down. double: every block goes through MKL.
+FOR_EACH_ISA_COMMON_BLOCK(VMUL_MKL_FLOAT)
+FOR_EACH_ISA_ALL_BLOCK(VMUL_MKL_DOUBLE)
+#endif
+// Per-block float specializations. Which implementation is emitted for a
+// given (isa, block) pair is decided at compile time by the preprocessor
+// guards below; combinations without a specialization use the primary
+// template's scalar loop.
+/// lt8
+#ifdef PADDLE_USE_MKLML
+VMUL_MKL_FLOAT(jit::avx, kLT8)
+#endif
+/// eq8
+// Defines the float kEQ8 specialization for `isa` using the _mm256 unaligned
+// load / multiply / unaligned store intrinsics on x, y and z. The parameter
+// `n` is not read by this body.
+#define VMUL_INTRI8_FLOAT(isa)                                    \
+  template <>                                                     \
+  void VMulCompute<float, isa, kEQ8>(const int n, const float* x, \
+                                     const float* y, float* z) {  \
+    __m256 tmpx, tmpy;                                            \
+    tmpx = _mm256_loadu_ps(x);                                    \
+    tmpy = _mm256_loadu_ps(y);                                    \
+    tmpx = _mm256_mul_ps(tmpx, tmpy);                             \
+    _mm256_storeu_ps(z, tmpx);                                    \
+  }
+// mkl > avx > for, ">" means better
+// (for jit::avx: MKL if available, else the intrinsic version, else the loop)
+#ifdef PADDLE_USE_MKLML
+VMUL_MKL_FLOAT(jit::avx, kEQ8)
+#elif defined __AVX__
+VMUL_INTRI8_FLOAT(jit::avx)
+#endif
+// avx2 > mkl > for
+// (for jit::avx2: the intrinsic version if built with AVX2, else MKL, else
+// the loop)
+#ifdef __AVX2__
+VMUL_INTRI8_FLOAT(jit::avx2)
+#elif defined PADDLE_USE_MKLML
+VMUL_MKL_FLOAT(jit::avx2, kEQ8)
+#endif
+// TODO(TJ): test and complete avx512
+/// eq16
+#ifdef PADDLE_USE_MKLML
+// TODO(TJ): test and complete me
+VMUL_MKL_FLOAT(jit::avx, kEQ16)
+VMUL_MKL_FLOAT(jit::avx2, kEQ16)
+VMUL_MKL_FLOAT(jit::avx512f, kEQ16)
+#endif
+// Defines the VMulKernel<T>(int d) constructor specialization. The
+// constructor's only work is to set `Compute` to the `func` specialization
+// chosen by SEARCH_ISA_BLOCK from the runtime ISA and the size `d`.
+#define USE_VMUL_KERNEL(T, func)     \
+  template <>                        \
+  VMulKernel<T>::VMulKernel(int d) { \
+    SEARCH_ISA_BLOCK(func, T);       \
+  }
+USE_VMUL_KERNEL(float, VMulCompute);
+USE_VMUL_KERNEL(double, VMulCompute);
+// Drop the VMUL-specific helper macros so they do not leak past this section.
+// #undef of a macro that was never defined (the MKL ones when MKL support is
+// compiled out) is harmless.
+#undef VMUL_ANY
+#undef VMUL_INTRI8_FLOAT
+#undef VMUL_MKL_FLOAT
+#undef VMUL_MKL_DOUBLE
+#undef USE_VMUL_KERNEL
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/jit_kernel_impl.h
+++ b/paddle/fluid/operators/math/jit_kernel_impl.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <functional>
-#include <map>
-#include <string>
-#include "paddle/fluid/platform/cpu_info.h"
-namespace paddle {
-namespace operators {
-namespace math {
-namespace jitkernel {}  // namespace jitkernel
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/math/jit_kernel_lstm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <functional>
+#include <string>
+#include "paddle/fluid/operators/math/cpu_vec.h"
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+namespace jit = platform::jit;
+// float specialization of the LSTMKernel constructor.
+//
+// Stores the size `d` together with 2*d and 3*d, then resolves the three
+// activation names into callables through math::VecActivations instantiated
+// for the first ISA that platform::jit::MayIUse reports as usable
+// (avx512f, avx2, avx), or for isa_any when none is.
+//
+// Parameters:
+//   d            - size stored in d_ (d2_ and d3_ are derived from it).
+//   act_gate_str - activation name resolved into act_gate_.
+//   act_cand_str - activation name resolved into act_cand_.
+//   act_cell_str - activation name resolved into act_cell_.
+template <>
+LSTMKernel<float>::LSTMKernel(int d, const std::string& act_gate_str,
+                              const std::string& act_cand_str,
+                              const std::string& act_cell_str)
+    : Kernel(), d_(d) {
+  d2_ = d * 2;
+  d3_ = d * 3;
+  // The four branches differ only in the ISA tag given to VecActivations.
+  if (platform::jit::MayIUse(platform::jit::avx512f)) {
+    math::VecActivations<float, platform::jit::avx512f> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  } else if (platform::jit::MayIUse(platform::jit::avx2)) {
+    math::VecActivations<float, platform::jit::avx2> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  } else if (platform::jit::MayIUse(platform::jit::avx)) {
+    math::VecActivations<float, platform::jit::avx> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+    // NOTE(review): the commented-out code below looks like an unfinished
+    // draft of a fused Ct/Ht computation; it is inactive -- confirm whether
+    // it should be completed or removed.
+    //   ComputeCtHt = [&](float*gates,const float*ct_1,float*ct, float*ht) {
+    // // gates: W_ch, W_ih, W_fh, W_oh
+    // act_gate(d3_, gates + d_, gates + d_);
+    // /* C_t = C_t-1 * fgated + cand_gated * igated */
+    // act_cand(d_, gates, gates);
+    // blas.VMUL(d_, gates, gates + d_, gates + d_);
+    // blas.VMUL(d_, ct_1, gates + d2_, gates + d2_);
+    // blas.VADD(d_, gates + d_, gates + d2_, ct);
+    // /* H_t = act_cell(C_t) * ogated */
+    // act_cell(d_, ct, gates + d2_);
+    // blas.VMUL(d_, gates + d2_, gates + d3_, ht)
+    // GET_Ct(ct_1, gates, ct);
+    // GET_Ht(ct, gates, ht);
+    //   };
+  } else {
+    // No AVX-family ISA usable: use the isa_any instantiation.
+    math::VecActivations<float, platform::jit::isa_any> act_functor;
+    act_gate_ = act_functor(act_gate_str);
+    act_cell_ = act_functor(act_cell_str);
+    act_cand_ = act_functor(act_cand_str);
+  }
+}
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle