enable jitkernel mkl vexp, vsigmoid and vtanh

5e97be7b · tensor-tang · ae179269 · 5e97be7b · 5e97be7b · 5e97be7b
9 changed files
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -56,10 +56,6 @@ typedef enum {
  identity
 } operand_type;
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
 #define DECLARE_JIT_CODE(codename) \
  const char* name() const override { return #codename; }

--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -27,10 +27,6 @@ namespace paddle {
 namespace operators {
 namespace jit {
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
 template <KernelType KT, typename KernelTuples, typename PlaceType>
 inline typename std::enable_if<
    std::is_same<typename KernelTuples::data_type, float>::value &&

--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -13,6 +13,7 @@
 * limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/platform/macros.h"
 namespace paddle {

--- a/paddle/fluid/operators/jit/macro.h
+++ b/paddle/fluid/operators/jit/macro.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+#pragma once
+#include <type_traits>
+namespace paddle {
+namespace operators {
+namespace jit {  // note: the #defines below are preprocessor macros, so this namespace does not scope them
+#define SIGMOID_THRESHOLD_MIN -40.0  // lower clamp bound applied to sigmoid inputs (see mkl.h VSigmoid)
+#define SIGMOID_THRESHOLD_MAX 13.0  // upper clamp bound applied to sigmoid inputs (see mkl.h VSigmoid)
+#define EXP_MAX_INPUT 40.0  // NOTE(review): presumably an input cap for exp kernels; no use visible here
+#define XMM_FLOAT_BLOCK 4  // NOTE(review): presumably floats per 128-bit XMM register - confirm
+#define YMM_FLOAT_BLOCK 8  // NOTE(review): presumably floats per 256-bit YMM register - confirm
+#define ZMM_FLOAT_BLOCK 16  // NOTE(review): presumably floats per 512-bit ZMM register - confirm
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -6,3 +6,6 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE
 USE_JITKERNEL_MORE(vmul, mkl)
 USE_JITKERNEL_MORE(vadd, mkl)
 USE_JITKERNEL_MORE(vscal, mkl)
+USE_JITKERNEL_MORE(vexp, mkl)
+USE_JITKERNEL_MORE(vsigmoid, mkl)
+USE_JITKERNEL_MORE(vtanh, mkl)
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -62,6 +62,16 @@ void VScal<double>(const double* a, const double* x, double* y, int n) {
  }
 }
+template <>  // float specialization of VExp
+void VExp<float>(const float* x, float* y, int n) {
+  platform::dynload::vsExp(n, x, y);  // forwards (n, x, y) unchanged; presumably MKL's element-wise exp - confirm
+}
+template <>  // double specialization of VExp
+void VExp<double>(const double* x, double* y, int n) {
+  platform::dynload::vdExp(n, x, y);  // forwards (n, x, y) unchanged; presumably MKL's element-wise exp - confirm
+}
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool VMulKernel<float>::UseMe(int d) const {
@@ -78,6 +88,21 @@ bool VScalKernel<float>::UseMe(int d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
 }
+template <>  // selection predicate for the float VExp MKL kernel
+bool VExpKernel<float>::UseMe(int d) const {
+  return d > 7;  // true only when d > 7; NOTE(review): d is presumably the element count - confirm
+}
+template <>  // selection predicate for the float VSigmoid MKL kernel
+bool VSigmoidKernel<float>::UseMe(int d) const {
+  return d > 7;  // true only when d > 7; NOTE(review): d is presumably the element count - confirm
+}
+template <>  // selection predicate for the float VTanh MKL kernel
+bool VTanhKernel<float>::UseMe(int d) const {
+  return d > 7;  // true only when d > 7; NOTE(review): d is presumably the element count - confirm
+}
 #define AWALYS_USE_ME_WITH_DOUBLE(func)           \
  template <>                                     \
  bool func##Kernel<double>::UseMe(int d) const { \
@@ -87,6 +112,9 @@ bool VScalKernel<float>::UseMe(int d) const {
 AWALYS_USE_ME_WITH_DOUBLE(VMul);
 AWALYS_USE_ME_WITH_DOUBLE(VAdd);
 AWALYS_USE_ME_WITH_DOUBLE(VScal);
+AWALYS_USE_ME_WITH_DOUBLE(VExp);
+AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
+AWALYS_USE_ME_WITH_DOUBLE(VTanh);
 #undef AWALYS_USE_ME_WITH_DOUBLE
 }  // namespace mkl
@@ -104,5 +132,8 @@ namespace mkl = paddle::operators::jit::more::mkl;
 REGISTER_MKL_KERNEL(vmul, VMul);
 REGISTER_MKL_KERNEL(vadd, VAdd);
 REGISTER_MKL_KERNEL(vscal, VScal);
+REGISTER_MKL_KERNEL(vexp, VExp);
+REGISTER_MKL_KERNEL(vsigmoid, VSigmoid);
+REGISTER_MKL_KERNEL(vtanh, VTanh);
 #undef REGISTER_MKL_KERNEL
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -32,6 +32,34 @@ void VAdd(const T* x, const T* y, T* z, int n);
 template <typename T>
 void VScal(const T* a, const T* x, T* y, int n);
+template <typename T>  // declaration only; float/double specializations are defined in mkl.cc
+void VExp(const T* x, T* y, int n);  // x: input, y: output, n: count; VSigmoid calls this with x == y
+template <typename T>  // sigmoid over n elements: y[i] = 1 / (1 + exp(-clamp(x[i])))
+void VSigmoid(const T* x, T* y, int n) {  // y doubles as scratch space; VTanh calls this with x == y
+  const T min = SIGMOID_THRESHOLD_MIN;  // lower clamp bound for the input
+  const T max = SIGMOID_THRESHOLD_MAX;  // upper clamp bound for the input
+  for (int i = 0; i < n; ++i) {  // pass 1: build the exponent argument in y
+    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);  // clamp x[i] into [min, max]
+    y[i] = static_cast<T>(0) - y[i];  // negate the clamped value
+  }
+  VExp(y, y, n);  // pass 2: apply VExp in place on y
+  for (int i = 0; i < n; ++i) {  // pass 3: finish the sigmoid
+    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);  // 1 / (1 + e), e being the VExp result
+  }
+}
+template <typename T>  // tanh over n elements via the identity tanh(x) = 2 * sigmoid(2x) - 1
+void VTanh(const T* x, T* y, int n) {  // y is used as scratch space between the passes
+  for (int i = 0; i < n; ++i) {  // pass 1: scale the input
+    y[i] = static_cast<T>(2) * x[i];  // y = 2x
+  }
+  VSigmoid(y, y, n);  // pass 2: in-place sigmoid; its input clamp therefore applies to 2x, not x
+  for (int i = 0; i < n; ++i) {  // pass 3: map sigmoid output to the tanh range
+    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);  // 2 * sigmoid(2x) - 1
+  }
+}
 #define DECLARE_MKL_KERNEL(name, tuples)                      \
  template <typename T>                                       \
  class name##Kernel : public KernelImpl<tuples<T>> {         \
@@ -47,6 +75,11 @@ DECLARE_MKL_KERNEL(VAdd, XYZNTuples);
 // AXYN
 DECLARE_MKL_KERNEL(VScal, AXYNTuples);
+// XYN
+DECLARE_MKL_KERNEL(VExp, XYNTuples);
+DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
+DECLARE_MKL_KERNEL(VTanh, XYNTuples);
 #undef DECLARE_MKL_KERNEL
 }  // namespace mkl

--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -312,7 +312,7 @@ void TestXYNKernel() {
    std::vector<T> x(d), yref(d);
    std::vector<T> xinp(d);  // inplace test
-    RandomVec<T>(d, x.data());
+    RandomVec<T>(d, x.data(), -2.f, 2.f);
    std::copy(x.begin(), x.end(), xinp.begin());
    const T* x_data = x.data();

--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -30,47 +30,6 @@ namespace operators {
 namespace math {
 namespace jitkernel {
-#ifdef PADDLE_WITH_MKLML
-// try to use MKL to speedup
-template <typename T>
-void VExpMKL(const T* x, T* y, int n);
-template <>
-void VExpMKL<float>(const float* x, float* y, int n) {
-  platform::dynload::vsExp(n, x, y);
-}
-template <>
-void VExpMKL<double>(const double* x, double* y, int n) {
-  platform::dynload::vdExp(n, x, y);
-}
-template <typename T>
-void VSigmoidMKL(const T* x, T* y, int n) {
-  const T min = SIGMOID_THRESHOLD_MIN;
-  const T max = SIGMOID_THRESHOLD_MAX;
-  for (int i = 0; i < n; ++i) {
-    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
-    y[i] = static_cast<T>(0) - y[i];
-  }
-  VExpMKL(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
-  }
-}
-template <typename T>
-void VTanhMKL(const T* x, T* y, int n) {
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * x[i];
-  }
-  VSigmoidMKL(y, y, n);
-  for (int i = 0; i < n; ++i) {
-    y[i] = static_cast<T>(2) * y[i] - static_cast<T>(1);
-  }
-}
-#endif
 /* VExp JitKernel */
 template <typename T>
 class VExpKernelImpl : public VExpKernel<T> {