add softmax mix and mkl code

test=develop

add softmax mix and mkl code
test=develop
7383eefd · tensor-tang · 50945685 · 7383eefd · 7383eefd · 7383eefd
6 changed file
--- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix)
 USE_JITKERNEL_MORE(kGRUH1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart2, mix)
+USE_JITKERNEL_MORE(kSoftmax, mix)
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -48,6 +48,27 @@ void VTanh(const T* x, T* y, int n) {
  compute_addbias(&b, y, y, n);
 }

+void Softmax(const T* x, T* y, int n, int bs) {
+  auto compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
+  auto compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vexp =
+      Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  for (int i = 0; i < bs; ++i) {
+    T scalar;
+    compute_hmax(x, &scalar, n);
+    scalar = static_cast<T>(0) - scalar;
+    compute_vaddbias(&scalar, x, y, n);  // x - max
+    compute_vexp(y, y, n);
+    compute_hsum(y, &scalar, n);
+    scalar = static_cast<T>(1) / scalar;
+    compute_vscal(&scalar, y, y, n);
+    x += n;
+    y += n;
+  }
+}
+
 void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  if (type == kVSigmoid) {
    return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
@@ -184,6 +205,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; }

 bool VTanhKernel::UseMe(const int& d) const { return true; }

+bool SoftmaxKernel::UseMe(const int& d) const { return true; }
+
 bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }

 bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
@@ -207,6 +230,7 @@ namespace mix = paddle::operators::jit::more::mix;

 REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MORE_KERNEL(kVTanh, VTanh);
+REGISTER_MORE_KERNEL(kSoftmax, Softmax);
 REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
 REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
 REGISTER_MORE_KERNEL(kGRUH1, GRUH1);

--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -26,6 +26,7 @@ using T = float;

 void VSigmoid(const T* x, T* y, int n);
 void VTanh(const T* x, T* y, int n);
+void Softmax(const T* x, T* y, int n, int bs);

 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
 void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
@@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
 DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
 DECLARE_MORE_KERNEL(VTanh, XYNTuples);

+// XRN
+DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
+
 DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
 DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);


--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl)
 USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
+USE_JITKERNEL_MORE(kSoftmax, mkl)
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -116,6 +116,16 @@ void VAXPY<double>(double a, const double* x, double* y, int n) {
  platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
 }

+template <>
+void ASum<float>(const float* x, float* res, int n) {
+  res[0] = platform::dynload::cblas_sasum(n, x, 1);
+}
+
+template <>
+void ASum<double>(const double* x, double* res, int n) {
+  res[0] = platform::dynload::cblas_dasum(n, x, 1);
+}
+
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool MatMulKernel<float>::UseMe(const int& d) const {
@@ -167,6 +177,11 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
  return true;
 }

+template <>
+bool SoftmaxKernel<float>::UseMe(const int& d) const {
+  return true;
+}
+
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
  template <>                                            \
  bool func##Kernel<double>::UseMe(const int& d) const { \
@@ -181,6 +196,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
 AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
 AWALYS_USE_ME_WITH_DOUBLE(VTanh);
 AWALYS_USE_ME_WITH_DOUBLE(VSquare);
+AWALYS_USE_ME_WITH_DOUBLE(Softmax);

 #undef AWALYS_USE_ME_WITH_DOUBLE
 }  // namespace mkl
@@ -204,5 +220,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(kSoftmax, Softmax);

 #undef REGISTER_MKL_KERNEL
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -16,6 +16,7 @@

 #include <cmath>
 #include <type_traits>
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"

 namespace paddle {
@@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
  }
 }

+template <typename T>
+void ASum(const T* x, T* res, int n);
+
+template <typename T>
+void Softmax(const T* x, T* y, int n, int bs) {
+  std::vector<T> entities(bs);
+  for (int i = 0; i < bs; ++i) {
+    entities[i] = x[i * n];
+    for (int c = 1; c < n; ++c) {
+      entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i];
+    }
+    for (int c = 0; c < n; ++c) {
+      y[i * n + c] = x[i * n + c] - entities[i];
+    }
+  }
+  VExp(y, y, n * bs);
+  for (int i = 0; i < bs; ++i) {
+    T sum;
+    ASum(&y[i * n], &sum, n);
+    sum = static_cast<T>(1) / sum;
+    VScal(&sum, &y[i * n], &y[i * n], n);
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)                             \
  template <typename T>                                              \
  class name##Kernel : public KernelMore<tuples<T>> {                \
@@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);

 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);

+DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
+
 #undef DECLARE_MKL_KERNEL

 }  // namespace mkl