diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
index e05f204b1eebd03c7a00157d96d0482f4a44a7fb..dd039d29152961210958470a48f086a133ab640c 100644
--- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix)
 USE_JITKERNEL_MORE(kGRUH1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart2, mix)
+USE_JITKERNEL_MORE(kSoftmax, mix)
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index df0a85256b1f546d5f64be73925cf58b87a25bd7..2a75eb23cdffb6f24ed4eaf3b38f13855ebeadcf 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -48,6 +48,27 @@ void VTanh(const T* x, T* y, int n) {
   compute_addbias(&b, y, y, n);
 }
 
+void Softmax(const T* x, T* y, int n, int bs) {
+  auto compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
+  auto compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+  auto compute_vexp =
+      Get<kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  for (int i = 0; i < bs; ++i) {
+    T scalar;
+    compute_hmax(x, &scalar, n);
+    scalar = static_cast<T>(0) - scalar;
+    compute_vaddbias(&scalar, x, y, n);  // x - max
+    compute_vexp(y, y, n);
+    compute_hsum(y, &scalar, n);
+    scalar = static_cast<T>(1) / scalar;
+    compute_vscal(&scalar, y, y, n);
+    x += n;
+    y += n;
+  }
+}
+
 void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
   if (type == kVSigmoid) {
     return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
@@ -184,6 +205,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; }
 
 bool VTanhKernel::UseMe(const int& d) const { return true; }
 
+bool SoftmaxKernel::UseMe(const int& d) const { return true; }
+
 bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
 
 bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
@@ -207,6 +230,7 @@ namespace mix = paddle::operators::jit::more::mix;
 
 REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MORE_KERNEL(kVTanh, VTanh);
+REGISTER_MORE_KERNEL(kSoftmax, Softmax);
 REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
 REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
 REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
index a70ecdf9348f511311307b4c27bb4506222a7439..d64af192197a0b339a39a1862c028875da2f3900 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -26,6 +26,7 @@ using T = float;
 
 void VSigmoid(const T* x, T* y, int n);
 void VTanh(const T* x, T* y, int n);
+void Softmax(const T* x, T* y, int n, int bs);
 
 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
 void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
@@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
 
 DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
 DECLARE_MORE_KERNEL(VTanh, XYNTuples);
 
+// XRN
+DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
+
 DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
 DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
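Note: the mix-path Softmax above is the standard numerically stable softmax, computed row by row over bs rows of length n: subtract the row max, exponentiate, sum, then scale by the reciprocal of the sum. A minimal standalone sketch of the same sequence of steps in plain C++ (no Paddle JIT primitives; softmax_ref is a hypothetical name used here only for illustration):

#include <algorithm>
#include <cmath>

// Reference softmax mirroring the five primitives composed in
// mix::Softmax: kHMax, kVAddBias (x - max), kVExp, kHSum, kVScal.
// Illustrative sketch only; not part of this patch.
void softmax_ref(const float* x, float* y, int n, int bs) {
  for (int i = 0; i < bs; ++i) {
    const float* row_x = x + i * n;
    float* row_y = y + i * n;
    const float max_v = *std::max_element(row_x, row_x + n);  // kHMax
    float sum = 0.0f;
    for (int c = 0; c < n; ++c) {
      row_y[c] = std::exp(row_x[c] - max_v);  // kVAddBias + kVExp
      sum += row_y[c];                        // kHSum
    }
    const float scale = 1.0f / sum;
    for (int c = 0; c < n; ++c) {
      row_y[c] *= scale;  // kVScal
    }
  }
}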
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index 667c6dfad6676d00ab994564bff57c90caa0cb41..f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl)
 USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
+USE_JITKERNEL_MORE(kSoftmax, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index fccdc68f5efa34bac6f5a34a41569d2f77416284..b13b8638e28b48208de5bc0058a20a009f6cf0b0 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -116,6 +116,16 @@ void VAXPY(double a, const double* x, double* y, int n) {
   platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
 }
 
+template <>
+void ASum<float>(const float* x, float* res, int n) {
+  res[0] = platform::dynload::cblas_sasum(n, x, 1);
+}
+
+template <>
+void ASum<double>(const double* x, double* res, int n) {
+  res[0] = platform::dynload::cblas_dasum(n, x, 1);
+}
+
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool MatMulKernel<float>::UseMe(const int& d) const {
@@ -167,6 +177,11 @@ bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool SoftmaxKernel<float>::UseMe(const int& d) const {
+  return true;
+}
+
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
   template <>                                            \
   bool func##Kernel<double>::UseMe(const int& d) const { \
@@ -181,6 +196,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
 AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
 AWALYS_USE_ME_WITH_DOUBLE(VTanh);
 AWALYS_USE_ME_WITH_DOUBLE(VSquare);
+AWALYS_USE_ME_WITH_DOUBLE(Softmax);
 #undef AWALYS_USE_ME_WITH_DOUBLE
 
 }  // namespace mkl
@@ -204,5 +220,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(kSoftmax, Softmax);
 
 #undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index a27196fa19f1d3e9aa6c414b6b9f99a21ef49025..6b95b9c872dc12cccaef0b0737edd760447a47d0 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -16,6 +16,7 @@
 
 #include <cmath>
 #include <type_traits>
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 namespace paddle {
@@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
   }
 }
 
+template <typename T>
+void ASum(const T* x, T* res, int n);
+
+template <typename T>
+void Softmax(const T* x, T* y, int n, int bs) {
+  std::vector<T> entities(bs);
+  for (int i = 0; i < bs; ++i) {
+    entities[i] = x[i * n];
+    for (int c = 1; c < n; ++c) {
+      entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i];
+    }
+    for (int c = 0; c < n; ++c) {
+      y[i * n + c] = x[i * n + c] - entities[i];
+    }
+  }
+  VExp(y, y, n * bs);
+  for (int i = 0; i < bs; ++i) {
+    T sum;
+    ASum(&y[i * n], &sum, n);
+    sum = static_cast<T>(1) / sum;
+    VScal(&sum, &y[i * n], &y[i * n], n);
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)                             \
   template <typename T>                                              \
   class name##Kernel : public KernelMore<tuples<T>> {                \
@@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
 
 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
 
+DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
+
 #undef DECLARE_MKL_KERNEL
 
 }  // namespace mkl
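Note: both implementations register under kSoftmax (mix via REGISTER_MORE_KERNEL, MKL via REGISTER_MKL_KERNEL), so a caller retrieves whichever registered kernel the JIT dispatch selects for the given row length. A sketch of a call site, assuming the same Get interface the mix code itself uses above; the header path and the run_softmax wrapper are assumptions, not part of this patch:

#include "paddle/fluid/operators/jit/kernels.h"  // assumed umbrella header

// Illustrative call site: fetch a softmax kernel for rows of length n
// and run it over bs rows. Mirrors the Get<...> calls in mix.cc.
void run_softmax(const float* x, float* y, int n, int bs) {
  namespace jit = paddle::operators::jit;
  auto softmax = jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>,
                          paddle::platform::CPUPlace>(n);
  softmax(x, y, n, bs);
}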