From 07473786084b6941a812d5d3f14772bd82ae7144 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Tue, 29 Mar 2022 17:56:57 +0800
Subject: [PATCH] Revert "Revert "Move some activation to phi (#40727)" (#41056)"

This reverts commit 05f3d48ed3b2f7b5eb0a135a4e07aa160cfa5d6d.

---
 .../new_executor/standalone_executor_test.cc  |   3 +-
 paddle/fluid/operators/activation_op.cc       |  86 +--
 paddle/fluid/operators/activation_op.h        | 484 +++-----
 paddle/fluid/operators/activation_op.kps      | 371 +----
 paddle/fluid/operators/math/CMakeLists.txt    |   4 +-
 .../operators/math/selected_rows_functor.cc   |   1 +
 paddle/fluid/operators/temporal_shift_op.h    |  90 +--
 paddle/phi/kernels/activation_kernel.h        |  22 +
 .../phi/kernels/cpu/activation_grad_kernel.cc |  54 ++
 paddle/phi/kernels/cpu/activation_kernel.cc   |  32 +-
 .../kernels/cpu/temporal_shift_grad_kernel.cc | 136 ++++
 .../phi/kernels/cpu/temporal_shift_kernel.cc  | 131 ++++
 paddle/phi/kernels/funcs/activation_functor.h | 589 ++++++++++++++++++
 .../phi/kernels/gpu/activation_grad_kernel.cu |  65 ++
 paddle/phi/kernels/gpu/activation_kernel.cu   |  54 ++
 .../kernels/gpu/temporal_shift_grad_kernel.cu | 149 +++++
 .../phi/kernels/gpu/temporal_shift_kernel.cu  | 148 +++++
 .../phi/kernels/impl/activation_grad_impl.h   |  18 +
 paddle/phi/kernels/impl/activation_impl.h     |  16 +
 .../selected_rows/activation_kernel.cc        |  68 ++
 .../kernels/selected_rows/activation_kernel.h |  34 +
 .../phi/kernels/temporal_shift_grad_kernel.h  |  29 +
 paddle/phi/kernels/temporal_shift_kernel.h    |  29 +
 paddle/phi/ops/compat/activation_sig.cc       |  82 ++-
 paddle/phi/ops/compat/temporal_shift_sig.cc   |  39 ++
 .../unittests/test_activation_nn_grad.py      |   1 +
 .../unittests/test_activation_sparse_op.py    | 101 +++
 .../tests/unittests/test_clip_by_norm_op.py   |   2 +
 .../tests/unittests/test_temporal_shift_op.py |   1 +
 29 files changed, 1908 insertions(+), 931 deletions(-)
 create mode 100644 paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/temporal_shift_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/temporal_shift_kernel.cu
 create mode 100644 paddle/phi/kernels/selected_rows/activation_kernel.cc
 create mode 100644 paddle/phi/kernels/selected_rows/activation_kernel.h
 create mode 100644 paddle/phi/kernels/temporal_shift_grad_kernel.h
 create mode 100644 paddle/phi/kernels/temporal_shift_kernel.h
 create mode 100644 paddle/phi/ops/compat/temporal_shift_sig.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_activation_sparse_op.py

diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index d3adccff73..8a3b40bbd7 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -53,7 +53,7 @@ USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP_ITSELF(slice_grad);
 USE_OP_ITSELF(lookup_table_grad);
-USE_OP(sqrt);
+USE_OP_ITSELF(sqrt);
 USE_OP_ITSELF(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
 USE_OP_ITSELF(sgd);
@@ -83,6 +83,7 @@ PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);

 DECLARE_double(eager_delete_tensor_gb);

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index
632ea92746..6be872b028 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1496,6 +1496,14 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(expm1, Expm1, Expm1Functor, Expm1GradFunctor); +REGISTER_ACTIVATION_OP(softplus, Softplus, SoftplusFunctor, + SoftplusGradFunctor); +REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor); +REGISTER_ACTIVATION_OP(stanh, STanh, STanhFunctor, STanhGradFunctor); +REGISTER_ACTIVATION_OP(reciprocal, Reciprocal, ReciprocalFunctor, + ReciprocalGradFunctor); + REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor); REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor); REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); @@ -1630,12 +1638,7 @@ REGISTER_OPERATOR(logit, ops::LogitOp, ops::LogitOpMaker, ops::LogitGradOpMaker, ops::LogitGradOpMaker); REGISTER_OPERATOR(logit_grad, ops::LogitGradOp); -REGISTER_OP_CPU_KERNEL( - logit, ops::LogitKernel, - ops::LogitKernel); -REGISTER_OP_CPU_KERNEL( - logit_grad, ops::LogitGradKernel, - ops::LogitGradKernel); + /* ========================================================================== */ /* ======================== celu register ============================ @@ -1684,7 +1687,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); REGISTER_OP_CPU_KERNEL( sqrt_grad_grad, ops::SqrtDoubleGradKernel>, @@ -1712,7 +1714,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); REGISTER_OP_CPU_KERNEL( rsqrt_grad_grad, ops::RsqrtDoubleGradKernel::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(square, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); - REGISTER_OP_CPU_KERNEL( square_grad_grad, ops::SquareDoubleGradKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); -/* ========================================================================== */ - -/* ========================== expm1 register ============================ */ -REGISTER_OPERATOR( - expm1, ops::ActivationOp, ops::Expm1OpMaker, ops::ActivationOpInferVarType, - ops::ActivationGradOpMaker::FwdDeps(), - paddle::framework::OpDesc>, - ops::ActivationGradOpMaker::FwdDeps(), - paddle::imperative::OpBase>, - std::conditional>(), - ops::ActFwdInplaceInferer, void>::type); -REGISTER_OPERATOR(expm1_grad, ops::ActivationOpGrad, - ops::ActivationGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL(expm1, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - expm1_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); -/* ========================================================================== */ - /* ========================== Log register 
==================================*/ REGISTER_OPERATOR( log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType, @@ -1864,8 +1798,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -/* ========================================================================== */ - /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(leaky_relu) .AddCheckpoint( diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5a72f2086c..5448ed2a4b 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -264,6 +264,7 @@ USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) USE_PHI_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(Exp) USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) USE_PHI_FUNCTOR(BRelu) @@ -289,6 +290,15 @@ USE_PHI_FUNCTOR(Log1p) USE_PHI_FUNCTOR(Swish) USE_PHI_FUNCTOR(HardSwish) USE_PHI_FUNCTOR(Pow) +USE_PHI_FUNCTOR(Exp) +USE_PHI_FUNCTOR(Expm1) +USE_PHI_FUNCTOR(Mish) +USE_PHI_FUNCTOR(STanh) +USE_PHI_FUNCTOR(Reciprocal) +USE_PHI_FUNCTOR(Square) +USE_PHI_FUNCTOR(Sqrt) +USE_PHI_FUNCTOR(Rsqrt) +USE_PHI_FUNCTOR(Softplus) template using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; @@ -305,49 +315,8 @@ using CeilFunctor = phi::funcs::CeilFunctor; template using ZeroGradFunctor = phi::funcs::ZeroGradFunctor; -// exp(x) = e^x template -struct ExpFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.exp(); - } -}; - -template -struct ExpGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// expm1(x) = e^x - 1 -template -struct Expm1Functor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.expm1(); - } -}; - -template -struct Expm1GradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out + dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; // relu(x) = max(x, 0) @@ -362,92 +331,68 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// sqrt(x) = x^(1/2) -template -struct SqrtFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.sqrt(); - } -}; - -template -struct SqrtGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = static_cast(0.5) * dout / out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// rsqrt(x) = x^(-1/2) template -struct RsqrtFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.rsqrt(); - } -}; - -template -struct RsqrtGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = static_cast(-0.5) * dout * out * out * out; +struct SqrtGradGradFunctor : public BaseActivationFunctor { + 
template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, framework::Tensor* ddOut, + framework::Tensor* dOut, const framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad")); + // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx + // calculate dy first, so ddy can inplace ddx + if (dOut) { + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad")); + dout.device(*d) = dx * ddx * static_cast(-1) / out; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad")); + ddout.device(*d) = ddx * static_cast(0.5) / out; + } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; -// reciprocal(x) = 1 / x template -struct ReciprocalFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / x; - } -}; +struct RsqrtGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, framework::Tensor* ddOut, + framework::Tensor* dOut, const framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad")); -template -struct ReciprocalGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(-1) * out * out; + // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx + if (dOut) { + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad")); + dout.device(*d) = (static_cast(3.0) / out) * dx * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad")); + ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; + } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepOut; } }; -// square(x) = x^2 -template -struct SquareFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.square(); - } -}; - -template -struct SquareGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(2) * x; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -484,114 +429,6 @@ struct Relu6GradFunctor : public BaseActivationFunctor { } }; -// For numerical stability, using the following formula instead of softplus(x) = -// log(1 + exp(x)) -// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta = -// 1, threshold = 20 by default), otherwise x -template -struct SoftplusFunctor : 
public BaseActivationFunctor { - float beta; - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"beta", &beta}, {"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) { - auto x_beta = static_cast(beta) * x; - out.device(d) = (x_beta > static_cast(threshold)) - .select(x, (static_cast(1) + x_beta.exp()).log() / - static_cast(beta)); - } -}; - -// For numerical stability, using the following formula instead of -// d(softplus(x))/dx = 1 / (1 + exp(-x)) -// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta -// = 1, threshold = 20 by default), otherwise x -template -struct SoftplusGradFunctor : public BaseActivationFunctor { - float beta; - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"beta", &beta}, {"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) { - auto x_beta = static_cast(beta) * x; - dx.device(d) = - (x_beta > static_cast(threshold)) - .select(dout, dout / (static_cast(1) + (-x_beta).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// mish(x) = x * tanh(softplus(x)) -// softplus(x) = x, if x > threshold -// = ln(1 + exp(x)), otherwise -template -struct MishFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) { - auto sp = (x > static_cast(threshold)) - .select(x, (static_cast(1) + x.exp()).log()); - out.device(d) = x * sp.tanh(); - } -}; - -// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) -// sp = softplus(x) -template -struct MishGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) { - auto sp = (x > static_cast(threshold)) - .select(x, (static_cast(1) + x.exp()).log()); - auto gsp = static_cast(1) - (-sp).exp(); - auto tsp = sp.tanh(); - dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softsign(x) = x / (1 + |x|) -template -struct SoftsignFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) { - out.device(d) = x / (static_cast(1) + x.abs()); - } -}; - -// d(softsign(x))/dx = 1 / (1 + |x|)^2 -// Taken from https://en.wikipedia.org/wiki/Activation_function -template -struct SoftsignGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) { - dx.device(d) = - dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct SoftReluFunctor : public BaseActivationFunctor { float threshold; @@ -706,71 +543,6 @@ struct CELUGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct LogitFunctor { - template - void operator()(Device d, X x, Out out, P p, float eps) const { - // logit(x) = ln(x/(1-x)) - auto tmp_x = - (x.cwiseMin(static_cast(1.0 - eps))).cwiseMax(static_cast(eps)); - - if (!eps) { - out.device(d) = (x < static_cast(0.0) || x > static_cast(1.0)) - .select(p.constant(static_cast(NAN)), - 
(tmp_x / (static_cast(1) - tmp_x)).log()); - } else { - out.device(d) = (tmp_x / (static_cast(1) - tmp_x)).log(); - } - } -}; - -template -struct LogitGradFunctor { - template - void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const { - // logit(x)' = 1/(x*(1-x)) - dx.device(d) = - (x < static_cast(eps) || x > static_cast(1.0 - eps)) - .select(p.constant(static_cast(0)), - dout * (static_cast(1) / ((static_cast(1) - x) * x))); - } -}; - -template -struct STanhFunctor : public BaseActivationFunctor { - float scale_a; - float scale_b; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); - } -}; - -template -struct STanhGradFunctor : public BaseActivationFunctor { - float scale_a; - float scale_b; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto a = static_cast(scale_a); - auto b = static_cast(scale_b); - auto temp = (a * x).tanh() * (a * x).tanh(); - dx.device(d) = dout * a * b * (static_cast(1) - temp); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -831,68 +603,6 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct SqrtGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, framework::Tensor* ddOut, - framework::Tensor* dOut, const framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad")); - // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx - // calculate dy first, so ddy can inplace ddx - if (dOut) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad")); - dout.device(*d) = dx * ddx * static_cast(-1) / out; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad")); - ddout.device(*d) = ddx * static_cast(0.5) / out; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct RsqrtGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, framework::Tensor* ddOut, - framework::Tensor* dOut, const framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad")); - - // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx - if (dOut) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad")); - auto dout = framework::EigenVector::Flatten( - 
GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad")); - dout.device(*d) = (static_cast(3.0) / out) * dx * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad")); - ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SquareGradGradFunctor : public BaseActivationFunctor { template @@ -988,6 +698,29 @@ class SquareDoubleGradKernel } }; +template +struct SoftsignFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + x.abs()); + } +}; + +// d(softsign(x))/dx = 1 / (1 + |x|)^2 +// Taken from https://en.wikipedia.org/wiki/Activation_function + +template +struct SoftsignGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template class CELUDoubleGradKernel : public framework::OpKernel { @@ -1135,57 +868,10 @@ class RsqrtDoubleGradKernel } }; -template -class LogitKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto eps = context.Attr("eps"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - auto eigen_p = framework::EigenVector::Flatten(*out); - - LogitFunctor functor; - functor(place, eigen_in, eigen_out, eigen_p, eps); - } -}; - -template -class LogitGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto eps = context.Attr("eps"); - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - auto eigen_p = framework::EigenVector::Flatten(*x); - - LogitGradFunctor functor; - functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps); - } -}; - } // namespace operators } // namespace paddle -#define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps 
b/paddle/fluid/operators/activation_op.kps index 5118302f77..e33351520e 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -20,140 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReciprocalFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // reciprocal(x) = 1 / x - __device__ __forceinline__ T operator()(const T x) const { return one / x; } -}; - -template -struct CudaReciprocalGradFunctor : public BaseActivationFunctor { - // dx = -dout * out^2 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return -dout * out * out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaExpFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // exp(x) = exp(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(exp(x)); - } -}; - -template -struct CudaExpGradFunctor : public BaseActivationFunctor { - // dx = dout * out - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaExpm1Functor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // expm1(x) = expm1(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(expm1(x)); - } -}; - -template -struct CudaExpm1GradFunctor : public BaseActivationFunctor { - // dx = dout * out - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out + dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaSquareFunctor : public BaseActivationFunctor { - // square(x) = x * x - __device__ __forceinline__ T operator()(const T x) const { return x * x; } -}; - -template -struct CudaSquareGradFunctor : public BaseActivationFunctor { - T two = static_cast(2.0f); - - // dx = dout * 2 * x - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout * two * x; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSqrtFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sqrt(x) = sqrt(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sqrt(x)); - } -}; - -template -struct CudaSqrtGradFunctor : public BaseActivationFunctor { - T one_half = static_cast(0.5f); - - // dx = dout * 0.5 / out - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return one_half * dout / out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaRsqrtFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // rsqrt(x) = rsqrt(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(rsqrt(x)); - } -}; - -template -struct CudaRsqrtGradFunctor : public BaseActivationFunctor { - T minus_one_half = static_cast(-0.5f); - - // dx = -0.5 * dout * out^3 - __device__ __forceinline__ T operator()(const T dout, 
const T out) const { - return minus_one_half * dout * out * out * out; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSoftReluFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -201,119 +67,6 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaSTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - float scale_a; - float scale_b; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; - } - - // stanh(x) = b * tanh(a * x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType a = static_cast(scale_a); - MPType b = static_cast(scale_b); - return static_cast(b * tanh(a * x)); - } -}; - -template -struct CudaSTanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - float scale_a; - float scale_b; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; - } - - // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType a = static_cast(scale_a); - MPType b = static_cast(scale_b); - MPType temp = tanh(a * x); - return static_cast(dout * a * b * (one - temp * temp)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftplusFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - float beta; - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"beta", &beta}, {"threshold", &threshold}}; - } - - // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType b = static_cast(beta); - MPType t = static_cast(threshold); - MPType x_beta = x * beta; - return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); - } -}; - -template -struct CudaSoftplusGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - float beta; - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"beta", &beta}, {"threshold", &threshold}}; - } - - // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType b = static_cast(beta); - MPType t = static_cast(threshold); - MPType x_beta = x * beta; - return x_beta > t ? 
arg_dout : static_cast(dout / (one + exp(-x_beta))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaSoftsignFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // softsign(x) = x / (1 + abs(x)) - __device__ __forceinline__ T operator()(const T x) const { - return x / (one + abs(x)); - } -}; - -template -struct CudaSoftsignGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + abs(x))^2 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T temp = one + abs(x); - return dout / (temp * temp); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaRelu6Functor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -351,49 +104,23 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { }; template -struct CudaMishFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } +struct CudaSoftsignFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); - // mish(x) = x * tanh(softplus(x)) - // softplus(x) = x, if x > threshold - // = ln(1 + exp(x)), otherwise - // Inputs: args[0], the input x - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); - return static_cast(x * tanh(sp)); + // softsign(x) = x / (1 + abs(x)) + __device__ __forceinline__ T operator()(const T x) const { + return x / (one + abs(x)); } }; template -struct CudaMishGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } +struct CudaSoftsignGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); - // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) - // sp = softplus(x) - // Inputs: args[0], the input dout - // args[1], the input x - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); - MPType gsp = - (x > static_cast(threshold)) ? 
one : one / (one + exp(-x)); - MPType tsp = tanh(sp); - return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); + // dx = dout / (1 + abs(x))^2 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T temp = one + abs(x); + return dout / (temp * temp); } static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } @@ -558,6 +285,16 @@ using CudaCeilFunctor = phi::funcs::CudaCeilFunctor; template using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor; +USE_PHI_FUNCTOR(CudaExp) +USE_PHI_FUNCTOR(CudaExpm1) +USE_PHI_FUNCTOR(CudaMish) +USE_PHI_FUNCTOR(CudaSTanh) +USE_PHI_FUNCTOR(CudaReciprocal) +USE_PHI_FUNCTOR(CudaSquare) +USE_PHI_FUNCTOR(CudaSqrt) +USE_PHI_FUNCTOR(CudaRsqrt) +USE_PHI_FUNCTOR(CudaSoftplus) + template using CudaELUGradNegativeAlphaFunctor = phi::funcs::CudaELUGradNegativeAlphaFunctor; @@ -636,8 +373,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== sqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( sqrt_grad_grad, @@ -653,8 +388,6 @@ REGISTER_OP_CUDA_KERNEL( /* =========================== rsqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor, - CudaRsqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( rsqrt_grad_grad, @@ -667,8 +400,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== square register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -688,75 +419,19 @@ REGISTER_OP_CUDA_KERNEL( /* ========================== logit register ============================ */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - logit, ops::LogitKernel, - ops::LogitKernel, - ops::LogitKernel); -REGISTER_OP_CUDA_KERNEL( - logit_grad, - ops::LogitGradKernel, - ops::LogitGradKernel, - ops::LogitGradKernel); /* ========================================================================== */ /* ========================== exp register ============================ */ -REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== expm1 register ============================ */ - -REGISTER_OP_CUDA_KERNEL( - expm1, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - expm1_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ - CudaSoftShrinkGradFunctor); \ - __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ - CudaReciprocalGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ - __macro(stanh, STanh, CudaSTanhFunctor, 
CudaSTanhGradFunctor); \ - __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ - __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \ __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ - __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \ - CudaTanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ - CudaHardShrinkGradFunctor); \ - __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); + __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); + FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index af1069cb86..df8150b192 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -22,9 +22,9 @@ math_library(sampler DEPS generator) math_library(maxouting) if(WITH_MKLDNN) - math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) + math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler mixed_vector) else() - math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) + math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector) endif() math_library(sequence_padding) diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 0ca2529f13..f77287826f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index ec43ed88cb..141f2127da 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -19,56 +19,6 @@ namespace operators { using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; -template -void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw, - const int tchw, const int chw, const int hw, - const int t, const int c1, const int c2) { - int src_it = 0; - for (int i = 0; i < ntchw; i++) { - int it = (i % tchw) / chw; - int ic = (i % chw) / hw; - - if (ic < c1) { - src_it = it - 1; - } else if (ic < c2) { - src_it = it + 1; - } else { - src_it = it; - } - - if (src_it < 0 || src_it >= t) { - output[i] = 0; - } else { - output[i] = input[i + (src_it - it) * chw]; - } - } -} - -template -void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc, - const int thwc, const int hwc, const int t, - const int c, const int c1, const int c2) { - int src_it = 0; - for (int i = 0; i < nthwc; i++) { - int it = (i % thwc) / hwc; - int ic = i % c; - - if (ic < c1) { - src_it = it - 1; - } else if (ic < c2) { - src_it = it + 1; - } else { - src_it = it; - } - - if (src_it < 0 || src_it >= t) { - output[i] = 0; - } else { - output[i] = input[i + (src_it - it) * hwc]; - } - } -} - template void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw, const int tchw, const int chw, const int hw, @@ -122,45 +72,7 @@ void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc, template class TemporalShiftKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int t = ctx.Attr("seg_num"); - float shift_ratio = ctx.Attr("shift_ratio"); - const std::string data_format_str = ctx.Attr("data_format"); - const DataLayout data_layout = - framework::StringToDataLayout(data_format_str); - - const int nt = input->dims()[0]; - const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1] - : input->dims()[3]); - const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2] - : input->dims()[1]); - const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3] - : input->dims()[2]); - - const int hw = h * w; - const int chw = c * hw; - const int tchw = t * chw; - const int ntchw = nt * chw; - - const int c1 = static_cast(c * shift_ratio); - const int c2 = static_cast(c * 2 * shift_ratio); - - framework::DDim out_dims = - (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); - const T* input_data = input->data(); - T* output_data = output->mutable_data(out_dims, ctx.GetPlace()); - - if (data_layout == DataLayout::kNCHW) { - TemporalShiftFwNCHW(input_data, output_data, ntchw, tchw, chw, hw, t, - c1, c2); - } else { - TemporalShiftFwNHWC(input_data, output_data, ntchw, tchw, chw, t, c, - c1, c2); - } - } + void Compute(const framework::ExecutionContext& ctx) const override {} }; template diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 84c46870e0..8a40bacd39 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -53,6 +53,13 @@ DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(Exp) +DECLARE_ACTIVATION_KERNEL(Expm1) +DECLARE_ACTIVATION_KERNEL(Reciprocal) +DECLARE_ACTIVATION_KERNEL(Square) +DECLARE_ACTIVATION_KERNEL(Sqrt) +DECLARE_ACTIVATION_KERNEL(Rsqrt) + DECLARE_ACTIVATION_KERNEL(TanhShrink) DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Sigmoid) @@ -73,8 +80,23 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold) + +template +void LogitKernel(const Context& dev_ctx, + const DenseTensor& x, + float eps, + DenseTensor* out); + +template +void MishKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + DenseTensor* out); + template void HardSwishKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index be0d02e2a1..d2b816de8f 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -129,6 +129,13 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, SquareGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor); @@ -157,11 +164,24 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, + MishGradFunctor, + threshold); + DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, BReluGradFunctor, t_min, t_max); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, + STanhGradFunctor, + scale_a, + scale_b); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, + 
SoftplusGradFunctor, + beta, + threshold); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, HardSigmoidGradFunctor, slope, @@ -247,6 +267,12 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel) @@ -263,6 +289,34 @@ PD_REGISTER_KERNEL(tanh_triple_grad, float, double, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(exp_grad, + CPU, + ALL_LAYOUT, + phi::ExpGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(expm1_grad, + CPU, + ALL_LAYOUT, + phi::Expm1GradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL( + logit_grad, CPU, ALL_LAYOUT, phi::LogitGradKernel, float, double) {} +PD_REGISTER_KERNEL(square_grad, + CPU, + ALL_LAYOUT, + phi::SquareGradKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index d55d4cfd0f..fe0643286c 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" #include "paddle/phi/kernels/impl/activation_impl.h" namespace phi { @@ -72,6 +73,12 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Exp, ExpFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Expm1, Expm1Functor) +DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, ReciprocalFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor) @@ -83,15 +90,19 @@ DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) + DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, ThresholdedReluFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, HardSigmoidFunctor, slope, @@ -139,6 +150,25 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) +PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel) +PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel) + +PD_REGISTER_KERNEL( + exp, CPU, ALL_LAYOUT, phi::ExpKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(expm1, + CPU, + ALL_LAYOUT, + phi::Expm1Kernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {} +PD_REGISTER_KERNEL( + square, CPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {} PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc new file mode 100644 index 0000000000..400f7e8783 --- /dev/null +++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/temporal_shift_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TemporalShiftBwNCHW(const T* output_grad, + T* input_grad, + const int ntchw, + const int tchw, + const int chw, + const int hw, + const int t, + const int c1, + const int c2) { + int src_it = 0; + for (int i = 0; i < ntchw; i++) { + int it = (i % tchw) / chw; + int ic = (i % chw) / hw; + + if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[i] = output_grad[i + (src_it - it) * chw]; + } else { + input_grad[i] = 0; + } + } +} + +template +void TemporalShiftBwNHWC(const T* output_grad, + T* input_grad, + const int nthwc, + const int thwc, + const int hwc, + const int t, + const int c, + const int c1, + const int c2) { + int src_it = 0; + for (int i = 0; i < nthwc; i++) { + int it = (i % thwc) / hwc; + int ic = i % c; + + if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[i] = output_grad[i + (src_it - it) * hwc]; + } else { + input_grad[i] = 0; + } + } +} + +template +void TemporalShiftGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + int seg_num, + float shift_ratio, + const std::string& data_format_str, + DenseTensor* x_grad) { + auto* input_grad = x_grad; + auto* output_grad = &out_grad; + int t = seg_num; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format_str); + + const int nt = output_grad->dims()[0]; + const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] + : output_grad->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2] + : output_grad->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3] + : output_grad->dims()[2]); + + const int hw = h * w; + const int chw = c * hw; + const int tchw = t * chw; + const int ntchw = nt * chw; + + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + DDim in_grad_dims = + (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) + : phi::make_ddim({nt, h, w, c})); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = + input_grad->mutable_data(in_grad_dims, dev_ctx.GetPlace()); + + if (data_layout == DataLayout::kNCHW) { + TemporalShiftBwNCHW( + output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + TemporalShiftBwNHWC( + output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(temporal_shift_grad, + CPU, + ALL_LAYOUT, + phi::TemporalShiftGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc new file mode 100644 index 0000000000..6721117992 --- /dev/null +++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/temporal_shift_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TemporalShiftFwNCHW(const T* input, + T* output, + const int ntchw, + const int tchw, + const int chw, + const int hw, + const int t, + const int c1, + const int c2) { + int src_it = 0; + for (int i = 0; i < ntchw; i++) { + int it = (i % tchw) / chw; + int ic = (i % chw) / hw; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[i] = 0; + } else { + output[i] = input[i + (src_it - it) * chw]; + } + } +} + +template +void TemporalShiftFwNHWC(const T* input, + T* output, + const int nthwc, + const int thwc, + const int hwc, + const int t, + const int c, + const int c1, + const int c2) { + int src_it = 0; + for (int i = 0; i < nthwc; i++) { + int it = (i % thwc) / hwc; + int ic = i % c; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[i] = 0; + } else { + output[i] = input[i + (src_it - it) * hwc]; + } + } +} + +template +void TemporalShiftKernel(const Context& dev_ctx, + const DenseTensor& x, + int seg_num, + float shift_ratio, + const std::string& data_format_str, + DenseTensor* out) { + auto* input = &x; + auto* output = out; + int t = seg_num; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format_str); + + const int nt = input->dims()[0]; + const int c = + (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]); + const int h = + (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]); + const int w = + (data_layout == DataLayout::kNCHW ? 
input->dims()[3] : input->dims()[2]); + + const int hw = h * w; + const int chw = c * hw; + const int tchw = t * chw; + const int ntchw = nt * chw; + + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + DDim out_dims = + (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) + : phi::make_ddim({nt, h, w, c})); + const T* input_data = input->data(); + T* output_data = output->mutable_data(out_dims, dev_ctx.GetPlace()); + + if (data_layout == DataLayout::kNCHW) { + TemporalShiftFwNCHW( + input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + TemporalShiftFwNHWC( + input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + temporal_shift, CPU, ALL_LAYOUT, phi::TemporalShiftKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index bcadc59126..eee6cf5640 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -106,6 +106,31 @@ struct SinFunctor : public BaseActivationFunctor { } }; +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; + } +}; + +template +struct ReciprocalGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + // cosine'(x) = -sin(x) template struct CosGradFunctor : public BaseActivationFunctor { @@ -130,6 +155,108 @@ struct CosFunctor : public BaseActivationFunctor { } }; +template +struct LogitFunctor { + template + void operator()(Device d, X x, Out out, P p, float eps) const { + // logit(x) = ln(x/(1-x)) + auto tmp_x = + (x.cwiseMin(static_cast(1.0 - eps))).cwiseMax(static_cast(eps)); + + if (!eps) { + out.device(d) = (x < static_cast(0.0) || x > static_cast(1.0)) + .select(p.constant(static_cast(NAN)), + (tmp_x / (static_cast(1) - tmp_x)).log()); + } else { + out.device(d) = (tmp_x / (static_cast(1) - tmp_x)).log(); + } + } +}; + +// mish(x) = x * tanh(softplus(x)) +// softplus(x) = x, if x > threshold +// = ln(1 + exp(x)), otherwise + +template +struct MishFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + out.device(d) = x * sp.tanh(); + } +}; + +// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) +// sp = softplus(x) + +template +struct MishGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + auto gsp = static_cast(1) - (-sp).exp(); + auto tsp = sp.tanh(); + dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct STanhFunctor : public BaseActivationFunctor { + float scale_a; + float 
scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); + } +}; + +template +struct STanhGradFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto a = static_cast(scale_a); + auto b = static_cast(scale_b); + auto temp = (a * x).tanh() * (a * x).tanh(); + dx.device(d) = dout * a * b * (static_cast(1) - temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct Tangent { HOSTDEVICE T operator()(const T& val) const { return tan(val); } @@ -157,6 +284,132 @@ struct TanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +struct SquareGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// sqrt(x) = x^(1/2) +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0.5) * dout / out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// rsqrt(x) = x^(-1/2) +template +struct RsqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.rsqrt(); + } +}; + +template +struct RsqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(-0.5) * dout * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// // For numerical stability, using the following formula instead of +// softplus(x) = +// // log(1 + exp(x)) +// // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= +// threshold(beta = +// // 1, threshold = 20 by default), otherwise x + +template +struct SoftplusFunctor : public BaseActivationFunctor { + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto x_beta = static_cast(beta) * x; + out.device(d) = (x_beta > static_cast(threshold)) + .select(x, + (static_cast(1) + x_beta.exp()).log() / + static_cast(beta)); + } +}; + +// For numerical stability, using the following formula instead of +// d(softplus(x))/dx = 1 / (1 + exp(-x)) +// d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta +// = 1, threshold = 20 by default), otherwise x + +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + float beta; + 
float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto x_beta = static_cast(beta) * x; + dx.device(d) = + (x_beta > static_cast(threshold)) + .select(dout, dout / (static_cast(1) + (-x_beta).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + // Tangent(x) = tan(x) template struct TanFunctor : public BaseActivationFunctor { @@ -348,6 +601,18 @@ struct AtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct LogitGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, P p, float eps) const { + // logit(x)' = 1/(x*(1-x)) + dx.device(d) = + (x < static_cast(eps) || x > static_cast(1.0 - eps)) + .select(p.constant(static_cast(0)), + dout * (static_cast(1) / ((static_cast(1) - x) * x))); + } +}; + template struct Acosh { HOSTDEVICE T operator()(const T& val) const { return acosh(val); } @@ -458,6 +723,57 @@ struct AtanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// exp functor +// exp(x) = e^x +template +struct ExpFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); + } +}; + +template +struct ExpGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// expm1(x) = e^x - 1 +template +struct Expm1Functor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.expm1(); + } +}; + +template +struct Expm1GradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out + dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + // relu(x) = max(x, 0) template struct ReluCPUFunctor : public BaseActivationFunctor { @@ -1560,6 +1876,90 @@ struct CudaCosGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // exp(x) = exp(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(exp(x)); + } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + __device__ __forceinline__ T operator()(const T x) const { return x * x; } +}; + +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return dout * two * x; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaReciprocalFunctor : public 
BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + __device__ __forceinline__ T operator()(const T x) const { return one / x; } +}; + +template +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return -dout * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaExpm1Functor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // expm1(x) = expm1(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(expm1(x)); + } +}; + +template +struct CudaExpm1GradFunctor : public BaseActivationFunctor { + // dx = dout * out + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out + dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaSinFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -1782,6 +2182,96 @@ struct CudaAtanhFunctor : public BaseActivationFunctor { } }; +template +struct CudaSTanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // stanh(x) = b * tanh(a * x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + return static_cast(b * tanh(a * x)); + } +}; + +template +struct CudaSTanhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + MPType temp = tanh(a * x); + return static_cast(dout * a * b * (one - temp * temp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSoftplusFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return static_cast(x_beta > t ? 
x : log(one + exp(x_beta)) / b); + } +}; + +template +struct CudaSoftplusGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAtanhGradFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -1797,6 +2287,56 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sqrt(x)); + } +}; + +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return one_half * dout / out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = -0.5 * dout * out^3 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return minus_one_half * dout * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaAtanFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -1864,6 +2404,55 @@ struct CudaBReluFunctor : public BaseActivationFunctor { } }; +template +struct CudaMishFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // mish(x) = x * tanh(softplus(x)) + // softplus(x) = x, if x > threshold + // = ln(1 + exp(x)), otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? 
x : log(one + exp(x)); + return static_cast(x * tanh(sp)); + } +}; + +template +struct CudaMishGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) + // sp = softplus(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); + MPType gsp = + (x > static_cast(threshold)) ? one : one / (one + exp(-x)); + MPType tsp = tanh(sp); + return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaBReluGradFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 3c8b338d86..944cd2ea10 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -189,6 +189,13 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, CudaSquareGradFunctor); + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor); @@ -211,11 +218,24 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, CudaSwishGradFunctor, beta); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, + CudaMishGradFunctor, + threshold); + DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, t_min, t_max); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, + CudaSTanhGradFunctor, + scale_a, + scale_b); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, + CudaSoftplusGradFunctor, + beta, + threshold); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope, @@ -326,12 +346,57 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) + +PD_REGISTER_KERNEL(exp_grad, + GPU, + ALL_LAYOUT, + 
phi::ExpGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) + +PD_REGISTER_KERNEL(expm1_grad, + GPU, + ALL_LAYOUT, + phi::Expm1GradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(logit_grad, + GPU, + ALL_LAYOUT, + phi::LogitGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(square_grad, + GPU, + ALL_LAYOUT, + phi::SquareGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} + PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 75003cf342..8cc546ba73 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" #include "paddle/phi/kernels/impl/activation_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" @@ -91,6 +92,12 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Exp, CudaExpFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Expm1, CudaExpm1Functor) +DEFINE_GPU_ACTIVATION_KERNEL(Reciprocal, CudaReciprocalFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor) @@ -112,7 +119,14 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold) + DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, + CudaSoftplusFunctor, + beta, + threshold) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope, @@ -180,6 +194,46 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) +PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(reciprocal, 
ReciprocalKernel) +PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel) + +PD_REGISTER_KERNEL(exp, + GPU, + ALL_LAYOUT, + phi::ExpKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(expm1, + GPU, + ALL_LAYOUT, + phi::Expm1Kernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(logit, + GPU, + ALL_LAYOUT, + phi::LogitKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(square, + GPU, + ALL_LAYOUT, + phi::SquareKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu new file mode 100644 index 0000000000..065b1726dc --- /dev/null +++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/temporal_shift_grad_kernel.h" + +namespace phi { + +template +__global__ void KeTemporalShiftBwNCHW(const T* output_grad, + T* input_grad, + const int ntchw, + const int tchw, + const int chw, + const int hw, + const int t, + const int c1, + const int c2) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int src_it = 0; + + for (; tid < ntchw; tid += stride) { + int it = (tid % tchw) / chw; + int ic = (tid % chw) / hw; + + if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[tid] = output_grad[tid + (src_it - it) * chw]; + } else { + input_grad[tid] = 0; + } + } +} + +template +__global__ void KeTemporalShiftBwNHWC(const T* output_grad, + T* input_grad, + const int nthwc, + const int thwc, + const int hwc, + const int t, + const int c, + const int c1, + const int c2) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int src_it = 0; + + for (; tid < nthwc; tid += stride) { + int it = (tid % thwc) / hwc; + int ic = tid % c; + + if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[tid] = output_grad[tid + (src_it - it) * hwc]; + } else { + input_grad[tid] = 0; + } + } +} + +template +void TemporalShiftGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + int seg_num, + float shift_ratio, + const std::string& data_format_str, + DenseTensor* x_grad) { + auto* input_grad = x_grad; + auto* 
output_grad = &out_grad; + int t = seg_num; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format_str); + + const int nt = output_grad->dims()[0]; + const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] + : output_grad->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2] + : output_grad->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3] + : output_grad->dims()[2]); + + const int hw = h * w; + const int chw = c * hw; + const int tchw = t * chw; + const int ntchw = nt * chw; + + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + DDim in_grad_dims = + (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) + : phi::make_ddim({nt, h, w, c})); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = + input_grad->mutable_data(in_grad_dims, dev_ctx.GetPlace()); + + int pixelNum = nt * chw; + int threads = 1024; + int grid = (pixelNum + threads - 1) / threads; + int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads; + grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); + + if (data_layout == DataLayout::kNCHW) { + KeTemporalShiftBwNCHW<<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + KeTemporalShiftBwNHWC<<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(temporal_shift_grad, + GPU, + ALL_LAYOUT, + phi::TemporalShiftGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu new file mode 100644 index 0000000000..34d80a1bc8 --- /dev/null +++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
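
(Editor's note, illustration only.) The launch configuration used just above relies on the grad kernel's grid-stride loop: the grid only needs to be large enough to saturate the device, since each block strides over the remaining elements. A minimal host-side sketch of the same arithmetic, with made-up problem sizes and device limits standing in for what the kernel actually reads from the tensor dims and dev_ctx:

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical numbers; in the kernel these come from nt*c*h*w and from
  // dev_ctx.GetMaxPhysicalThreadCount() / dev_ctx.GetSMCount().
  const int pixel_num = 4 * 2 * 8 * 28 * 28;    // assumed element count
  const int threads = 1024;                     // threads per block
  const int max_physical_threads = 2048 * 80;   // assumed device capacity
  const int sm_count = 80;                      // assumed number of SMs

  int grid = (pixel_num + threads - 1) / threads;      // enough blocks to cover all elements
  int blocks_per_sm = max_physical_threads / threads;  // blocks one SM can host at once
  grid = std::min(sm_count * blocks_per_sm, grid);     // cap it; the stride loop covers the rest

  std::printf("threads=%d grid=%d\n", threads, grid);
  return 0;
}

Capping the grid this way avoids launching far more blocks than the device can run concurrently while keeping every element covered.
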
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/temporal_shift_kernel.h" + +namespace phi { + +template +__global__ void KeTemporalShiftFwNCHW(const T* input, + T* output, + const int ntchw, + const int tchw, + const int chw, + const int hw, + const int t, + const int c1, + const int c2) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int src_it = 0; + + for (; tid < ntchw; tid += stride) { + int it = (tid % tchw) / chw; + int ic = (tid % chw) / hw; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[tid] = 0; + } else { + output[tid] = input[tid + (src_it - it) * chw]; + } + } +} + +template +__global__ void KeTemporalShiftFwNHWC(const T* input, + T* output, + const int nthwc, + const int thwc, + const int hwc, + const int t, + const int c, + const int c1, + const int c2) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int src_it = 0; + + for (; tid < nthwc; tid += stride) { + int it = (tid % thwc) / hwc; + int ic = tid % c; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[tid] = 0; + } else { + output[tid] = input[tid + (src_it - it) * hwc]; + } + } +} + +template +void TemporalShiftKernel(const Context& dev_ctx, + const DenseTensor& x, + int seg_num, + float shift_ratio, + const std::string& data_format_str, + DenseTensor* out) { + auto* input = &x; + auto* output = out; + int t = seg_num; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format_str); + + const int nt = input->dims()[0]; + const int c = + (data_layout == DataLayout::kNCHW ? input->dims()[1] : input->dims()[3]); + const int h = + (data_layout == DataLayout::kNCHW ? input->dims()[2] : input->dims()[1]); + const int w = + (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]); + + const int hw = h * w; + const int chw = c * hw; + const int tchw = t * chw; + const int ntchw = nt * chw; + + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + DDim out_dims = + (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) + : phi::make_ddim({nt, h, w, c})); + const T* input_data = input->data(); + T* output_data = output->mutable_data(out_dims, dev_ctx.GetPlace()); + + int pixelNum = nt * chw; + int threads = 1024; + int grid = (pixelNum + threads - 1) / threads; + int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads; + grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); + + if (data_layout == DataLayout::kNCHW) { + KeTemporalShiftFwNCHW<<>>( + input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + KeTemporalShiftFwNHWC<<>>( + input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(temporal_shift, + GPU, + ALL_LAYOUT, + phi::TemporalShiftKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 7924276414..37273b7944 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -222,6 +222,24 @@ void EluDoubleGradKernel(const Context& dev_ctx, functor(dev_ctx, &x, &ddx, ddout, &dout, dx); } +template +void LogitGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + float eps, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + + auto eigen_x = EigenVector::Flatten(x); + auto eigen_dout = EigenVector::Flatten(out_grad); + auto eigen_dx = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + auto eigen_p = EigenVector::Flatten(x); + + funcs::LogitGradFunctor functor; + functor(place, eigen_x, eigen_dout, eigen_dx, eigen_p, eps); +} + template void SigmoidDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, diff --git a/paddle/phi/kernels/impl/activation_impl.h b/paddle/phi/kernels/impl/activation_impl.h index c2d160caf7..1a62c4e06b 100644 --- a/paddle/phi/kernels/impl/activation_impl.h +++ b/paddle/phi/kernels/impl/activation_impl.h @@ -47,6 +47,22 @@ void ActivationImpl(const Context& dev_ctx, } } +template +void LogitKernel(const Context& dev_ctx, + const DenseTensor& x, + float eps, + DenseTensor* out) { + dev_ctx.template Alloc(out); + + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(x); + auto& place = *dev_ctx.eigen_device(); + auto eigen_p = EigenVector::Flatten(*out); + + funcs::LogitFunctor functor; + functor(place, eigen_in, eigen_out, eigen_p, eps); +} + template void PowKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.cc b/paddle/phi/kernels/selected_rows/activation_kernel.cc new file mode 100644 index 0000000000..438a080a63 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/activation_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
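
(Editor's note, illustration only.) The Eigen functors behind LogitKernel/LogitGradKernel compute logit(x) = ln(x / (1 - x)) with the input clamped to [eps, 1 - eps], and a gradient of dout / (x * (1 - x)) that is zeroed where the input fell outside that range. A scalar sketch of the same formulas; the helper names logit_fwd/logit_bwd are ours, not Paddle's:

#include <cmath>
#include <cstdio>

// Scalar mirror of LogitFunctor: clamp to [eps, 1 - eps], then ln(x / (1 - x)).
// With eps == 0, inputs outside [0, 1] yield NaN, matching the functor.
double logit_fwd(double x, double eps) {
  if (eps == 0.0 && (x < 0.0 || x > 1.0)) return NAN;
  double t = std::fmin(std::fmax(x, eps), 1.0 - eps);
  return std::log(t / (1.0 - t));
}

// Scalar mirror of LogitGradFunctor: d logit / dx = 1 / (x * (1 - x)),
// zeroed where the forward input was clamped.
double logit_bwd(double x, double dout, double eps) {
  if (x < eps || x > 1.0 - eps) return 0.0;
  return dout / (x * (1.0 - x));
}

int main() {
  std::printf("logit(0.75) = %f\n", logit_fwd(0.75, 1e-6));       // ~1.0986
  std::printf("grad(0.75)  = %f\n", logit_bwd(0.75, 1.0, 1e-6));  // ~5.3333
  return 0;
}
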
+ +#include "paddle/phi/kernels/selected_rows/activation_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace phi { +namespace sr { + +template +void SquareKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::SquareKernel(dev_ctx, x.value(), out->mutable_value()); +} + +template +void SqrtKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::SqrtKernel(dev_ctx, x.value(), out->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL( + square_sr, CPU, ALL_LAYOUT, phi::sr::SquareKernel, float, double) {} + +PD_REGISTER_KERNEL( + sqrt_sr, CPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(square_sr, + GPU, + ALL_LAYOUT, + phi::sr::SquareKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + sqrt_sr, GPU, ALL_LAYOUT, phi::sr::SqrtKernel, float, double) {} + +#endif diff --git a/paddle/phi/kernels/selected_rows/activation_kernel.h b/paddle/phi/kernels/selected_rows/activation_kernel.h new file mode 100644 index 0000000000..6518f95539 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/activation_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void SquareKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +template +void SqrtKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/temporal_shift_grad_kernel.h b/paddle/phi/kernels/temporal_shift_grad_kernel.h new file mode 100644 index 0000000000..1bcd3d61c2 --- /dev/null +++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TemporalShiftGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int seg_num, + float shift_ratio, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/temporal_shift_kernel.h b/paddle/phi/kernels/temporal_shift_kernel.h new file mode 100644 index 0000000000..a927d7fb23 --- /dev/null +++ b/paddle/phi/kernels/temporal_shift_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TemporalShiftKernel(const Context& ctx, + const DenseTensor& x, + int seg_num, + float shift_ratio, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 7919769ec8..34f830abe7 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -43,17 +43,19 @@ namespace phi { #define comma , -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", ); // NOLINT + DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, @@ -61,6 +63,7 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold"); 
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT @@ -74,12 +77,41 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish, "offset"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta"); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh, + "stanh", + "scale_a" comma "scale_b"); // NOLINT + +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus, + "softplus", + "beta" comma "threshold"); // NOLINT + +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT + DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, "hard_sigmoid", "slope" comma "offset"); // NOLINT +KernelSignature SqrtActiOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("sqrt", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"}); + } +} + +KernelSignature SquareActiOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("square", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("square_sr", {"X"}, {}, {"Out"}); + } +} DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Round, "round", ); // NOLINT DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Floor, "floor", ); // NOLINT @@ -132,6 +164,11 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); } +KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "logit_grad", {"X", GradVarName("Out")}, {"eps"}, {GradVarName("X")}); +} + KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("elu_grad", {"X", "Out", GradVarName("Out")}, @@ -194,6 +231,18 @@ PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, + phi::ReciprocalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mish_grad, phi::MishGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping); + PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tanh_grad, 
phi::TanhGradOpArgumentMapping); @@ -228,11 +277,16 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, phi::LogSigmoidGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, phi::HardSigmoidGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtActiOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareActiOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, phi::HardSwishGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/temporal_shift_sig.cc b/paddle/phi/ops/compat/temporal_shift_sig.cc new file mode 100644 index 0000000000..a686c37ff7 --- /dev/null +++ b/paddle/phi/ops/compat/temporal_shift_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TemporalShiftOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("temporal_shift", + {"X"}, + {"seg_num", "shift_ratio", "data_format"}, + {"Out"}); +} + +KernelSignature TemporalShiftGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("temporal_shift_grad", + {GradVarName("Out")}, + {"seg_num", "shift_ratio", "data_format"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad, + phi::TemporalShiftGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 825d74388b..eb4243ef1c 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -342,4 +342,5 @@ class TestLogDoubleGradCheck(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py new file mode 100644 index 0000000000..5c07a544ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest +import paddle + + +class TestSparseSquareOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7] + self.row_numel = 12 + + x_selected_rows = scope.var('X').get_selected_rows() + x_selected_rows.set_height(height) + x_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), self.row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + x_tensor = x_selected_rows.get_tensor() + x_tensor.set(np_array, place) + + out_selected_rows = scope.var('Out').get_selected_rows() + # create and run sqrt operator + square_op = Operator("square", X='X', Out='Out') + square_op.run(scope, place) + + # get and compare result + result_array = np.array(out_selected_rows.get_tensor()) + + self.assertTrue(np.array_equal(result_array, np.square(np_array))) + + def test_sparse_acti(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +class TestSparseSqrtOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7] + self.row_numel = 12 + + x_selected_rows = scope.var('X1').get_selected_rows() + x_selected_rows.set_height(height) + x_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), self.row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + x_tensor = x_selected_rows.get_tensor() + x_tensor.set(np_array, place) + + out_selected_rows = scope.var('Out1').get_selected_rows() + # create and run sqrt operator + sqrt_op = Operator("sqrt", X='X1', Out='Out1') + sqrt_op.run(scope, place) + + # get and compare result + result_array = np.array(out_selected_rows.get_tensor()) + self.assertTrue(np.allclose(result_array, np.sqrt(np_array))) + + def test_sparse_acti(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index de7aaf9479..7f137cf137 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle from op_test import OpTest import paddle.fluid as fluid @@ -153,4 +154,5 @@ class TestClipByNormOpWithSelectedRows(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 
5bab4a52bf..407a252e1a 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -143,4 +143,5 @@ class TestTemporalShiftAPI(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() -- GitLab
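
(Editor's note, illustration only.) As context for the temporal_shift tests above, here is a scalar reference of the NCHW forward mapping the new kernels implement: the first c1 = c * shift_ratio channels read from frame t-1, the next channels up to c2 = 2 * c * shift_ratio read from frame t+1, the remaining channels stay in place, and out-of-range frames produce zeros. The helper name temporal_shift_ref is ours; this is a sketch of the index arithmetic, not the production kernel:

#include <cstdio>
#include <vector>

// Reference NCHW temporal shift; x is laid out as [n, t, c, h, w] flattened.
std::vector<float> temporal_shift_ref(const std::vector<float>& x, int n, int t,
                                      int c, int h, int w, float shift_ratio) {
  const int hw = h * w, chw = c * hw, tchw = t * chw;
  const int c1 = static_cast<int>(c * shift_ratio);
  const int c2 = static_cast<int>(c * 2 * shift_ratio);
  std::vector<float> out(x.size(), 0.f);
  for (int i = 0; i < n * tchw; ++i) {
    int it = (i % tchw) / chw;  // frame index
    int ic = (i % chw) / hw;    // channel index
    int src_it = ic < c1 ? it - 1 : (ic < c2 ? it + 1 : it);  // shift back / forward / keep
    if (src_it >= 0 && src_it < t) out[i] = x[i + (src_it - it) * chw];
  }
  return out;
}

int main() {
  // Tiny example: n=1, t=2, c=4, h=w=1, shift_ratio=0.25 -> c1=1, c2=2.
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> y = temporal_shift_ref(x, 1, 2, 4, 1, 1, 0.25f);
  for (float v : y) std::printf("%g ", v);  // prints: 0 6 3 4 1 0 7 8
  std::printf("\n");
  return 0;
}

The backward kernels in this patch invert the same mapping by flipping the +1/-1 frame offsets, which is why the NCHW and NHWC grad loops mirror the forward ones so closely.
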