From 21beb082e9aac6936f0199265afb7f9e9d56af14 Mon Sep 17 00:00:00 2001
From: phlrain
Date: Thu, 17 Mar 2022 05:29:40 +0000
Subject: [PATCH] add some grad kernel; test=develop

---
 paddle/fluid/operators/activation_op.h        | 13 ----
 paddle/fluid/operators/activation_op.kps      | 12 ----
 .../phi/kernels/cpu/activation_grad_kernel.cc |  9 +++
 paddle/phi/kernels/cpu/activation_kernel.cc   |  4 +-
 paddle/phi/kernels/funcs/activation_functor.h | 71 ++++++++++++++++---
 .../phi/kernels/gpu/activation_grad_kernel.cu | 10 +++
 6 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 526d35af6e0..98c73c3cb3f 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -440,19 +440,6 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct Expm1GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 // relu(x) = max(x, 0)
 template <typename T>
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 17476c48e41..e63bf6ec47e 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -140,18 +140,6 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout * out
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out + dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaLogFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index 5b397e7774a..d3d6989696b 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -104,6 +104,7 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
 
@@ -214,3 +215,11 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t) {}
+
+PD_REGISTER_KERNEL(expm1_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index edad9734cce..46a85a2b9b2 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -120,12 +120,12 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(mish, Mish)
+PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(softplus, Softplus)
+PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
 PD_REGISTER_ACTIVATION_KERNEL(softsign, SoftsignKernel)
 
 PD_REGISTER_KERNEL(
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 372834d657a..eb98ee2ea26 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -157,9 +157,10 @@ struct LogitFunctor {
   }
 };
 
-// mish(x) = x * tanh(softplus(x))
-// softplus(x) = x, if x > threshold
-//             = ln(1 + exp(x)), otherwise
+// // mish(x) = x * tanh(softplus(x))
+// // softplus(x) = x, if x > threshold
+// //             = ln(1 + exp(x)), otherwise
+
 template <typename T>
 struct MishFunctor : public BaseActivationFunctor<T> {
   float threshold;
@@ -168,7 +169,7 @@ struct MishFunctor : public BaseActivationFunctor<T> {
   }
 
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     auto sp = (x > static_cast<T>(threshold))
                   .select(x, (static_cast<T>(1) + x.exp()).log());
     out.device(d) = x * sp.tanh();
@@ -244,20 +245,41 @@ struct RsqrtFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-// For numerical stability, using the following formula instead of softplus(x) =
-// log(1 + exp(x))
-// softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
-// 1, threshold = 20 by default), otherwise x
+// // For numerical stability, using the following formula instead of
+// softplus(x) =
+// // log(1 + exp(x))
+// // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <=
+// threshold(beta =
+// // 1, threshold = 20 by default), otherwise x
+// template <typename T>
+// struct SoftplusFunctor : public BaseActivationFunctor<T> {
+//   float beta;
+//   float threshold;
+//   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+//     return {{"beta", &beta}, {"threshold", &threshold}};
+//   }
+
+//   template <typename Device, typename X, typename Out>
+//   void operator()(Device d, X x, Out out) {
+//     auto x_beta = static_cast<T>(beta) * x;
+//     out.device(d) = (x_beta > static_cast<T>(threshold))
+//                         .select(x,
+//                                 (static_cast<T>(1) + x_beta.exp()).log() /
+//                                     static_cast<T>(beta));
+//   }
+// };
+
 template <typename T>
 struct SoftplusFunctor : public BaseActivationFunctor<T> {
   float beta;
   float threshold;
+
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"beta", &beta}, {"threshold", &threshold}};
   }
 
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     auto x_beta = static_cast<T>(beta) * x;
     out.device(d) = (x_beta > static_cast<T>(threshold))
                         .select(x,
@@ -602,6 +624,22 @@ struct Expm1Functor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct Expm1GradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 // relu(x) = max(x, 0)
 template <typename T>
 struct ReluCPUFunctor : public BaseActivationFunctor<T> {
@@ -822,11 +860,10 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-// softsign(x) = x / (1 + |x|)
 template <typename T>
 struct SoftsignFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) {
+  void operator()(Device d, X x, Out out) const {
     out.device(d) = x / (static_cast<T>(1) + x.abs());
   }
 };
@@ -1264,6 +1301,18 @@ struct CudaExpm1Functor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout * out
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out + dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 template <typename T>
 struct CudaSinFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index a7f89ff85d6..00c08ff497d 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -158,6 +158,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, CudaExpGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -274,9 +275,18 @@ PD_REGISTER_KERNEL(exp_grad,
                    double,
                    int,
                    int64_t) {}
+
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+
+PD_REGISTER_KERNEL(expm1_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Expm1GradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
-- 
GitLab
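Note on the gradient formula moved into Expm1GradFunctor / CudaExpm1GradFunctor above: since the forward output is out = expm1(x) = e^x - 1, the derivative d expm1/dx = e^x equals out + 1, so the backward pass computes dx = dout * (out + 1) = dout * out + dout and only needs the forward output, which is why the functors report ActBwdOpFwdDeps::kDepOut. The following standalone C++ sketch is not part of the patch; it only illustrates the identity with a finite-difference check, and the program and its names are hypothetical.

// Standalone sketch (not from the patch): verify that d/dx expm1(x) == expm1(x) + 1,
// the identity behind "dx.device(d) = dout * out + dout" in Expm1GradFunctor.
#include <cmath>
#include <cstdio>
#include <initializer_list>

int main() {
  const double eps = 1e-6;
  for (double x : {-2.0, -0.5, 0.0, 0.7, 3.0}) {
    double out = std::expm1(x);
    // Gradient as the functor computes it, with dout == 1: dout * out + dout.
    double grad_functor = out + 1.0;
    // Central finite difference of expm1 at x for comparison.
    double grad_numeric = (std::expm1(x + eps) - std::expm1(x - eps)) / (2 * eps);
    std::printf("x=% .2f  functor=%.6f  numeric=%.6f\n", x, grad_functor, grad_numeric);
  }
  return 0;
}

Relying on the output rather than the input is what lets the expm1_grad kernels registered above be fused with the forward output tensor instead of retaining x.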