diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 72f57b661e911afb59198704b6601edca955aa21..62b805cf422d95db1d6d0dd4f8295795551da13d 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1513,6 +1513,10 @@ REGISTER_ACTIVATION_OP(tanh_shrink,
                        TanhShrinkFunctor,
                        TanhShrinkGradFunctor);
 REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
+REGISTER_ACTIVATION_OP(softsign,
+                       Softsign,
+                       SoftsignFunctor,
+                       SoftsignGradFunctor);
 REGISTER_ACTIVATION_OP(hard_sigmoid,
                        HardSigmoid,
                        HardSigmoidFunctor,
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 26ee9eb11bc8cdffc98b7b400376f481786d0deb..eeec8c300daaca92c16ea2f39d7bdfc73990422a 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -290,6 +290,7 @@ USE_PHI_FUNCTOR(TanhShrink)
 USE_PHI_FUNCTOR(Silu)
 USE_PHI_FUNCTOR(ELU)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
+USE_PHI_FUNCTOR(Softsign)
 USE_PHI_FUNCTOR(Sigmoid)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
 USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
@@ -493,35 +494,8 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }
 
-template <typename T>
-struct SoftsignFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x / (static_cast<T>(1) + x.abs());
-  }
-};
-
-// d(softsign(x))/dx = 1 / (1 + |x|)^2
-// Taken from https://en.wikipedia.org/wiki/Activation_function
-
-template <typename T>
-struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device,
-            typename X,
-            typename Out,
-            typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) =
-        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_ACTIVATION_OP(__macro)                                 \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);   \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);
+#define FOR_EACH_ACTIVATION_OP(__macro) \
+  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 4e56721cb30e59a7f056341196e68287f02bb443..9aafb70c7dce576374cc90fe7abdc5ef2aff7eac 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -66,29 +66,6 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // softsign(x) = x / (1 + abs(x))
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x / (one + abs(x));
-  }
-};
-
-template <typename T>
-struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout / (1 + abs(x))^2
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T temp = one + abs(x);
-    return dout / (temp * temp);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename DeviceContext, typename Functor>
 class ActivationCudaKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -174,6 +151,7 @@ USE_PHI_FUNCTOR(CudaSoftShrink)
 USE_PHI_FUNCTOR(CudaTanhShrink)
 USE_PHI_FUNCTOR(CudaSilu)
 USE_PHI_FUNCTOR(CudaELU)
+USE_PHI_FUNCTOR(CudaSoftsign)
 USE_PHI_FUNCTOR(CudaSigmoid)
 USE_PHI_FUNCTOR(CudaLogSigmoid)
 USE_PHI_FUNCTOR(CudaHardSigmoid)
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index dd7ec0af6f11c9d611c8258248d57bd1155317db..14c15157b25c14d5dde022e6c1531bc3e835a642 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -2152,6 +2152,17 @@
     use_gpudnn : true
   backward : softmax_grad
 
+# softsign
+- api : softsign
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : softsign
+  backward : softsign_grad
+
 - api : spectral_norm
   args : (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps)
   output : Tensor
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 804a653348ce341f5c5ba539845578bfdaeb5567..53cf8c08bd548ec40299fe3b21bfb8695e3b9d02 100644
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -2037,6 +2037,17 @@
     func : softmax_grad
     use_gpudnn : true
 
+- backward_api : softsign_grad
+  forward : softsign (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : softsign_grad
+  inplace : (out_grad -> x_grad)
+
 - backward_api : spectral_norm_grad
   forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out)
   args : (Tensor weight, Tensor u, Tensor v, Tensor out_grad, int dim, int power_iters, float eps)
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
index ea3332627883d3209a3420289cd4d49cf2b4401a..46a9830882f6f3ae002b52e692854a8ac338dcbb 100644
--- a/paddle/phi/kernels/activation_grad_kernel.h
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -213,6 +213,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Square);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Softsign);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
index 6d0a380e28b19fb6e95c5408157ab10800df5af9..8a83226b23027eafb8c0295f7cd4724fcd1b78ed 100644
--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -61,6 +61,7 @@ DECLARE_ACTIVATION_KERNEL(Reciprocal)
 DECLARE_ACTIVATION_KERNEL(Square)
 DECLARE_ACTIVATION_KERNEL(Sqrt)
 DECLARE_ACTIVATION_KERNEL(Rsqrt)
+DECLARE_ACTIVATION_KERNEL(Softsign)
 DECLARE_ACTIVATION_KERNEL(Sigmoid)
 DECLARE_ACTIVATION_KERNEL(LogSigmoid)
 DECLARE_ACTIVATION_KERNEL(Log)
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index c498fa48706e72f6077ff77468d88a270ae49542..7927f595927fcb7e3e77cbd1665514a161455fbc 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -136,6 +136,7 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, SoftsignGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
@@ -335,6 +336,7 @@ PD_REGISTER_KERNEL(square_double_grad,
                    phi::dtype::float16,
                    int,
                    int64_t) {}
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index 8560576a4256712898d45ab6f38dc3a384fd5c07..ac75c4ad3479fe3f24b2ddfb3225988a8f76afc1 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -80,6 +80,7 @@ DEFINE_CPU_ACTIVATION_KERNEL(Reciprocal, ReciprocalFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Square, SquareFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Sqrt, SqrtFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Softsign, SoftsignFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
@@ -173,6 +174,7 @@ PD_REGISTER_KERNEL(expm1,
 PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
 PD_REGISTER_KERNEL(
     square, CPU, ALL_LAYOUT, phi::SquareKernel, float, double, int, int64_t) {}
+PD_REGISTER_ACTIVATION_KERNEL(softsign, SoftsignKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index bd5e3dec3d6b05f7b0b8908484205e7bd93fee91..ee79cafd155dcfbdedf66ca4a56dd299d6cb6ab4 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -1364,6 +1364,32 @@ struct SiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+template <typename T>
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x / (static_cast<T>(1) + x.abs());
+  }
+};
+
+// d(softsign(x))/dx = 1 / (1 + |x|)^2
+// Taken from https://en.wikipedia.org/wiki/Activation_function
+
+template <typename T>
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 // sigmoid(x) = 1 / (1 + exp(-x))
 template <typename T>
 struct SigmoidFunctor : public BaseActivationFunctor<T> {
@@ -3019,6 +3045,31 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+template <typename T>
+struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // softsign(x) = x / (1 + abs(x))
+  __device__ __forceinline__ T operator()(const T x) const {
+    // Using abs directly will cause namespace conflict
+    return x / (one + (x > -x ? x : -x));
+  }
+};
+
+template <typename T>
+struct CudaSoftsignGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout / (1 + abs(x))^2
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    // Using abs directly will cause namespace conflict
+    T temp = one + (x > -x ? x : -x);
+    return dout / (temp * temp);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index a53c2a05d83fcd477c63567d8f2b51b7192ab333..53f727ec51a39b3faed9ecf8684790d7e4670694 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -195,6 +195,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, CudaSoftsignGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
@@ -415,6 +416,7 @@ PD_REGISTER_KERNEL(square_double_grad,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
 
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index b7ff76f7446458d4ebd5bb6b2ef8f2a3fe859e4d..6e116a3e157503563123bd096466231747763005 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -97,6 +97,7 @@ DEFINE_GPU_ACTIVATION_KERNEL(Reciprocal, CudaReciprocalFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Square, CudaSquareFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Sqrt, CudaSqrtFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Softsign, CudaSoftsignFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
@@ -241,6 +242,7 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(softsign, SoftsignKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index 93717e715690e24ea48920dcf43bedadc5d6bfbd..fff02697f5f88f831d89ae4d90ef2f3bf7b4db8e 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -62,6 +62,7 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softsign, "softsign", );       // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", );                 // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", );               // NOLINT
@@ -294,6 +295,7 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(softsign_grad, phi::SoftsignGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad,
                            phi::SigmoidDoubleGradOpArgumentMapping);
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 4b817bb22ebf764b289c372d9b16868762e427ad..5edb1185ad6445d7af4d8de4f09bc461f62142fb 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -2795,6 +2795,7 @@ class TestSoftsign(TestActivation):
     def setUp(self):
         self.op_type = "softsign"
         self.init_dtype()
+        self.python_api = paddle.nn.functional.softsign
 
         np.random.seed(1024)
         x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype)
@@ -2805,7 +2806,7 @@ class TestSoftsign(TestActivation):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=True)
 
 
 class TestSoftsignAPI(unittest.TestCase):
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 7c2dff5247e21b680a40e317b86768b9d56b5c89..f0f04fb133283517f6c73af57c23e74f7dd2975a 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -1275,6 +1275,8 @@ def softsign(x, name=None):
             x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3]))
             out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769]
     """
+    if in_dygraph_mode():
+        return _C_ops.final_state_softsign(x)
     if in_dynamic_mode():
        return _C_ops.softsign(x)