diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index d578ada0db00fed85f7b4f25f1483169c72c2c0b..ef2e83ced26e07f199a122ee3157eb428b63aec9 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); namespace paddle { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 219aae71127ed8963b4bfe4e8ee5e7259dbf7d02..eadb00b9e88e14075c46a53c711fd43774f26581 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -32,7 +32,7 @@ USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); @@ -48,7 +48,7 @@ USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb5969aafea47ad23248acd5a6ca3644..1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } USE_OP_ITSELF(relu); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379bfa4ffbbc5cfc4cecd1872c7c18..f17e00de0eeb7c8f4d782f0a4eaecc2fc1df268b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b68692abe588b6429b027462eaebde24..4205f2253a652ccc5f6d4886df1b1194f5e5062f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1567,23 +1570,6 @@ REGISTER_OPERATOR( 
ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1609,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cdf2e5c2993f7a72f512cc924c208e7..b076db01c22c62b17fdd85b7208467eea1375fed 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor { template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) template struct SigmoidGradFunctor : public BaseActivationFunctor { @@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout 
= (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. 
- typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUFunctor : public BaseActivationFunctor { float alpha; @@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct HardSigmoidFunctor : public BaseActivationFunctor { float slope; @@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = 
ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct LeakyReluGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* 
dX) const { - if (ddOut) { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ @@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 92a101451e211f912e5390171654affa3be4e973..256f20db08445e8b8d5933aa0e3151f69fcb5b10 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,38 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? 
dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSoftReluFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? 
dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) + } // namespace operators } // namespace paddle @@ -1270,20 +1158,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - /* ======================== elu register ============================ */ REGISTER_OP_CUDA_KERNEL( elu, ops::ActivationCudaKernel>); /* ========================================================================== */ -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL( CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index f34e5710ab7294425bacba4e5d5782859ac5f081..a5b737b28c23ba97988915f00cbf447d2e1b1c22 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -39,6 +39,54 @@ void ReluDoubleGradKernel(const Context& 
dev_ctx, const DenseTensor& ddx, DenseTensor* ddout); +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +template +void BReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float t_min, + float t_max, + DenseTensor* dx); + +template +void LeakyReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout); + +template +void ThresholdedReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float threshold, + DenseTensor* dx); + DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); @@ -51,5 +99,6 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index bdf8f4363598f8c25e6f128b3f38f13f23005828..885dccad8e377642b4cb9e36832ac4bd45f7915f 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -36,5 +36,25 @@ DECLARE_ACTIVATION_KERNEL(Asinh) DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) +DECLARE_ACTIVATION_KERNEL(Tanh) + +template +void BReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float t_min, + float t_max, + DenseTensor* out); + +template +void LeakyReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float alpha, + DenseTensor* out); + +template +void ThresholdedReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index fe43ebb816077432ca4e7f678be4591e5d31b6f7..f9af50f6832a1884f3ef58ccb5708b1f2636ccea 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -27,65 +27,135 @@ namespace phi { const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const 
DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + funcs::LeakyReluGradFunctor, + alpha); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( + ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + funcs::BReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL( - cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, 
float, double) {} -PD_REGISTER_KERNEL( - asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} PD_REGISTER_KERNEL( relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} -PD_REGISTER_KERNEL(relu_double_grad, + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) + +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, + ReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, + TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) + +PD_REGISTER_KERNEL(tanh_triple_grad, CPU, ALL_LAYOUT, - phi::ReluDoubleGradKernel, + phi::TanhTripleGradKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 51883f25183af7c8013bbfb403404397c8492988..0d13429c8f651ccb40646fddd82a3529a95ab45d 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -27,6 +27,33 @@ namespace phi { ActivationImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + 
*(attrs[1].second) = attr2; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) @@ -39,17 +66,31 @@ DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + funcs::ThresholdedReluFunctor, + threshold) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) } // namespace phi -PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} -PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {} -PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} -PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} -PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} -PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} -PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} -PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} -PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} -PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} -PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) +PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) +PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) +PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) +PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) +PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) +PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) +PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) +PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) +PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) +PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) +PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 1a36e4e132f41720b6f9fc563026082e21971d96..c8fb54bb102d389cf005bac6d0f0edb78fb845ee 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -513,7 +513,270 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct 
TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; +/* + Out + DOut D_Dout + DDx -> TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + if (alpha < 1.f) { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } else { + out.device(d) = x.cwiseMin(static_cast(alpha) * x); + } + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = + static_cast(alpha) * (x < static_cast(0)).template cast(); + auto temp2 = (x >= static_cast(0)).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + if (ddOut) { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); + ddout.device(*d) = + ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public 
BaseActivationFunctor { T zero = static_cast(0.0f); @@ -824,6 +1087,133 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + __device__ __forceinline__ T operator()(const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + __device__ __forceinline__ T operator()(const T x) const { + return x > static_cast(threshold) ? x : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > static_cast(threshold) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : static_cast(alpha) * x; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 
1 : alpha) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > zero ? dout : static_cast(alpha) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index c2995c79a7e8c2651ed4aa16d75d59c8f24c96dc..00792b8ab607036112295f2dd4018c69eb78680a 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -79,113 +79,97 @@ void ActivationGradGPUImpl(const Context& dev_ctx, const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + 
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, + CudaThresholdedReluGradFunctor, + threshold); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + CudaBReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL(cos_grad, - GPU, - ALL_LAYOUT, - phi::CosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(tan_grad, - GPU, - ALL_LAYOUT, - phi::TanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos_grad, - GPU, - ALL_LAYOUT, - phi::AcosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sin_grad, - GPU, - ALL_LAYOUT, - phi::SinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin_grad, - GPU, - ALL_LAYOUT, - phi::AsinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan_grad, - GPU, - ALL_LAYOUT, - phi::AtanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh_grad, - GPU, - ALL_LAYOUT, - phi::SinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh_grad, - GPU, - ALL_LAYOUT, - phi::CoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh_grad, - GPU, - ALL_LAYOUT, - phi::AsinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh_grad, - GPU, - ALL_LAYOUT, - phi::AcoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh_grad, - GPU, - ALL_LAYOUT, - phi::AtanhGradKernel, - float, - double, - phi::dtype::float16) {} + #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(relu_grad, GPU, @@ -219,3 +203,34 @@ PD_REGISTER_KERNEL(relu_double_grad, phi::dtype::float16, phi::dtype::bfloat16) {} #endif + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) 
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 26752b89e7c345f88cdbe2000b119c07507d2c37..3c340a89f5746bd8de31826f7639e6ed0b7391f6 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -46,6 +46,35 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) @@ -58,6 +87,14 @@ DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor) + +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + CudaThresholdedReluFunctor, + threshold) + +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) } // namespace phi @@ -79,65 +116,29 @@ PD_REGISTER_KERNEL(relu, phi::dtype::float16, phi::dtype::bfloat16) {} #endif -PD_REGISTER_KERNEL( - sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos, - GPU, - ALL_LAYOUT, - phi::AcosKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin, - GPU, - ALL_LAYOUT, - phi::AsinKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan, - GPU, - ALL_LAYOUT, - phi::AtanKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh, - GPU, - ALL_LAYOUT, - phi::SinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh, - GPU, - ALL_LAYOUT, - phi::CoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh, - 
GPU,
-                   ALL_LAYOUT,
-                   phi::AsinhKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(acosh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AcoshKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(atanh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AtanhKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
+
+#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                        \
+                     GPU,                         \
+                     ALL_LAYOUT,                  \
+                     phi::func,                   \
+                     float,                       \
+                     double,                      \
+                     phi::dtype::float16,         \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h
index 80e23d2b8e24b875fcc03bc0c1c149c0c13e3e41..a48a6226f23f8d9976dc86e59b051828b1d71b21 100644
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -130,4 +130,76 @@ void ReluDoubleGradKernel(const Context& dev_ctx,
                           relu_double_grad_functor);
 }
 
+template <typename T, typename Context>
+void LeakyReluDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& ddx,
+                               float alpha,
+                               DenseTensor* ddout) {
+  funcs::LeakyReluGradGradFunctor<T> leaky_relu_double_grad_functor;
+  leaky_relu_double_grad_functor.alpha = alpha;
+  ActivationDoubleGradImpl<T, Context, funcs::LeakyReluGradGradFunctor<T>>(
+      dev_ctx,
+      &x,
+      nullptr,
+      &ddx,
+      nullptr,
+      nullptr,
+      ddout,
+      leaky_relu_double_grad_functor);
+}
+
+template <typename T, typename Context>
+void TanhDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          DenseTensor* dout_new,
+                          DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::TanhGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void TanhTripleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          const DenseTensor& d_ddout,
+                          const DenseTensor& d_dout_new,
+                          DenseTensor* d_out_new,
+                          DenseTensor* d_dout,
+                          DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_out_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_ddx->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::TanhTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,  // input
+          d_dout,
+          d_out_new,
+          d_ddx);  // output
+}
+
 }  // namespace phi
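The double- and triple-grad kernels above follow one guard pattern for their optional outputs: every non-null output tensor is resized to the shape of its reference input and allocated before the shared funcs::TanhGradGradFunctor / TanhTripleGradFunctor writes into it, and each branch resizes the very tensor it is about to allocate. Below is a minimal sketch of that guard using a simplified FakeTensor stand-in rather than phi::DenseTensor and dev_ctx.template Alloc<T>().

// Minimal sketch (simplified Tensor stand-in, not the real phi::DenseTensor)
// of the optional-output guard used by TanhTripleGradKernel above.
#include <cstddef>
#include <vector>

struct Dims { std::size_t n = 0; };

struct FakeTensor {
  Dims dims_;
  std::vector<float> data_;
  const Dims& dims() const { return dims_; }
  void Resize(const Dims& d) { dims_ = d; }
  void Alloc() { data_.assign(dims_.n, 0.f); }
};

// Only outputs the caller asked for (non-null pointers) are touched; each
// branch resizes and allocates its own tensor before the functor fills it.
void PrepareTripleGradOutputs(const FakeTensor& out, const FakeTensor& ddx,
                              FakeTensor* d_out_new, FakeTensor* d_dout,
                              FakeTensor* d_ddx) {
  if (d_dout) {
    d_dout->Resize(out.dims());
    d_dout->Alloc();
  }
  if (d_out_new) {
    d_out_new->Resize(out.dims());
    d_out_new->Alloc();
  }
  if (d_ddx) {
    d_ddx->Resize(ddx.dims());
    d_ddx->Alloc();
  }
}

int main() {
  FakeTensor out, ddx, d_out_new, d_dout, d_ddx;
  out.Resize({4});
  ddx.Resize({4});
  // Request all three outputs; pass nullptr to skip any of them.
  PrepareTripleGradOutputs(out, ddx, &d_out_new, &d_dout, &d_ddx);
  return d_ddx.data_.size() == 4 ? 0 : 1;
}

Passing nullptr for an output simply skips the corresponding branch, which is how callers request only a subset of the higher-order gradients.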
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index 396830ca20765bc24d9ddc0e9d09ef045d376dfc..cbfca5b17ae995a89360c6d6d4987028d95dc281 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -16,40 +16,80 @@ limitations under the License. */
 
 namespace phi {
 
-#define DefineActGradDepXOpArgMap(func_name, op_name)                        \
-  KernelSignature func_name##GradOpArgumentMapping(                          \
-      const ArgumentMappingContext& ctx) {                                   \
-    return KernelSignature(                                                  \
-        op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \
+#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(          \
+      const ArgumentMappingContext& ctx) {                   \
+    return KernelSignature(op_name "_grad",                  \
+                           {"X", GradVarName("Out")},        \
+                           {attrs},                          \
+                           {GradVarName("X")});              \
   }
 
-#define DefineActGradDepOutOpArgMap(func_name, op_name)                         \
-  KernelSignature func_name##GradOpArgumentMapping(                             \
-      const ArgumentMappingContext& ctx) {                                      \
-    return KernelSignature(                                                     \
-        op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")});  \
+#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(            \
+      const ArgumentMappingContext& ctx) {                     \
+    return KernelSignature(op_name "_grad",                    \
+                           {"Out", GradVarName("Out")},        \
+                           {attrs},                            \
+                           {GradVarName("X")});                \
   }
 
+#define comma ,
+
+DefineActGradDepXOpArgMap(Cos, "cos", );      // NOLINT
+DefineActGradDepXOpArgMap(Tan, "tan", );      // NOLINT
+DefineActGradDepXOpArgMap(Acos, "acos", );    // NOLINT
+DefineActGradDepXOpArgMap(Sin, "sin", );      // NOLINT
+DefineActGradDepXOpArgMap(Asin, "asin", );    // NOLINT
+DefineActGradDepXOpArgMap(Atan, "atan", );    // NOLINT
+DefineActGradDepXOpArgMap(Sinh, "sinh", );    // NOLINT
+DefineActGradDepXOpArgMap(Cosh, "cosh", );    // NOLINT
+DefineActGradDepXOpArgMap(Asinh, "asinh", );  // NOLINT
+DefineActGradDepXOpArgMap(Acosh, "acosh", );  // NOLINT
+DefineActGradDepXOpArgMap(Atanh, "atanh", );  // NOLINT
+DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max");  // NOLINT
+DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha");       // NOLINT
+DefineActGradDepXOpArgMap(ThresholdedRelu,
+                          "thresholded_relu",
+                          "threshold");  // NOLINT
+
+DefineActGradDepOutOpArgMap(Relu, "relu", );  // NOLINT
+DefineActGradDepOutOpArgMap(Tanh, "tanh", );  // NOLINT
+
 KernelSignature ReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"});
 }
 
-DefineActGradDepXOpArgMap(Cos, "cos");
-DefineActGradDepXOpArgMap(Tan, "tan");
-DefineActGradDepXOpArgMap(Acos, "acos");
-DefineActGradDepXOpArgMap(Sin, "sin");
-DefineActGradDepXOpArgMap(Asin, "asin");
-DefineActGradDepXOpArgMap(Atan, "atan");
-DefineActGradDepXOpArgMap(Sinh, "sinh");
-DefineActGradDepXOpArgMap(Cosh, "cosh");
-DefineActGradDepXOpArgMap(Asinh, "asinh");
-DefineActGradDepXOpArgMap(Acosh, "acosh");
-DefineActGradDepXOpArgMap(Atanh, "atanh");
-DefineActGradDepOutOpArgMap(Relu, "relu");
+KernelSignature TanhDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature TanhTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("tanh_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
+KernelSignature LeakyReluDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"});
+}
+
+KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
@@ -65,3 +105,16 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
                            phi::ReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad,
+                           phi::TanhDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad,
+                           phi::TanhTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad,
+                           phi::LeakyReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
+                           phi::LeakyReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
+                           phi::ThresholdedReluGradOpArgumentMapping);
diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py
index b8c4232076c50ea1f02d904c81c408efef59776b..0680e87b38b3f6c29e7f813474d947598912437d 100644
--- a/tools/infrt/get_compat_kernel_signature.py
+++ b/tools/infrt/get_compat_kernel_signature.py
@@ -58,8 +58,9 @@ def get_compat_kernels_info():
             content += line
             if (registry and ";" in line):
                 data = content.replace("\n", "").replace(
-                    " ", "").strip("return").strip(
-                        "KernelSignature(").strip("\);").replace("\"", "")
+                    " ",
+                    "").strip("return").strip("KernelSignature(").strip(
+                        "\);").replace("\"", "").replace("\\", "")
                 registry = False
                 name, registry_info = parse_compat_registry(data)
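For reference, the DefineActGradDepXOpArgMap macro registered above expands, for leaky_relu, into an argument-mapping function that pairs the fluid operator's X and Out@GRAD variables plus its "alpha" attribute with the phi leaky_relu_grad kernel. The sketch below hand-expands that mapping with simplified stand-ins (MiniKernelSignature and a local GradVarName helper) instead of the real phi::KernelSignature and compat types.

// Illustrative, self-contained sketch of what DefineActGradDepXOpArgMap
// roughly generates for LeakyRelu, using simplified stand-in types.
#include <iostream>
#include <string>
#include <vector>

struct MiniKernelSignature {
  std::string kernel_name;
  std::vector<std::string> inputs;
  std::vector<std::string> attrs;
  std::vector<std::string> outputs;
};

// In fluid, the gradient of a variable named "Out" is "Out@GRAD".
static std::string GradVarName(const std::string& name) {
  return name + "@GRAD";
}

// Hand-expanded counterpart of
//   DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha");
// the grad kernel reads X and Out@GRAD, forwards the "alpha" attribute,
// and writes X@GRAD.
MiniKernelSignature LeakyReluGradOpArgumentMappingSketch() {
  return {"leaky_relu_grad",
          {"X", GradVarName("Out")},
          {"alpha"},
          {GradVarName("X")}};
}

int main() {
  auto sig = LeakyReluGradOpArgumentMappingSketch();
  std::cout << sig.kernel_name << ": " << sig.inputs.size() << " inputs, "
            << sig.attrs.size() << " attr(s), " << sig.outputs.size()
            << " output(s)\n";
}

The DepOut variant differs only in reading Out instead of X, which is why tanh_grad, whose backward needs the forward output rather than the input, is generated through DefineActGradDepOutOpArgMap.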