Unverified commit 54e4360a, authored by zhangyuqin1998, committed by GitHub

delete unused param from swish_grad and relu6_grad (#52805)

Parent: b1bb7484
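For context, a minimal NumPy sketch (not part of the patch) of the gradients these kernels compute. After this change the `threshold` and `beta` attributes are no longer passed through the grad op signatures; the kernels hard-code the defaults `threshold = 6` and `beta = 1.0`, as the functor bodies in the diff below show.

```python
import numpy as np

def relu6_grad(dout, out, threshold=6.0):
    # dx = dout where 0 < out < threshold, else 0
    return dout * ((out > 0) & (out < threshold)).astype(dout.dtype)

def swish_grad(dout, x, beta=1.0):
    # swish(x) = x * sigmoid(beta * x)
    # d/dx = sigmoid(beta*x) + beta*x*sigmoid(beta*x)*(1 - sigmoid(beta*x))
    sig = 1.0 / (1.0 + np.exp(-beta * x))
    return dout * (sig + beta * x * sig * (1.0 - sig))
```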
@@ -801,7 +801,7 @@
 - backward_op : relu6_grad
   forward : relu6 (Tensor x) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold = 6)
+  args : (Tensor out, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
@@ -1010,7 +1010,7 @@
 - backward_op : swish_grad
   forward : swish (Tensor x) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float bete=1.0)
+  args : (Tensor x, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : GeneralUnaryGradInferMeta
...
@@ -252,7 +252,7 @@
 - backward_op : relu6_grad
   forward : relu6(Tensor x) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold = 6)
+  args : (Tensor out, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
...
@@ -285,6 +285,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Swish);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1);
@@ -294,6 +295,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6);
 DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round);
 DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor);
@@ -303,11 +305,9 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
...
@@ -136,12 +136,14 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, Relu6GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, SoftsignGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, SwishGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
@@ -157,16 +159,12 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
                                                ThresholdedReluGradFunctor,
                                                threshold);
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 Relu6GradFunctor,
-                                                 threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
                                                SoftShrinkGradFunctor,
                                                lambda);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                                HardShrinkGradFunctor,
                                                threshold);
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                MishGradFunctor,
...
@@ -1505,16 +1505,14 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
 template <typename T>
 struct Relu6GradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   template <typename Device,
             typename X,
             typename Out,
             typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    float threshold = 6;
     dx.device(d) =
         dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
                    .template cast<T>();
@@ -2188,10 +2186,7 @@ struct SwishFunctor : public BaseActivationFunctor<T> {
 template <typename T>
 struct SwishGradFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   template <typename Device,
             typename X,
@@ -2199,6 +2194,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
             typename dOut,
             typename dX>
   void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const {
+    float beta = 1.0;
     auto temp1 = static_cast<T>(1) /
                  (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
     auto out = x * temp1;
@@ -3285,14 +3281,12 @@ struct CudaRelu6Functor : public BaseActivationFunctor<T> {
 template <typename T>
 struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   // dx = (out > 0 && out < t) ? dout : 0
   __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    float threshold = 6;
     T t = static_cast<T>(threshold);
     return (out > zero && out < t) ? dout : zero;
   }
@@ -3781,15 +3775,13 @@ template <typename T>
 struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
   MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2)
   __device__ __forceinline__ T operator()(const T arg_dout,
                                           const T arg_x) const {
+    float beta = 1.0;
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
...
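The functors above now bake in `threshold = 6` and `beta = 1.0`. A hypothetical sanity check (not part of the patch) comparing the closed-form swish gradient against a finite-difference estimate of the forward function, with `beta` fixed at 1.0:

```python
import numpy as np

def swish(x, beta=1.0):
    # forward: x * sigmoid(beta * x)
    return x / (1.0 + np.exp(-beta * x))

x = np.linspace(-4.0, 4.0, 17)
eps = 1e-6
numeric = (swish(x + eps) - swish(x - eps)) / (2.0 * eps)

sig = 1.0 / (1.0 + np.exp(-x))           # sigmoid with beta fixed at 1.0
analytic = sig + x * sig * (1.0 - sig)   # same result as SwishGradFunctor with dout = 1

assert np.allclose(numeric, analytic, atol=1e-5)
```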
@@ -198,12 +198,14 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, CudaRelu6GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, CudaSoftsignGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor);

 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -217,9 +219,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                                CudaHardShrinkGradFunctor,
                                                threshold);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                               CudaSwishGradFunctor,
-                                               beta);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                CudaMishGradFunctor,
@@ -227,9 +226,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
                                                CudaCELUGradFunctor,
                                                alpha);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 CudaRelu6GradFunctor,
-                                                 threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA,
                                                  CudaLogitGradFunctor,
                                                  eps);
...
@@ -204,9 +204,16 @@ DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
 DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                   MishOneDNNGradFunctor,
                                                   threshold);
-DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                                  SwishOneDNNGradFunctor,
-                                                  beta);
+
+template <typename T, typename Context>
+void SwishGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& dout,
+                     DenseTensor* dx) {
+  SwishOneDNNGradFunctor<T> functor;
+  float beta = 1.0;
+  functor(dev_ctx, x, dout, beta, 0, dx);
+}

 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
@@ -247,9 +254,9 @@ template <typename T, typename Context>
 void Relu6GradKernel(const Context& dev_ctx,
                      const DenseTensor& out,
                      const DenseTensor& dout,
-                     float threshold,
                      DenseTensor* dx) {
   Relu6OneDNNGradUseOutFunctor<T> functor;
+  float threshold = 6;
   functor(dev_ctx, out, dout, 0, threshold, dx);
 }
...
@@ -94,9 +94,9 @@ DEFINE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Abs)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
-DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)

 template <typename T, typename Context>
 void CastCooGradKernel(const Context& dev_ctx,
...
@@ -62,9 +62,9 @@ DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Abs)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
-DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)

 template <typename T, typename Context>
 void CastCooGradKernel(const Context& dev_ctx,
...
@@ -351,10 +351,7 @@ struct XPUReluGradFunctor : public funcs::BaseActivationFunctor<T> {
 template <typename T>
 struct XPURelu6GradFunctor : public funcs::BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
-  float threshold;
-  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   template <typename Context>
   void operator()(const Context& dev_ctx,
                   const DenseTensor* x,
@@ -481,10 +478,7 @@ void PowGradKernel(const Context& dev_ctx,
 template <typename T>
 struct XPUSwishGradFunctor : public funcs::BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
-  float beta;
-  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }

   template <typename Context>
   void operator()(const Context& dev_ctx,
@@ -571,14 +565,13 @@ DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, XPUTanhGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, XPUReluGradFunctor);
+DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, XPURelu6GradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, XPUSiluGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, XPULogGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, XPUSquareGradFunctor);
+DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, XPUSwishGradFunctor);

-DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                               XPUSwishGradFunctor,
-                                               beta);
 DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                XPUMishGradFunctor,
                                                threshold);
@@ -586,10 +579,6 @@ DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                XPULeakyReluGradFunctor,
                                                alpha);
-DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 XPURelu6GradFunctor,
-                                                 threshold);
 DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
                                                XPUSoftPlusGradFunctor,
                                                beta,
...
@@ -41,8 +41,14 @@ namespace phi {
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardTanh, "hardtanh", "t_min" comma "t_max");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");         // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold");  // NOLINT
+
+KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"});
+}
+
+KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+}

 KernelSignature HardSwishGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
...
@@ -16,7 +16,6 @@ import unittest
 import numpy as np
 from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
-from scipy.special import expit

 import paddle
 import paddle.nn.functional as F
@@ -416,30 +415,6 @@ class TestMKLDNNAbsDim4(TestAbs):
         self.dtype = np.float32


-class TestMKLDNNSwishDim4(TestSwish):
-    def setUp(self):
-        super().setUp()
-
-        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(self.dtype)
-        beta = 2.3
-        out = x * expit(beta * x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True, "beta": beta}
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', check_dygraph=False)
-
-
 def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
     x_dtype = x.dtype
     if x_dtype == 'float16':
...
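The deleted oneDNN test pinned `beta = 2.3`, which the op can no longer express now that the attribute is gone. A hypothetical sketch (an assumption, not part of the patch) of how such a test would build its reference output against the fixed `beta = 1.0`:

```python
import numpy as np
from scipy.special import expit  # sigmoid

x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(np.float32)
out = x * expit(1.0 * x)  # swish reference output with beta fixed at 1.0
```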