Unverified commit 54e4360a, authored by zhangyuqin1998, committed by GitHub

delete unused param from swish_grad and relu6_grad (#52805)

Parent: b1bb7484
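This change removes the threshold attribute from relu6_grad and the beta attribute from swish_grad across the op specs, kernel signatures, argument mappings, and functors on every backend touched below (CPU, GPU, OneDNN, XPU, sparse). As the forward specs in the YAML below show, relu6 and swish take only Tensor x, so the backward attributes were effectively fixed constants; the updated functors simply hard-code threshold = 6 and beta = 1.0 locally. A minimal standalone sketch of the gradient math those functors compute, with the now-fixed constants baked in (illustration only, not Paddle API):

#include <cmath>

// relu6 backward: pass the upstream gradient through only where the
// forward output lies strictly inside (0, threshold), threshold fixed at 6.
float relu6_grad(float out, float dout) {
  const float threshold = 6.0f;
  return (out > 0.0f && out < threshold) ? dout : 0.0f;
}

// swish backward with beta fixed at 1.0:
// d/dx [x * sigmoid(beta*x)] = sigmoid(beta*x) * (1 + beta*x * (1 - sigmoid(beta*x)))
float swish_grad(float x, float dout) {
  const float beta = 1.0f;
  const float sig = 1.0f / (1.0f + std::exp(-beta * x));
  return dout * sig * (1.0f + beta * x * (1.0f - sig));
}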
......@@ -801,7 +801,7 @@
- backward_op : relu6_grad
forward : relu6 (Tensor x) -> Tensor(out)
- args : (Tensor out, Tensor out_grad, float threshold = 6)
+ args : (Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
......@@ -1010,7 +1010,7 @@
- backward_op : swish_grad
forward : swish (Tensor x) -> Tensor(out)
- args : (Tensor x, Tensor out_grad, float bete=1.0)
+ args : (Tensor x, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
......
......@@ -252,7 +252,7 @@
- backward_op : relu6_grad
forward : relu6(Tensor x) -> Tensor(out)
- args : (Tensor out, Tensor out_grad, float threshold = 6)
+ args : (Tensor out, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
......
......@@ -285,6 +285,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
+ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Swish);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1);
......@@ -294,6 +295,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt);
+ DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6);
DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round);
DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor);
......@@ -303,11 +305,9 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
- DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold);
DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha);
- DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6, threshold);
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
......
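Swish moves into the plain DEPX group (its gradient reads the forward input x) and Relu6 into the plain DEPOUT group (its gradient reads the forward output), while the WITH_ONE_ATTRS declarations that carried beta/threshold are dropped. A hedged sketch of what the two declaration forms presumably expand to; the macro bodies are not part of this diff, but the parameter lists match the OneDNN kernel definitions further down (the old form mirrors the OneDNN Relu6GradKernel before this change, the new form mirrors the hand-written OneDNN SwishGradKernel added here):

// Before: DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta)
// presumably declared an extra float attribute parameter:
template <typename T, typename Context>
void SwishGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& dout,
                     float beta,
                     DenseTensor* dx);

// After: DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Swish) declares only the tensors:
template <typename T, typename Context>
void SwishGradKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& dout,
                     DenseTensor* dx);

The DEPOUT form used for Relu6 is analogous, taking the forward output out in place of x.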
......@@ -136,12 +136,14 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
+ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, Relu6GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, SoftsignGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
+ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, SwishGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
......@@ -157,16 +159,12 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedReluGradFunctor,
threshold);
- DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
- Relu6GradFunctor,
- threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
SoftShrinkGradFunctor,
lambda);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
HardShrinkGradFunctor,
threshold);
- DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
MishGradFunctor,
......
......@@ -1505,16 +1505,14 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
- float threshold;
- typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"threshold", &threshold}};
- }
+ typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+ float threshold = 6;
dx.device(d) =
dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
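Note that both comparisons in the functor above are strict, so with the hard-coded threshold the gradient is zero not only outside (0, 6) but also exactly at out == 0 and out == 6. A tiny self-contained check of that boundary behaviour (illustration only):

#include <cstdio>

int main() {
  // Mirrors the Relu6GradFunctor body with threshold fixed at 6.
  auto relu6_grad = [](float out, float dout) {
    return (out > 0.0f && out < 6.0f) ? dout : 0.0f;
  };
  std::printf("%g %g %g\n",
              relu6_grad(3.0f, 1.0f),   // 1: strictly inside (0, 6)
              relu6_grad(6.0f, 1.0f),   // 0: exactly at the threshold
              relu6_grad(0.0f, 1.0f));  // 0: exactly at zero
  return 0;
}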
......@@ -2188,10 +2186,7 @@ struct SwishFunctor : public BaseActivationFunctor<T> {
template <typename T>
struct SwishGradFunctor : public BaseActivationFunctor<T> {
- float beta;
- typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"beta", &beta}};
- }
+ typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
template <typename Device,
typename X,
......@@ -2199,6 +2194,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
typename dOut,
typename dX>
void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const {
+ float beta = 1.0;
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto out = x * temp1;
......@@ -3285,14 +3281,12 @@ struct CudaRelu6Functor : public BaseActivationFunctor<T> {
template <typename T>
struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
- float threshold;
- typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"threshold", &threshold}};
- }
+ typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
// dx = (out > 0 && out < t) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
+ float threshold = 6;
T t = static_cast<T>(threshold);
return (out > zero && out < t) ? dout : zero;
}
......@@ -3781,15 +3775,13 @@ template <typename T>
struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
- float beta;
- typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"beta", &beta}};
- }
+ typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
// dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2)
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
+ float beta = 1.0;
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType b = static_cast<MPType>(beta);
......
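With beta pinned at 1.0, swish reduces to SiLU, and the gradient the CUDA functor computes (see the comment above), dout times (1 + exp(-x) + x*exp(-x)) / (1 + exp(-x))^2, is algebraically the same expression used in the sketch near the top of this page, sigmoid(x) * (1 + x * (1 - sigmoid(x))). A small numeric check of the two forms (illustration only):

#include <cmath>
#include <cstdio>

int main() {
  const float xs[] = {-2.0f, -0.5f, 0.0f, 1.0f, 3.0f};
  for (float x : xs) {
    const float e = std::exp(-x);  // beta fixed at 1.0
    const float form_a = (1.0f + e + x * e) / ((1.0f + e) * (1.0f + e));
    const float sig = 1.0f / (1.0f + e);
    const float form_b = sig * (1.0f + x * (1.0f - sig));
    std::printf("x=%5.2f  %.6f  %.6f\n", x, form_a, form_b);  // columns agree
  }
  return 0;
}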
......@@ -198,12 +198,14 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
+ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, CudaRelu6GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, CudaSoftsignGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
+ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor,
......@@ -217,9 +219,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
CudaHardShrinkGradFunctor,
threshold);
- DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
- CudaSwishGradFunctor,
- beta);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
CudaMishGradFunctor,
......@@ -227,9 +226,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
CudaCELUGradFunctor,
alpha);
- DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
- CudaRelu6GradFunctor,
- threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA,
CudaLogitGradFunctor,
eps);
......
......@@ -204,9 +204,16 @@ DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
MishOneDNNGradFunctor,
threshold);
- DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
- SwishOneDNNGradFunctor,
- beta);
+ template <typename T, typename Context>
+ void SwishGradKernel(const Context& dev_ctx,
+ const DenseTensor& x,
+ const DenseTensor& dout,
+ DenseTensor* dx) {
+ SwishOneDNNGradFunctor<T> functor;
+ float beta = 1.0;
+ functor(dev_ctx, x, dout, beta, 0, dx);
+ }
template <typename T, typename Context>
void EluGradKernel(const Context& dev_ctx,
......@@ -247,9 +254,9 @@ template <typename T, typename Context>
void Relu6GradKernel(const Context& dev_ctx,
const DenseTensor& out,
const DenseTensor& dout,
- float threshold,
DenseTensor* dx) {
Relu6OneDNNGradUseOutFunctor<T> functor;
+ float threshold = 6;
functor(dev_ctx, out, dout, 0, threshold, dx);
}
......
......@@ -94,9 +94,9 @@ DEFINE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu)
DEFINE_SPARSE_UNARY_GRAD_KERNEL(Abs)
DEFINE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+ DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
- DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)
template <typename T, typename Context>
void CastCooGradKernel(const Context& dev_ctx,
......
......@@ -62,9 +62,9 @@ DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt)
DECLARE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
DECLARE_SPARSE_UNARY_GRAD_KERNEL(Abs)
DECLARE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+ DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
- DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)
template <typename T, typename Context>
void CastCooGradKernel(const Context& dev_ctx,
......
......@@ -351,10 +351,7 @@ struct XPUReluGradFunctor : public funcs::BaseActivationFunctor<T> {
template <typename T>
struct XPURelu6GradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
- float threshold;
- typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"threshold", &threshold}};
- }
+ typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
template <typename Context>
void operator()(const Context& dev_ctx,
const DenseTensor* x,
......@@ -481,10 +478,7 @@ void PowGradKernel(const Context& dev_ctx,
template <typename T>
struct XPUSwishGradFunctor : public funcs::BaseActivationFunctor<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
- float beta;
- typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
- return {{"beta", &beta}};
- }
+ typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
template <typename Context>
void operator()(const Context& dev_ctx,
......@@ -571,14 +565,13 @@ DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, XPUTanhGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, XPUReluGradFunctor);
+ DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, XPURelu6GradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, XPUSiluGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, XPULogGradFunctor);
DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, XPUSquareGradFunctor);
+ DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, XPUSwishGradFunctor);
- DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
- XPUSwishGradFunctor,
- beta);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
XPUMishGradFunctor,
threshold);
......@@ -586,10 +579,6 @@ DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
XPULeakyReluGradFunctor,
alpha);
- DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
- XPURelu6GradFunctor,
- threshold);
DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
XPUSoftPlusGradFunctor,
beta,
......
......@@ -41,8 +41,14 @@ namespace phi {
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardTanh, "hardtanh", "t_min" comma "t_max");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
- DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta"); // NOLINT
- DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold"); // NOLINT
+ KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+ return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"});
+ }
+ KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+ return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+ }
KernelSignature HardSwishGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
......
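For comparison, the removed OP_ARGMAP macro lines presumably generated mappings that still forwarded the attribute from the legacy operator definition, roughly as follows (a hedged reconstruction; the macro bodies are not part of this diff). The hand-written replacements above drop the attribute slot so the signatures match the new parameter-free kernels:

// Presumed expansion of the removed macro invocations (sketch only):
KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {"beta"}, {"X@GRAD"});
}
KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {"threshold"}, {"X@GRAD"});
}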
......@@ -16,7 +16,6 @@ import unittest
import numpy as np
from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
- from scipy.special import expit
import paddle
import paddle.nn.functional as F
......@@ -416,30 +415,6 @@ class TestMKLDNNAbsDim4(TestAbs):
self.dtype = np.float32
- class TestMKLDNNSwishDim4(TestSwish):
- def setUp(self):
- super().setUp()
- x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(self.dtype)
- beta = 2.3
- out = x * expit(beta * x)
- self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
- self.outputs = {'Out': out}
- self.attrs = {"use_mkldnn": True, "beta": beta}
- def init_dtype(self):
- self.dtype = np.float32
- def test_check_output(self):
- self.check_output(check_dygraph=False)
- def test_check_grad(self):
- if self.dtype == np.float16:
- return
- self.check_grad(['X'], 'Out', check_dygraph=False)
def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
x_dtype = x.dtype
if x_dtype == 'float16':
......
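The TestMKLDNNSwishDim4 case removed above exercised beta = 2.3, which the fixed-beta (1.0) swish path presumably can no longer express, hence its deletion along with the now-unused expit import. The two settings give genuinely different values; a quick reference check using the same formula as the deleted test, out = x * sigmoid(beta * x) (illustration only):

#include <cmath>
#include <cstdio>

int main() {
  auto swish = [](float x, float beta) {
    return x / (1.0f + std::exp(-beta * x));  // x * sigmoid(beta * x)
  };
  std::printf("swish(1, beta=2.3) = %.4f\n", swish(1.0f, 2.3f));  // ~0.9089
  std::printf("swish(1, beta=1.0) = %.4f\n", swish(1.0f, 1.0f));  // ~0.7311
  return 0;
}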