未验证 提交 a9bba5ba 编写于 作者: C chentianyu03 提交者: GitHub

[phi]add relu6 kernel and yaml (#43549)

* add relu6 kernel and yaml

* format files

* format code and fix bug

* fix build failed
上级 2616d51a
......@@ -56,4 +56,4 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
USE_OP_ITSELF(relu);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP(relu6);
USE_OP_ITSELF(relu6);
......@@ -1503,6 +1503,7 @@ REGISTER_ACTIVATION_OP(thresholded_relu,
ThresholdedRelu,
ThresholdedReluFunctor,
ThresholdedReluGradFunctor);
REGISTER_ACTIVATION_OP(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
REGISTER_ACTIVATION_OP(hard_shrink,
HardShrink,
HardShrinkFunctor,
......
......@@ -281,6 +281,7 @@ USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(Relu6)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
USE_PHI_FUNCTOR(HardShrink)
......@@ -348,44 +349,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// relu6(x) = min(max(0, x), 6)
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct SoftReluFunctor : public BaseActivationFunctor<T> {
float threshold;
......@@ -561,5 +524,4 @@ struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
#define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
__macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);
......@@ -75,6 +75,7 @@ DECLARE_ACTIVATION_KERNEL(Negative)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
......
......@@ -156,6 +156,9 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
Relu6GradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
SoftShrinkGradFunctor,
lambda);
......@@ -263,6 +266,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
......
......@@ -95,6 +95,7 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
ThresholdedReluFunctor,
threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, Relu6Functor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
......@@ -147,6 +148,7 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
......
......@@ -1097,6 +1097,44 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// relu6(x) = min(max(0, x), 6)
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
......@@ -2712,6 +2750,41 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaRelu6Functor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// relu6(x) = min(max(0, x), 6)
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return x <= zero ? zero : (x < t ? x : t);
}
};
template <typename T>
struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (out > 0 && out < t) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
T t = static_cast<T>(threshold);
return (out > zero && out < t) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
......
......@@ -223,6 +223,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
CudaCELUGradFunctor,
alpha);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
CudaRelu6GradFunctor,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
CudaBReluGradFunctor,
......@@ -348,6 +351,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
......
......@@ -111,6 +111,7 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
CudaThresholdedReluFunctor,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, CudaRelu6Functor, threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
CudaHardShrinkFunctor,
threshold)
......@@ -192,6 +193,7 @@ PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
......
......@@ -82,14 +82,15 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus,
"softplus",
"beta" comma "threshold"); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Exp, "exp", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold"); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
"hard_sigmoid",
......@@ -282,6 +283,7 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
phi::LeakyReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
phi::ThresholdedReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(softshrink_grad,
phi::SoftShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad,
......
......@@ -1737,6 +1737,7 @@ class TestRelu6(TestActivation):
def setUp(self):
self.op_type = "relu6"
self.init_dtype()
self.python_api = paddle.nn.functional.relu6
np.random.seed(1024)
x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype)
......@@ -1750,7 +1751,7 @@ class TestRelu6(TestActivation):
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out')
self.check_grad(['X'], 'Out', check_eager=True)
class TestRelu6API(unittest.TestCase):
......
......@@ -860,6 +860,8 @@ def relu6(x, name=None):
out = F.relu6(x) # [0, 0.3, 6]
"""
threshold = 6.0
if in_dygraph_mode():
return _C_ops.final_state_relu6(x, threshold)
if in_dynamic_mode():
return _C_ops.relu6(x, 'threshold', threshold)
......
......@@ -1713,6 +1713,16 @@
inplace : (x -> out)
backward : relu_grad
- api : relu6
args : (Tensor x, float threshold)
output : Tensor
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : relu6
backward : relu6_grad
- api : reshape
args : (Tensor x, IntArray shape)
output : Tensor(out), Tensor(xshape)
......
......@@ -1565,6 +1565,17 @@
kernel :
func : prod_grad
- backward_api : relu6_grad
forward : relu6 (Tensor x, float threshold) -> Tensor(out)
args : (Tensor out, Tensor out_grad, float threshold)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [out]
kernel :
func : relu6_grad
inplace : (out_grad -> x_grad)
- backward_api : relu_double_grad
forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
args : (Tensor out, Tensor grad_x_grad)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册