Unverified commit a9bba5ba, authored by chentianyu03 and committed by GitHub

[phi] add relu6 kernel and yaml (#43549)

* add relu6 kernel and yaml

* format files

* format code and fix bug

* fix build failure
Parent: 2616d51a
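For reference, the kernels moved in this commit implement relu6(x) = min(max(0, x), threshold), with the Python API fixing threshold at 6.0, and the backward pass propagates the incoming gradient only where 0 < out < threshold (only the forward output is needed, hence kDepOut). Below is a minimal NumPy sketch of these semantics; it is an illustrative reference only, not the actual phi kernel code, and the helper names are made up for this example.

import numpy as np

def relu6_ref(x, threshold=6.0):
    # relu6(x) = min(max(0, x), threshold); the Python API uses threshold = 6.0
    return np.minimum(np.maximum(x, 0.0), threshold)

def relu6_grad_ref(dout, out, threshold=6.0):
    # dx = dout where 0 < out < threshold, else 0, matching Relu6GradFunctor
    # and CudaRelu6GradFunctor (kDepOut: only the forward output is needed)
    return dout * ((out > 0.0) & (out < threshold)).astype(dout.dtype)

x = np.array([-1.0, 0.3, 6.5])
out = relu6_ref(x)                          # [0.0, 0.3, 6.0]
dx = relu6_grad_ref(np.ones_like(x), out)   # [0.0, 1.0, 0.0]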
@@ -56,4 +56,4 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
USE_OP_ITSELF(relu);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP(relu6);
USE_OP_ITSELF(relu6);
@@ -1503,6 +1503,7 @@ REGISTER_ACTIVATION_OP(thresholded_relu,
ThresholdedRelu,
ThresholdedReluFunctor,
ThresholdedReluGradFunctor);
REGISTER_ACTIVATION_OP(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
REGISTER_ACTIVATION_OP(hard_shrink,
HardShrink,
HardShrinkFunctor,
......
@@ -281,6 +281,7 @@ USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
USE_PHI_FUNCTOR(BRelu)
USE_PHI_FUNCTOR(ThresholdedRelu)
USE_PHI_FUNCTOR(Relu6)
USE_PHI_FUNCTOR(LeakyRelu)
USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
USE_PHI_FUNCTOR(HardShrink)
@@ -348,44 +349,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
template <typename T>
using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
// relu6(x) = min(max(0, x), 6)
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct SoftReluFunctor : public BaseActivationFunctor<T> {
float threshold;
@@ -561,5 +524,4 @@ struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
#define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);
__macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);
@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
namespace paddle {
@@ -67,42 +66,6 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaRelu6Functor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// relu6(x) = min(max(0, x), 6)
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return x <= zero ? zero : (x < t ? x : t);
}
};
template <typename T>
struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (out > 0 && out < t) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
T t = static_cast<T>(threshold);
return (out > zero && out < t) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaSoftsignFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
@@ -144,8 +107,8 @@ class ActivationCudaKernel
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, functor);
}
};
@@ -158,8 +121,8 @@ class ActivationGradCudaKernel
const framework::Tensor *x, *out, *d_out;
framework::Tensor* d_x = nullptr;
x = out = d_out = nullptr;
ExtractActivationGradTensor<Functor::FwdDeps()>(ctx, &x, &out, &d_out,
&d_x);
ExtractActivationGradTensor<Functor::FwdDeps()>(
ctx, &x, &out, &d_out, &d_x);
d_x->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto functor = Functor();
@@ -175,17 +138,17 @@ class ActivationGradCudaKernel
static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
// Only need forward output Out
ins.push_back(out);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, functor);
} else if (static_cast<int>(Functor::FwdDeps()) ==
static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
// Only need forward input X
ins.push_back(x);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, functor);
} else {
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, functor);
}
}
};
@@ -205,6 +168,7 @@ USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
USE_PHI_FUNCTOR(CudaRelu6)
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
@@ -252,10 +216,11 @@ using CudaELUGradNegativeAlphaFunctor =
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \
grad_functor) \
#define REGISTER_ACTIVATION_CUDA_KERNEL( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>, \
@@ -274,10 +239,11 @@ namespace plat = paddle::platform;
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>);
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \
grad_functor) \
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>, \
@@ -305,7 +271,8 @@ namespace plat = paddle::platform;
ops::grad_functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
relu6, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
relu6,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<float>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<double>>,
@@ -318,7 +285,8 @@ REGISTER_OP_CUDA_KERNEL(
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaRelu6Functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
relu6_grad, ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
relu6_grad,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaRelu6GradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
ops::CudaRelu6GradFunctor<double>>,
@@ -339,220 +307,324 @@ FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
#ifdef PADDLE_WITH_XPU_KP
REGISTER_OP_KERNEL(
brelu, KP, plat::XPUPlace,
brelu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaBReluFunctor<float>>);
REGISTER_OP_KERNEL(
brelu_grad, KP, plat::XPUPlace,
brelu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaBReluGradFunctor<float>>);
REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(ceil,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaCeilFunctor<float>>);
REGISTER_OP_KERNEL(
ceil_grad, KP, plat::XPUPlace,
ceil_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaZeroGradFunctor<float>>);
REGISTER_OP_KERNEL(
celu, KP, plat::XPUPlace,
celu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaCELUFunctor<float>>);
REGISTER_OP_KERNEL(
celu_grad, KP, plat::XPUPlace,
celu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaCELUGradFunctor<float>>);
REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(elu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaELUFunctor<float>>);
REGISTER_OP_KERNEL(
elu_grad, KP, plat::XPUPlace,
elu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaELUGradFunctor<float>>);
REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(exp,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaExpFunctor<float>>);
REGISTER_OP_KERNEL(
exp_grad, KP, plat::XPUPlace,
exp_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaExpGradFunctor<float>>);
REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(floor,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaFloorFunctor<float>>);
REGISTER_OP_KERNEL(
floor_grad, KP, plat::XPUPlace,
floor_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaZeroGradFunctor<float>>);
REGISTER_OP_KERNEL(
hard_shrink, KP, plat::XPUPlace,
hard_shrink,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
hard_shrink_grad, KP, plat::XPUPlace,
hard_shrink_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardShrinkGradFunctor<float>>);
REGISTER_OP_KERNEL(
hard_sigmoid, KP, plat::XPUPlace,
hard_sigmoid,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
hard_sigmoid_grad, KP, plat::XPUPlace,
hard_sigmoid_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(hard_swish,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSwishFunctor<float>>);
REGISTER_OP_KERNEL(
hard_swish_grad, KP, plat::XPUPlace,
hard_swish_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaHardSwishGradFunctor<float>>);
REGISTER_OP_KERNEL(
leaky_relu, KP, plat::XPUPlace,
leaky_relu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaLeakyReluFunctor<float>>);
REGISTER_OP_KERNEL(
leaky_relu_grad, KP, plat::XPUPlace,
leaky_relu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaLeakyReluGradFunctor<float>>);
REGISTER_OP_KERNEL(log, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(log,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogFunctor<float>>);
REGISTER_OP_KERNEL(
log_grad, KP, plat::XPUPlace,
log_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogGradFunctor<float>>);
REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(log1p,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLog1pFunctor<float>>);
REGISTER_OP_KERNEL(
log1p_grad, KP, plat::XPUPlace,
log1p_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLog1pGradFunctor<float>>);
REGISTER_OP_KERNEL(
logsigmoid, KP, plat::XPUPlace,
logsigmoid,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
logsigmoid_grad, KP, plat::XPUPlace,
logsigmoid_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaLogSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(
reciprocal, KP, plat::XPUPlace,
reciprocal,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaReciprocalFunctor<float>>);
REGISTER_OP_KERNEL(
reciprocal_grad, KP, plat::XPUPlace,
reciprocal_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaReciprocalGradFunctor<float>>);
REGISTER_OP_KERNEL(
relu, KP, plat::XPUPlace,
relu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaReluFunctor<float>>);
REGISTER_OP_KERNEL(
relu_grad, KP, plat::XPUPlace,
relu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
phi::funcs::CudaReluGradFunctor<float>>);
REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(relu6,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaRelu6Functor<float>>);
REGISTER_OP_KERNEL(
relu6_grad, KP, plat::XPUPlace,
relu6_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaRelu6GradFunctor<float>>);
REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(sigmoid,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
sigmoid_grad, KP, plat::XPUPlace,
sigmoid_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSigmoidGradFunctor<float>>);
REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(silu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSiluFunctor<float>>);
REGISTER_OP_KERNEL(
silu_grad, KP, plat::XPUPlace,
silu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSiluGradFunctor<float>>);
REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(soft_relu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftReluFunctor<float>>);
REGISTER_OP_KERNEL(
soft_relu_grad, KP, plat::XPUPlace,
soft_relu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftReluGradFunctor<float>>);
REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(softplus,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftplusFunctor<float>>);
REGISTER_OP_KERNEL(
softplus_grad, KP, plat::XPUPlace,
softplus_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftplusGradFunctor<float>>);
REGISTER_OP_KERNEL(
softshrink, KP, plat::XPUPlace,
softshrink,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
softshrink_grad, KP, plat::XPUPlace,
softshrink_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftShrinkGradFunctor<float>>);
REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(softsign,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftsignFunctor<float>>);
REGISTER_OP_KERNEL(
softsign_grad, KP, plat::XPUPlace,
softsign_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSoftsignGradFunctor<float>>);
REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(sqrt,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSqrtFunctor<float>>);
REGISTER_OP_KERNEL(
sqrt_grad, KP, plat::XPUPlace,
sqrt_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSqrtGradFunctor<float>>);
REGISTER_OP_KERNEL(square, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(square,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSquareFunctor<float>>);
REGISTER_OP_KERNEL(
square_grad, KP, plat::XPUPlace,
square_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSquareGradFunctor<float>>);
REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace,
REGISTER_OP_KERNEL(swish,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSwishFunctor<float>>);
REGISTER_OP_KERNEL(
swish_grad, KP, plat::XPUPlace,
swish_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaSwishGradFunctor<float>>);
REGISTER_OP_KERNEL(
thresholded_relu, KP, plat::XPUPlace,
thresholded_relu,
KP,
plat::XPUPlace,
ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaThresholdedReluFunctor<float>>);
REGISTER_OP_KERNEL(
thresholded_relu_grad, KP, plat::XPUPlace,
thresholded_relu_grad,
KP,
plat::XPUPlace,
ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
ops::CudaThresholdedReluGradFunctor<float>>);
......
@@ -75,6 +75,7 @@ DECLARE_ACTIVATION_KERNEL(Negative)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
......
@@ -156,6 +156,9 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
ThresholdedReluGradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
Relu6GradFunctor,
threshold);
DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
SoftShrinkGradFunctor,
lambda);
@@ -263,6 +266,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
......
@@ -95,6 +95,7 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
ThresholdedReluFunctor,
threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, Relu6Functor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
@@ -147,6 +148,7 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
......
@@ -1097,6 +1097,44 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// relu6(x) = min(max(0, x), 6)
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) =
x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
}
};
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
.template cast<T>();
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
@@ -2712,6 +2750,41 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaRelu6Functor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// relu6(x) = min(max(0, x), 6)
__device__ __forceinline__ T operator()(const T x) const {
T t = static_cast<T>(threshold);
return x <= zero ? zero : (x < t ? x : t);
}
};
template <typename T>
struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
// dx = (out > 0 && out < t) ? dout : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
T t = static_cast<T>(threshold);
return (out > zero && out < t) ? dout : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
......
@@ -223,6 +223,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
CudaCELUGradFunctor,
alpha);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
CudaRelu6GradFunctor,
threshold);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
CudaBReluGradFunctor,
@@ -348,6 +351,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
LeakyReluDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
ThresholdedReluGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
......
@@ -111,6 +111,7 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
CudaThresholdedReluFunctor,
threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, CudaRelu6Functor, threshold)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
CudaHardShrinkFunctor,
threshold)
@@ -192,6 +193,7 @@ PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
......
@@ -90,6 +90,7 @@ DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold"); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
"hard_sigmoid",
@@ -282,6 +283,7 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
phi::LeakyReluDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
phi::ThresholdedReluGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(softshrink_grad,
phi::SoftShrinkGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad,
......
@@ -1737,6 +1737,7 @@ class TestRelu6(TestActivation):
def setUp(self):
self.op_type = "relu6"
self.init_dtype()
self.python_api = paddle.nn.functional.relu6
np.random.seed(1024)
x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype)
@@ -1750,7 +1751,7 @@ class TestRelu6(TestActivation):
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out')
self.check_grad(['X'], 'Out', check_eager=True)
class TestRelu6API(unittest.TestCase):
......
@@ -860,6 +860,8 @@ def relu6(x, name=None):
out = F.relu6(x) # [0, 0.3, 6]
"""
threshold = 6.0
if in_dygraph_mode():
return _C_ops.final_state_relu6(x, threshold)
if in_dynamic_mode():
return _C_ops.relu6(x, 'threshold', threshold)
......
@@ -1713,6 +1713,16 @@
inplace : (x -> out)
backward : relu_grad
- api : relu6
args : (Tensor x, float threshold)
output : Tensor
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : relu6
backward : relu6_grad
- api : reshape
args : (Tensor x, IntArray shape)
output : Tensor(out), Tensor(xshape)
......
@@ -1565,6 +1565,17 @@
kernel :
func : prod_grad
- backward_api : relu6_grad
forward : relu6 (Tensor x, float threshold) -> Tensor(out)
args : (Tensor out, Tensor out_grad, float threshold)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [out]
kernel :
func : relu6_grad
inplace : (out_grad -> x_grad)
- backward_api : relu_double_grad
forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
args : (Tensor out, Tensor grad_x_grad)
......
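With the forward relu6 entry and the relu6_grad backward entry added to the api yaml files, and paddle.nn.functional.relu6 now dispatching to _C_ops.final_state_relu6 under in_dygraph_mode, the op can be exercised end to end from Python. A small usage sketch, assuming Paddle's default dygraph mode; the expected forward values follow the docstring example above, and the gradient follows the relu6_grad rule:

import paddle
import paddle.nn.functional as F

x = paddle.to_tensor([-1.0, 0.3, 6.5], dtype='float32', stop_gradient=False)
y = F.relu6(x)        # [0.0, 0.3, 6.0], as in the docstring example
y.sum().backward()
print(x.grad)         # gradient passes only where 0 < y < 6, i.e. [0.0, 1.0, 0.0]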