Unverified · Commit 3e636ec9, authored by zhangyuqin1998, committed by GitHub

Delete hardswish_raw op (#51634)

* Delete hardswish_raw op

* fix unit tests
Parent: bcec0dce
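Summary of the change: the attribute-taking hardswish_raw kernel is removed, and the attribute-free hardswish kernels now hardcode the former default attributes (threshold = 6, scale = 6, offset = 3) on every backend touched below (CPU, GPU, XPU, OneDNN). For reference, a minimal standalone sketch of the function those defaults produce (plain C++, not Paddle code):

    #include <algorithm>
    #include <cstdio>

    // hardswish(x) = x * min(max(x + offset, 0), threshold) / scale; with the
    // defaults baked in below (threshold = 6, scale = 6, offset = 3) this is
    // the usual hardswish(x) = x * relu6(x + 3) / 6.
    float hardswish(float x) {
      return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
    }

    int main() {
      for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
        std::printf("hardswish(%g) = %g\n", x, hardswish(x));
      }
      return 0;
    }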
@@ -532,7 +532,7 @@

 - backward_op : hardswish_grad
   forward : hardswish (Tensor x) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float threshold = 6.0, float scale = 6.0, float offset = 3.0)
+  args : (Tensor x, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta

@@ -225,9 +225,6 @@ template <typename T, typename Context>
 void HardSwishGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& dout,
-                         float threshold,
-                         float scale,
-                         float offset,
                          DenseTensor* dx);

 template <typename T, typename Context>

@@ -19,13 +19,6 @@
 namespace phi {

-template <typename T, typename Context>
-void HardSwishKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     DenseTensor* out) {
-  HardSwishRawKernel<T, Context>(dev_ctx, x, 6, 6, 3, out);
-}
-
 template <typename T, typename Context>
 void Relu6Kernel(const Context& dev_ctx,
                  const DenseTensor& x,

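The deleted wrapper above shows the old layering, restated here as comments (names taken from the diff):

    // Before: the public kernel forwarded hardcoded defaults to the raw kernel:
    //   HardSwishKernel(ctx, x, out)  ->  HardSwishRawKernel(ctx, x, 6, 6, 3, out)
    // After this commit the raw kernel itself is renamed HardSwishKernel and owns
    // those defaults, so this wrapper (and this file's hardswish registrations,
    // deleted in the hunks below) is no longer needed.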
@@ -44,21 +37,10 @@ void SwishKernel(const Context& dev_ctx,
 using complex64 = ::phi::dtype::complex<float>;
 using complex128 = ::phi::dtype::complex<double>;

-PD_REGISTER_KERNEL(
-    hardswish, CPU, ALL_LAYOUT, phi::HardSwishKernel, float, double) {}
 PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {}
 PD_REGISTER_KERNEL(swish, CPU, ALL_LAYOUT, phi::SwishKernel, float, double) {}

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(hardswish,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::HardSwishKernel,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(relu6,
                    GPU,
                    ALL_LAYOUT,

@@ -80,18 +62,11 @@ PD_REGISTER_KERNEL(swish,
 #endif

 #if defined PADDLE_WITH_XPU
-PD_REGISTER_KERNEL(hardswish, XPU, ALL_LAYOUT, phi::HardSwishKernel, float) {}
 PD_REGISTER_KERNEL(relu6, XPU, ALL_LAYOUT, phi::Relu6Kernel, float) {}
 PD_REGISTER_KERNEL(swish, XPU, ALL_LAYOUT, phi::SwishKernel, float) {}
 #endif

 #ifdef PADDLE_WITH_MKLDNN
-PD_REGISTER_KERNEL(hardswish,
-                   OneDNN,
-                   ONEDNN,
-                   phi::HardSwishKernel,
-                   float,
-                   phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(
     relu6, OneDNN, ONEDNN, phi::Relu6Kernel, float, phi::dtype::bfloat16) {}
 PD_REGISTER_KERNEL(

@@ -90,14 +90,6 @@ DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)

-template <typename T, typename Context>
-void HardSwishRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        float threshold,
-                        float scale,
-                        float offset,
-                        DenseTensor* out);
-
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
                      const DenseTensor& x,

@@ -226,11 +226,11 @@ template <typename T, typename Context>
 void HardSwishGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& dout,
-                         float threshold,
-                         float scale,
-                         float offset,
                          DenseTensor* dx) {
   funcs::HardSwishGradFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

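For the record, the gradient those constants select is piecewise; a standalone sketch assuming Paddle's usual hardswish definition (not the HardSwishGradFunctor code itself):

    // d/dx hardswish(x) with threshold = 6, scale = 6, offset = 3:
    //   x <= -3      -> 0              (flat region,      f(x) = 0)
    //   -3 < x < 3   -> (2x + 3) / 6   (quadratic region, f(x) = x * (x + 3) / 6)
    //   x >= 3       -> 1              (linear region,    f(x) = x)
    float hardswish_grad(float x, float dout) {
      if (x <= -3.0f) return 0.0f;
      if (x >= 3.0f) return dout;
      return dout * (2.0f * x + 3.0f) / 6.0f;
    }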
@@ -113,13 +113,13 @@ DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                      offset)

 template <typename T, typename Context>
-void HardSwishRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        float threshold,
-                        float scale,
-                        float offset,
-                        DenseTensor* out) {
+void HardSwishKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
   funcs::HardSwishFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

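The GetAttrs() idiom above depends on the functor exposing mutable pointers to its attribute fields; a simplified, hypothetical stand-in for that mechanism (the real funcs::HardSwishFunctor differs in detail):

    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical sketch: the functor hands out (name, pointer) pairs and the
    // kernel writes the hardcoded defaults through the pointers before launch.
    struct HardSwishFunctorSketch {
      float threshold = 0.f, scale = 0.f, offset = 0.f;
      std::vector<std::pair<std::string, float*>> GetAttrs() {
        return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}};
      }
    };

    // Usage mirroring the kernel body above:
    //   auto attrs = functor.GetAttrs();
    //   *(attrs[0].second) = 6;  // threshold
    //   *(attrs[1].second) = 6;  // scale
    //   *(attrs[2].second) = 3;  // offset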
@@ -183,7 +183,7 @@ PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
 PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hardswish_raw, HardSwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
 PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel)

@@ -274,11 +274,11 @@ template <typename T, typename Context>
 void HardSwishGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& dout,
-                         float threshold,
-                         float scale,
-                         float offset,
                          DenseTensor* dx) {
   funcs::CudaHardSwishGradFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

@@ -138,13 +138,13 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Selu, CudaSeluFunctor, scale, alpha)

 template <typename T, typename Context>
-void HardSwishRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        float threshold,
-                        float scale,
-                        float offset,
-                        DenseTensor* out) {
+void HardSwishKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
   funcs::CudaHardSwishFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

@@ -257,7 +257,7 @@ PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
 PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hardswish_raw, HardSwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)

@@ -238,9 +238,6 @@ template <typename T, typename Context>
 void HardSwishGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& dout,
-                         float threshold,
-                         float scale,
-                         float offset,
                          DenseTensor* dx) {
   HardSwishOneDNNGradFunctor<T> functor;
   functor(dev_ctx, x, dout, 0, 0, dx);

@@ -157,14 +157,11 @@ DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishOneDNNFunctor, threshold)
 DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(SwishRaw, SwishOneDNNFunctor, beta)

 template <typename T, typename Context>
-void HardSwishRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        float threshold,
-                        float scale,
-                        float offset,
-                        DenseTensor* out) {
+void HardSwishKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
   HardSwishOneDNNFunctor<T> functor;
-  functor(dev_ctx, x, threshold, 0, out);
+  functor(dev_ctx, x, 6, 0, out);
 }

 template <typename T, typename Context>

@@ -202,7 +199,7 @@ PD_REGISTER_ACTIVATION_KERNEL(abs, AbsKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel)
 PD_REGISTER_ACTIVATION_KERNEL(gelu, GeluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hardswish_raw, HardSwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(relu, ReluKernel)

@@ -603,11 +603,11 @@ template <typename T, typename Context>
 void HardSwishGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
                          const DenseTensor& dout,
-                         float threshold,
-                         float scale,
-                         float offset,
                          DenseTensor* dx) {
   XPUHardSwishGradFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

@@ -513,13 +513,13 @@ DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                             offset)

 template <typename T, typename Context>
-void HardSwishRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        float threshold,
-                        float scale,
-                        float offset,
-                        DenseTensor* out) {
+void HardSwishKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
   XPUHardSwishFunctor<T> functor;
+  float threshold = 6;
+  float scale = 6;
+  float offset = 3;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
   *(attrs[1].second) = scale;

@@ -551,7 +551,7 @@ PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel)  // no grad
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hardswish_raw, HardSwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(pow, PowKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)

@@ -41,10 +41,6 @@ namespace phi {
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardTanh, "hardtanh", "t_min" comma "t_max");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish,
-                               "hardswish",
-                               "threshold" comma "scale" comma
-                                   "offset");  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,

@@ -53,9 +49,13 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold");  // NOLINT

+KernelSignature HardSwishGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("hardswish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"});
+}
+
 KernelSignature HardSwishOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "hardswish_raw", {"X"}, {"threshold", "scale", "offset"}, {"Out"});
+  return KernelSignature("hardswish", {"X"}, {}, {"Out"});
 }

 KernelSignature SwishOpArgumentMapping(const ArgumentMappingContext& ctx) {

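A KernelSignature maps a legacy op's slots onto a PHI kernel as (kernel_name, inputs, attributes, outputs). The forward mapping before and after this commit, restated as comments (taken from the hunk above):

    // Before: the op's three attributes were forwarded to the raw kernel:
    //   KernelSignature("hardswish_raw", {"X"}, {"threshold", "scale", "offset"}, {"Out"});
    // After: the attribute list is empty, since the kernels now hardcode 6/6/3:
    //   KernelSignature("hardswish", {"X"}, {}, {"Out"});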
@@ -40,9 +40,9 @@ class TrtConvertHardSwishTest(TrtLayerAutoScanTest):
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 3, 32, 32]).astype(np.float32)

-        for threshold in [6.0, 7.0, 100.0, 0.0, -1.0]:
-            for scale in [5.0, 7.0, -1.0, 0.0, 100.0]:
-                for offset in [3.0, 5.0, -1.0, 0.0, 100.0]:
+        for threshold in [6.0]:
+            for scale in [6.0]:
+                for offset in [3.0]:
                     dics = [
                         {
                             "threshold": threshold,