diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index dd315dcd0a32d1ae32dcacb8ad3c45de0923ff06..ba580f4097e3324e6dc1543dc4a9be70cdc638e7 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -100,7 +100,7 @@ class CompareOp : public framework::OperatorWithKernel {
   char _##op_type##Comment::equation[]{_equation};                      \
   DECLARE_INFER_SHAPE_FUNCTOR(op_type,                                  \
                               op_type##_InferShapeFunctor,              \
-                              PD_INFER_META(phi::CompareInferMeta));    \
+                              PD_INFER_META(phi::CompareRawInferMeta)); \
   REGISTER_OPERATOR(                                                    \
       op_type,                                                          \
       ::paddle::operators::CompareOp<_##op_type##Comment>,              \
diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2
index 737c6ccc69fb19b8f52dfb290ef0df0391f9dcbb..1fef16fc23462d4cdc1274778bdd7a774ae3a454 100644
--- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2
+++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2
@@ -400,7 +400,6 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker<T>
     grad_op->SetInput("{{attr_name | to_pascal_case}}Tensor", this->Input("{{attr_name | to_pascal_case}}Tensor"));
   {% endif %}
 {% else %}{# maybe something wrong: backward op has more attrs than the forward one#}
-    grad_op->AddAttr<{{attr["typename"] | to_op_attr_type}}>({{attr_name}}, "({{attr["typename"] | to_op_attr_type}}), exceptional attr {{attr_name}}");
     grad_op->SetAttr("{{attr_name}}", {{process_default_value(attr)}});
   {% endif %}
 {% endfor %}
diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc
index 4276c5ca0e4ded3a3ef2d8141814e160cc8298d9..6c7d974e7042226c8c87847ba36f8a0afb931202 100644
--- a/paddle/fluid/pybind/eager_math_op_patch.cc
+++ b/paddle/fluid/pybind/eager_math_op_patch.cc
@@ -841,7 +841,7 @@ static PyObject* tensor__gt__method(TensorObject* self,
   VLOG(6) << "Calling greater_than_ad_func in tensor__gt__method";
   {
     eager_gil_scoped_release guard;
-    ret = greater_than_ad_func(self_tensor, other_tensor, -1);
+    ret = greater_than_ad_func(self_tensor, other_tensor);
   }
 
   return ToPyObject(ret);
@@ -927,7 +927,7 @@ static PyObject* tensor__ge__method(TensorObject* self,
   VLOG(6) << "Calling greater_equal_ad_func in tensor__ge__method";
   {
     eager_gil_scoped_release guard;
-    ret = greater_equal_ad_func(self_tensor, other_tensor, -1);
+    ret = greater_equal_ad_func(self_tensor, other_tensor);
   }
 
   return ToPyObject(ret);
@@ -1204,7 +1204,7 @@ static PyObject* tensor__lt__method(TensorObject* self,
   VLOG(6) << "Calling less_than_ad_func in tensor__lt__method";
   {
     eager_gil_scoped_release guard;
-    ret = less_than_ad_func(self_tensor, other_tensor, -1);
+    ret = less_than_ad_func(self_tensor, other_tensor);
   }
 
   return ToPyObject(ret);
@@ -1290,7 +1290,7 @@ static PyObject* tensor__le__method(TensorObject* self,
   VLOG(6) << "Calling less_equal_ad_func in tensor__le__method";
   {
     eager_gil_scoped_release guard;
-    ret = less_equal_ad_func(self_tensor, other_tensor, -1);
+    ret = less_equal_ad_func(self_tensor, other_tensor);
   }
 
   return ToPyObject(ret);
@@ -1636,7 +1636,7 @@ static PyObject* tensor__ne__method(TensorObject* self,
   VLOG(6) << "Calling not_equal_ad_func in tensor__ne__method";
   {
     eager_gil_scoped_release guard;
-    ret = not_equal_ad_func(self_tensor, other_tensor, -1);
+    ret = not_equal_ad_func(self_tensor, other_tensor);
  }
 
   return ToPyObject(ret);
@@ -1722,7 +1722,7 @@ static PyObject* tensor__eq__method(TensorObject* self,
   VLOG(6) << "Calling equal_ad_func in tensor__eq__method";
   {
     eager_gil_scoped_release guard;
-    ret = equal_ad_func(self_tensor, other_tensor, -1);
+    ret = equal_ad_func(self_tensor, other_tensor);
   }
 
   return ToPyObject(ret);
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 0cf0555f9ef3e392e92f90389388c184d0a74e7e..4b9920f2dce24448139c3b4c08d6b57809a23ee5 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -67,7 +67,7 @@
     func : addmm_grad
 
 - backward_op : affine_grid_grad
-  forward : affine_grid (Tensor input, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) -> Tensor(output)
+  forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) -> Tensor(output)
   args : (Tensor output_grad, IntArray outputShape, bool use_cudnn=true, bool align_corners=true)
   output : Tensor(input_grad)
   infer_meta :
@@ -577,8 +577,8 @@
   inplace : (out_grad -> x_grad)
 
 - backward_op : fmax_grad
-  forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out)
-  args : (Tensor x, Tensor y, Tensor out_grad, int axis)
+  forward : fmax(Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
   output : Tensor(x_grad), Tensor(y_grad)
   infer_meta :
     func : GeneralBinaryGradInferMeta
@@ -587,8 +587,8 @@
     func : fmax_grad
 
 - backward_op : fmin_grad
-  forward : fmin(Tensor x, Tensor y, int axis) -> Tensor(out)
-  args : (Tensor x, Tensor y, Tensor out_grad, int axis)
+  forward : fmin(Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
   output : Tensor(x_grad), Tensor(y_grad)
   infer_meta :
     func : GeneralBinaryGradInferMeta
@@ -684,8 +684,8 @@
     func : gumbel_softmax_grad
 
 - backward_op : hardswish_grad
-  forward : hardswish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float threshold, float scale, float offset)
+  forward : hardswish (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, float threshold = 6.0, float scale = 6.0, float offset = 3.0)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
@@ -1418,8 +1418,8 @@
     invoke : real_grad_impl(out_grad, x_grad)
 
 - backward_op : relu6_grad
-  forward : relu6 (Tensor x, float threshold) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold)
+  forward : relu6 (Tensor x) -> Tensor(out)
+  args : (Tensor out, Tensor out_grad, float threshold = 6)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
@@ -1810,7 +1810,7 @@
     optional: u_grad, vh_grad, s_grad
 
 - backward_op : swish_grad
-  forward : swish (Tensor x, float beta=1.0) -> Tensor(out)
+  forward : swish (Tensor x) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, float bete=1.0)
   output : Tensor(x_grad)
   infer_meta :
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index 674726b549fb689e6580457498dc820808646274..5d54b5c940bdb6bd1feeaea69f912efe311d08e7 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -97,7 +97,7 @@
   backward : addmm_grad
 
 - op : affine_grid
-  args : (Tensor input, IntArray outputShape, bool use_cudnn=true, bool align_corners=true)
+  args : (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true)
   output : Tensor
   infer_meta :
     func : AffineGridInferMeta
@@ -649,7 +649,7 @@
     backend : place > x
 
 - op : equal
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -751,7 +751,7 @@
     func : floor_divide
 
 - op : fmax
-  args : (Tensor x, Tensor y, int axis)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     param: [x, y]
@@ -761,7 +761,7 @@
   backward : fmax_grad
 
 - op : fmin
-  args : (Tensor x, Tensor y, int axis)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     param: [x, y]
@@ -898,7 +898,7 @@
     func : generate_proposals_v2
 
 - op : greater_equal
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -906,7 +906,7 @@
     func : greater_equal
 
 - op : greater_than
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -945,7 +945,7 @@
   backward : gumbel_softmax_grad
 
 - op : hardswish
-  args : (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0)
+  args : (Tensor x)
   output : Tensor
   infer_meta :
     func : UnchangedInferMeta
@@ -1180,7 +1180,7 @@
   backward : lerp_grad
 
 - op : less_equal
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -1188,7 +1188,7 @@
     func : less_equal
 
 - op : less_than
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -1623,7 +1623,7 @@
   backward : norm_grad
 
 - op : not_equal
-  args : (Tensor x, Tensor y, int axis = -1)
+  args : (Tensor x, Tensor y)
   output : Tensor(out)
   infer_meta :
     func : CompareInferMeta
@@ -1820,7 +1820,7 @@
   backward : real_grad
 
 - op : relu6
-  args : (Tensor x, float threshold)
+  args : (Tensor x)
   output : Tensor
   infer_meta :
     func : UnchangedInferMeta
@@ -2192,9 +2192,8 @@
     func : svd
   backward : svd_grad
 
-# The python API paddle.nn.functional.swish has no `bete` argument, it may be removed later
 - op : swish
-  args : (Tensor x, float beta=1.0)
+  args : (Tensor x)
   output : Tensor(out)
   infer_meta :
     func : UnchangedInferMeta
diff --git a/paddle/phi/api/yaml/sparse_backward.yaml b/paddle/phi/api/yaml/sparse_backward.yaml
index 54dec292efb9d833802777497b0b99155e3845a4..3e654210b9008280e12a9429c17a952991e16002 100644
--- a/paddle/phi/api/yaml/sparse_backward.yaml
+++ b/paddle/phi/api/yaml/sparse_backward.yaml
@@ -251,8 +251,8 @@
            pow_csr_grad {sparse_csr, sparse_csr -> sparse_csr}
 
 - backward_op : relu6_grad
-  forward : relu6(Tensor x, float threshold) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold)
+  forward : relu6(Tensor x) -> Tensor(out)
+  args : (Tensor out, Tensor out_grad, float threshold = 6)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
diff --git a/paddle/phi/api/yaml/sparse_ops.yaml b/paddle/phi/api/yaml/sparse_ops.yaml
index ea35737a704b83d7e3c5ee79445a6971f4a9d10c..545042b6f073e447b4c6e7e342c0be82e791e3b6 100644
--- a/paddle/phi/api/yaml/sparse_ops.yaml
+++ b/paddle/phi/api/yaml/sparse_ops.yaml
@@ -213,7 +213,7 @@
   backward : relu_grad
 
 - op : relu6
-  args : (Tensor x, float threshold)
+  args : (Tensor x)
   output : Tensor(out)
   infer_meta :
     func : UnchangedInferMeta
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 578bfc37cdf68311d8d5d73611898afd5f6168f5..466a60be250a02e34e561144823b39ee706f153a 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -328,10 +328,10 @@ void CholeskySolveInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }
 
-void CompareInferMeta(const MetaTensor& x,
-                      const MetaTensor& y,
-                      int axis,
-                      MetaTensor* out) {
+void CompareRawInferMeta(const MetaTensor& x,
+                         const MetaTensor& y,
+                         int axis,
+                         MetaTensor* out) {
   auto dim_x = x.dims();
   auto dim_y = y.dims();
 
@@ -358,6 +358,12 @@ void CompareInferMeta(const MetaTensor& x,
   out->set_dtype(DataType::BOOL);
 }
 
+void CompareInferMeta(const MetaTensor& x,
+                      const MetaTensor& y,
+                      MetaTensor* out) {
+  CompareRawInferMeta(x, y, -1, out);
+}
+
 void CompareAllInferMeta(const MetaTensor& x,
                          const MetaTensor& y,
                          MetaTensor* out) {
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index 2d3bbf516f236fe544c81db60fb050a160c0138e..e550118374299bf887fb32c6809eef2a47edd6c8 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -69,9 +69,13 @@ void CompareAllInferMeta(const MetaTensor& x,
 
 void CompareInferMeta(const MetaTensor& x,
                       const MetaTensor& y,
-                      int axis,
                       MetaTensor* out);
 
+void CompareRawInferMeta(const MetaTensor& x,
+                         const MetaTensor& y,
+                         int axis,
+                         MetaTensor* out);
+
 void ComplexInferMeta(const MetaTensor& x,
                       const MetaTensor& y,
                       MetaTensor* out);
diff --git a/paddle/phi/kernels/activation_kernel.cc b/paddle/phi/kernels/activation_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26ce10324636fd1c1be607f1a6ef7410d970979e
--- /dev/null
+++ b/paddle/phi/kernels/activation_kernel.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/activation_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HardSwishKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     DenseTensor* out) {
+  HardSwishRawKernel<T, Context>(dev_ctx, x, 6, 6, 3, out);
+}
+
+template <typename T, typename Context>
+void Relu6Kernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 DenseTensor* out) {
+  Relu6RawKernel<T, Context>(dev_ctx, x, 6, out);
+}
+
+template <typename T, typename Context>
+void SwishKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 DenseTensor* out) {
+  SwishRawKernel<T, Context>(dev_ctx, x, 1.0, out);
+}
+
+}  // namespace phi
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(
+    hard_swish, CPU, ALL_LAYOUT, phi::HardSwishKernel, float, double) {}
+PD_REGISTER_KERNEL(relu6, CPU, ALL_LAYOUT, phi::Relu6Kernel, float, double) {}
+PD_REGISTER_KERNEL(swish, CPU, ALL_LAYOUT, phi::SwishKernel, float, double) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(hard_swish,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::HardSwishKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(relu6,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Relu6Kernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(swish,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SwishKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+#endif
+
+#if defined PADDLE_WITH_XPU
+PD_REGISTER_KERNEL(hard_swish, XPU, ALL_LAYOUT, phi::HardSwishKernel, float) {}
+PD_REGISTER_KERNEL(relu6, XPU, ALL_LAYOUT, phi::Relu6Kernel, float) {}
+PD_REGISTER_KERNEL(swish, XPU, ALL_LAYOUT, phi::SwishKernel, float) {}
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+PD_REGISTER_KERNEL(hard_swish,
+                   OneDNN,
+                   ONEDNN,
+                   phi::HardSwishKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(
+    relu6, OneDNN, ONEDNN, phi::Relu6Kernel, float, phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(
+    swish, OneDNN, ONEDNN, phi::SwishKernel, float, phi::dtype::bfloat16) {}
+#endif
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
index 15545b931da0589ca9f7305c4eb7892539759ef6..9ea8423253ad42103c1627a4a93ce3611600b277 100644
--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -75,13 +75,13 @@ DECLARE_ACTIVATION_KERNEL(Negative)
 
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, threshold)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6Raw, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
-DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SwishRaw, beta)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps)
 
@@ -90,14 +90,29 @@ DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
 
+template <typename T, typename Context>
+void HardSwishRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        float threshold,
+                        float scale,
+                        float offset,
+                        DenseTensor* out);
+
 template <typename T, typename Context>
 void HardSwishKernel(const Context& dev_ctx,
                      const DenseTensor& x,
-                     float threshold,
-                     float scale,
-                     float offset,
                      DenseTensor* out);
 
+template <typename T, typename Context>
+void Relu6Kernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 DenseTensor* out);
+
+template <typename T, typename Context>
+void SwishKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 DenseTensor* out);
+
 template <typename T, typename Context>
 void PowKernel(const Context& dev_ctx,
                const DenseTensor& x,
diff --git a/paddle/phi/kernels/compare_kernel.h b/paddle/phi/kernels/compare_kernel.h
index 5b6b8cd868f9fcb9048e00526b272e3cd4c54682..958bb85e4ceae2cba95ec645247ab29162f44752 100644
--- a/paddle/phi/kernels/compare_kernel.h
+++ b/paddle/phi/kernels/compare_kernel.h
@@ -18,20 +18,25 @@ limitations under the License. */
 
 namespace phi {
 
-#define DECALRE_COMPARE_KERNEL(compare_kernel) \
-  template <typename T, typename Context>      \
-  void compare_kernel(const Context& ctx,      \
-                      const DenseTensor& x,    \
-                      const DenseTensor& y,    \
-                      int axis,                \
-                      DenseTensor* out);
-
-DECALRE_COMPARE_KERNEL(LessThanKernel)
-DECALRE_COMPARE_KERNEL(LessEqualKernel)
-DECALRE_COMPARE_KERNEL(GreaterThanKernel)
-DECALRE_COMPARE_KERNEL(GreaterEqualKernel)
-DECALRE_COMPARE_KERNEL(EqualKernel)
-DECALRE_COMPARE_KERNEL(NotEqualKernel)
+#define DECALRE_COMPARE_KERNEL(name)         \
+  template <typename T, typename Context>    \
+  void name##RawKernel(const Context& ctx,   \
+                       const DenseTensor& x, \
+                       const DenseTensor& y, \
+                       int axis,             \
+                       DenseTensor* out);    \
+  template <typename T, typename Context>    \
+  void name##Kernel(const Context& ctx,      \
+                    const DenseTensor& x,    \
+                    const DenseTensor& y,    \
+                    DenseTensor* out);
+
+DECALRE_COMPARE_KERNEL(LessThan)
+DECALRE_COMPARE_KERNEL(LessEqual)
+DECALRE_COMPARE_KERNEL(GreaterThan)
+DECALRE_COMPARE_KERNEL(GreaterEqual)
+DECALRE_COMPARE_KERNEL(Equal)
+DECALRE_COMPARE_KERNEL(NotEqual)
 #undef DECALRE_COMPARE_KERNEL
 
 #define DECALRE_COMPARE_ALL_KERNEL(compare_all_kernel) \
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index f5ff006b3acf509149680e69429c4217852e8a42..f3905c1f805af4dcb18ad7605f53d2816fbbf37e 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -96,12 +96,12 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                      ThresholdedReluFunctor,
                                      threshold)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, Relu6Functor, threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6Raw, Relu6Functor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
-DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SwishRaw, SwishFunctor, beta)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CELUFunctor, alpha)
 
 DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, HardTanhFunctor, t_min, t_max)
@@ -113,12 +113,12 @@ DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
                                      offset)
 
 template <typename T, typename Context>
-void HardSwishKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     float threshold,
-                     float scale,
-                     float offset,
-                     DenseTensor* out) {
+void HardSwishRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        float threshold,
+                        float scale,
+                        float offset,
+                        DenseTensor* out) {
   funcs::HardSwishFunctor<T> functor;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
@@ -149,7 +149,7 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_tanh, HardTanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(relu6_raw, Relu6RawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(softshrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
@@ -182,8 +182,8 @@ PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
 PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_swish_raw, HardSwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
 PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel)
diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc
index ae6c3fd5cb020110d5b8b95ab313e94d6724b843..ffea6a32528b3e9aac885438870176f44cf65104 100644
--- a/paddle/phi/kernels/cpu/compare_kernel.cc
+++ b/paddle/phi/kernels/cpu/compare_kernel.cc
@@ -71,73 +71,6 @@ inline void CompareAllKernelImpl(const Context& ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(less_than,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::LessThanKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(less_equal,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::LessEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(greater_than,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::GreaterThanKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(greater_equal,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::GreaterEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(equal,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::EqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(not_equal,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::NotEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-
 PD_REGISTER_KERNEL(equal_all,
                    CPU,
                    ALL_LAYOUT,
@@ -147,3 +80,33 @@ PD_REGISTER_KERNEL(equal_all,
                    int64_t,
                    float,
                    double) {}
+
+#define PD_REGISTER_COMPARE_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                     \
+                     CPU,                      \
+                     ALL_LAYOUT,               \
+                     phi::func##Kernel,        \
+                     bool,                     \
+                     int16_t,                  \
+                     int,                      \
+                     int64_t,                  \
+                     float,                    \
+                     double,                   \
+                     phi::dtype::float16) {}   \
+  PD_REGISTER_KERNEL(name##_raw,               \
+                     CPU,                      \
+                     ALL_LAYOUT,               \
+                     phi::func##RawKernel,     \
+                     bool,                     \
+                     int16_t,                  \
+                     int,                      \
+                     int64_t,                  \
+                     float,                    \
+                     double,                   \
+                     phi::dtype::float16) {}
+
+PD_REGISTER_COMPARE_KERNEL(less_than, LessThan)
+PD_REGISTER_COMPARE_KERNEL(less_equal, LessEqual)
+PD_REGISTER_COMPARE_KERNEL(greater_than, GreaterThan)
+PD_REGISTER_COMPARE_KERNEL(greater_equal, GreaterEqual)
+PD_REGISTER_COMPARE_KERNEL(equal, Equal)
+PD_REGISTER_COMPARE_KERNEL(not_equal, NotEqual)
diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
index 3e16d75377e37d2afbb316a6fdefc19d68a9b41a..494929baf9360ad197f28451501b9d6be2ce0bd5 100644
--- a/paddle/phi/kernels/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -122,11 +122,23 @@ using complex128 = ::phi::dtype::complex<double>;
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::phi::dtype::bfloat16;
 
-PD_REGISTER_KERNEL(
-    fmax, CPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(fmax_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FMaxRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
 
-PD_REGISTER_KERNEL(
-    fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(fmin_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FMinRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
 
 PD_REGISTER_KERNEL(maximum_raw,
                    CPU,
diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc
index 88551b34109b6e6317a65c161be38b5270a12148..c6031b34af249c6e054a27d22d0f726ea0ea91cb 100644
--- a/paddle/phi/kernels/elementwise_kernel.cc
+++ b/paddle/phi/kernels/elementwise_kernel.cc
@@ -110,10 +110,32 @@ void SubtractKernel(const Context& dev_ctx,
   SubtractRawKernel<T>(dev_ctx, x, y, axis, out);
 }
 
+template <typename T, typename Context>
+void FMaxKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                DenseTensor* out) {
+  FMaxRawKernel<T, Context>(dev_ctx, x, y, -1, out);
+}
+
+template <typename T, typename Context>
+void FMinKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y,
+                DenseTensor* out) {
+  FMinRawKernel<T, Context>(dev_ctx, x, y, -1, out);
+}
+
 }  // namespace phi
 
 using complex64 = ::phi::dtype::complex<float>;
 using complex128 = ::phi::dtype::complex<double>;
 
+PD_REGISTER_KERNEL(
+    fmax, CPU, ALL_LAYOUT, phi::FMaxKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(
+    fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
+
 PD_REGISTER_KERNEL(maximum,
                    CPU,
                    ALL_LAYOUT,
@@ -210,6 +232,26 @@ PD_REGISTER_KERNEL(divide,
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+PD_REGISTER_KERNEL(fmax,
+                   KPS,
+                   ALL_LAYOUT,
+                   phi::FMaxKernel,
+                   float,
+                   double,
+                   int,
+                   phi::dtype::float16,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(fmin,
+                   KPS,
+                   ALL_LAYOUT,
+                   phi::FMinKernel,
+                   float,
+                   double,
+                   int,
+                   phi::dtype::float16,
+                   int64_t) {}
+
 PD_REGISTER_KERNEL(maximum,
                    KPS,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h
index 65040e1937a59a399549ae3637f1115e2d13bdec..ca9de280c577b898674c6d8694e0d1805a120cb0 100644
--- a/paddle/phi/kernels/elementwise_kernel.h
+++ b/paddle/phi/kernels/elementwise_kernel.h
@@ -19,18 +19,30 @@
 
 namespace phi {
 
+template <typename T, typename Context>
+void FMaxRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis,
+                   DenseTensor* out);
+
 template <typename T, typename Context>
 void FMaxKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const DenseTensor& y,
-                int axis,
                 DenseTensor* out);
 
+template <typename T, typename Context>
+void FMinRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis,
+                   DenseTensor* out);
+
 template <typename T, typename Context>
 void FMinKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const DenseTensor& y,
-                int axis,
                 DenseTensor* out);
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index fd4cdf9c0df58c926f14f11478dfbfb08a40748a..df8ae72346a6dff0ec5c39ef39575c5877e29a0b 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -112,13 +112,13 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                      CudaThresholdedReluFunctor,
                                      threshold)
-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, CudaRelu6Functor, threshold)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Relu6Raw, CudaRelu6Functor, threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
                                      CudaHardShrinkFunctor,
                                      threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
-DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SwishRaw, CudaSwishFunctor, beta)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CudaCELUFunctor, alpha)
 
@@ -138,12 +138,12 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Selu, CudaSeluFunctor, scale, alpha)
 
 template <typename T, typename Context>
-void HardSwishKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     float threshold,
-                     float scale,
-                     float offset,
-                     DenseTensor* out) {
+void HardSwishRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        float threshold,
+                        float scale,
+                        float offset,
+                        DenseTensor* out) {
   funcs::CudaHardSwishFunctor<T> functor;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
@@ -198,7 +198,7 @@ PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_tanh, HardTanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(relu6_raw, Relu6RawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
@@ -254,8 +254,8 @@ PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
 PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_swish_raw, HardSwishRawKernel)
+PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
 PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel)
diff --git a/paddle/phi/kernels/impl/compare_kernel_impl.h b/paddle/phi/kernels/impl/compare_kernel_impl.h
index 2a8b858856c0770e06687ee798c9c1480a8cac5d..f9322db88400a5df4ef3c03bd0843811f7a5daad 100644
--- a/paddle/phi/kernels/impl/compare_kernel_impl.h
+++ b/paddle/phi/kernels/impl/compare_kernel_impl.h
@@ -36,33 +36,38 @@ inline void CompareAllKernelImpl(const Context& ctx,
                                  const DenseTensor& y,
                                  DenseTensor* out);
 
-#define DEFINE_COMPARE_KERNEL(compare_kernel, functor, inverse_functor) \
-  template <typename T, typename Context>                               \
-  void compare_kernel(const Context& ctx,                               \
-                      const DenseTensor& x,                             \
-                      const DenseTensor& y,                             \
-                      int axis,                                         \
-                      DenseTensor* out) {                               \
-    CompareKernelImpl<T, Context, functor<T>, inverse_functor<T>>(      \
-        ctx, x, y, axis, out);                                          \
+#define DEFINE_COMPARE_KERNEL(name, functor, inverse_functor)      \
+  template <typename T, typename Context>                          \
+  void name##RawKernel(const Context& ctx,                         \
+                       const DenseTensor& x,                       \
+                       const DenseTensor& y,                       \
+                       int axis,                                   \
+                       DenseTensor* out) {                         \
+    CompareKernelImpl<T, Context, functor<T>, inverse_functor<T>>( \
+        ctx, x, y, axis, out);                                     \
+  }                                                                \
+  template <typename T, typename Context>                          \
+  void name##Kernel(const Context& ctx,                            \
+                    const DenseTensor& x,                          \
+                    const DenseTensor& y,                          \
+                    DenseTensor* out) {                            \
+    name##RawKernel<T, Context>(ctx, x, y, -1, out);               \
   }
 
-DEFINE_COMPARE_KERNEL(LessThanKernel,
+DEFINE_COMPARE_KERNEL(LessThan,
                       funcs::LessThanFunctor,
                       funcs::GreaterThanFunctor)
-DEFINE_COMPARE_KERNEL(LessEqualKernel,
+DEFINE_COMPARE_KERNEL(LessEqual,
                       funcs::LessEqualFunctor,
                       funcs::GreaterEqualFunctor)
-DEFINE_COMPARE_KERNEL(GreaterThanKernel,
+DEFINE_COMPARE_KERNEL(GreaterThan,
                       funcs::GreaterThanFunctor,
                       funcs::LessThanFunctor)
-DEFINE_COMPARE_KERNEL(GreaterEqualKernel,
+DEFINE_COMPARE_KERNEL(GreaterEqual,
                       funcs::GreaterEqualFunctor,
                       funcs::LessEqualFunctor)
-DEFINE_COMPARE_KERNEL(EqualKernel, funcs::EqualFunctor, funcs::EqualFunctor)
-DEFINE_COMPARE_KERNEL(NotEqualKernel,
-                      funcs::NotEqualFunctor,
-                      funcs::NotEqualFunctor)
+DEFINE_COMPARE_KERNEL(Equal, funcs::EqualFunctor, funcs::EqualFunctor)
+DEFINE_COMPARE_KERNEL(NotEqual, funcs::NotEqualFunctor, funcs::NotEqualFunctor)
 #undef DEFINE_COMPARE_KERNEL
 
 #define DEFINE_COMPARE_ALL_KERNEL(compare_all_kernel, functor) \
diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
index 4f1e7af582c96357d34bbdd91d9a685546c9558a..6974881e49ad41705de3782ea7e32612aadefcd0 100644
--- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
@@ -67,22 +67,22 @@ namespace phi {
 }
 
 template <typename T, typename Context>
-void FMaxKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& y,
-                int axis,
-                DenseTensor* out) {
+void FMaxRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis,
+                   DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
   funcs::ElementwiseCompute<funcs::FMaxFunctor<T>, T, T>(
       dev_ctx, x, y, axis, funcs::FMaxFunctor<T>(), out);
 }
 
 template <typename T, typename Context>
-void FMinKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& y,
-                int axis,
-                DenseTensor* out) {
+void FMinRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   int axis,
+                   DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
   funcs::ElementwiseCompute<funcs::FMinFunctor<T>, T, T>(
       dev_ctx, x, y, axis, funcs::FMinFunctor<T>(), out);
diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu
index b882fcc2a6c960032936c040bb1604776f60de6a..3ec61f521be530c434616e6d08d6db7706b0b90e 100644
--- a/paddle/phi/kernels/kps/compare_kernel.cu
+++ b/paddle/phi/kernels/kps/compare_kernel.cu
@@ -103,79 +103,20 @@ PD_REGISTER_KERNEL(
     greater_equal, KPS, ALL_LAYOUT, phi::GreaterEqualKernel, int) {}
 PD_REGISTER_KERNEL(equal, KPS, ALL_LAYOUT, phi::EqualKernel, int) {}
 PD_REGISTER_KERNEL(not_equal, KPS, ALL_LAYOUT, phi::NotEqualKernel, int) {}
+
+PD_REGISTER_KERNEL(
+    less_than_raw, KPS, ALL_LAYOUT, phi::LessThanRawKernel, int) {}
+PD_REGISTER_KERNEL(
+    less_equal_raw, KPS, ALL_LAYOUT, phi::LessEqualRawKernel, int) {}
+PD_REGISTER_KERNEL(
+    greater_than_raw, KPS, ALL_LAYOUT, phi::GreaterThanRawKernel, int) {}
+PD_REGISTER_KERNEL(
+    greater_equal_raw, KPS, ALL_LAYOUT, phi::GreaterEqualRawKernel, int) {}
+PD_REGISTER_KERNEL(equal_raw, KPS, ALL_LAYOUT, phi::EqualRawKernel, int) {}
+PD_REGISTER_KERNEL(
+    not_equal_raw, KPS, ALL_LAYOUT, phi::NotEqualRawKernel, int) {}
+
 #else
 
-PD_REGISTER_KERNEL(less_than,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::LessThanKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(less_equal,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::LessEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(greater_than,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::GreaterThanKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(greater_equal,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::GreaterEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(equal,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::EqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(not_equal,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::NotEqualKernel,
-                   bool,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(equal_all,
                    KPS,
@@ -186,4 +127,38 @@ PD_REGISTER_KERNEL(equal_all,
                    int64_t,
                    float,
                    double) {}
+
+#define PD_REGISTER_COMPARE_KERNEL(name, func)  \
+  PD_REGISTER_KERNEL(name,                      \
+                     KPS,                       \
+                     ALL_LAYOUT,                \
+                     phi::func##Kernel,         \
+                     bool,                      \
+                     int16_t,                   \
+                     int,                       \
+                     int64_t,                   \
+                     float,                     \
+                     double,                    \
+                     phi::dtype::float16,       \
+                     phi::dtype::bfloat16) {}   \
+  PD_REGISTER_KERNEL(name##_raw,                \
+                     KPS,                       \
+                     ALL_LAYOUT,                \
+                     phi::func##RawKernel,      \
+                     bool,                      \
+                     int16_t,                   \
+                     int,                       \
+                     int64_t,                   \
+                     float,                     \
+                     double,                    \
+                     phi::dtype::float16,       \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_COMPARE_KERNEL(less_than, LessThan)
+PD_REGISTER_COMPARE_KERNEL(less_equal, LessEqual)
+PD_REGISTER_COMPARE_KERNEL(greater_than, GreaterThan)
+PD_REGISTER_COMPARE_KERNEL(greater_equal, GreaterEqual)
+PD_REGISTER_COMPARE_KERNEL(equal, Equal)
+PD_REGISTER_COMPARE_KERNEL(not_equal, NotEqual)
+
 #endif
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
index 346c836814769a58c66be42cc293851d2447aadf..ecabc7ef35a3ab8252d8ec1e3adcfef256eccd92 100644
--- a/paddle/phi/kernels/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -93,20 +93,20 @@ using bfloat16 = phi::dtype::bfloat16;
 using complex64 = ::phi::dtype::complex<float>;
 using complex128 = ::phi::dtype::complex<double>;
 
-PD_REGISTER_KERNEL(fmax,
+PD_REGISTER_KERNEL(fmax_raw,
                    KPS,
                    ALL_LAYOUT,
-                   phi::FMaxKernel,
+                   phi::FMaxRawKernel,
                    float,
                    double,
                    int,
                    float16,
                    int64_t) {}
 
-PD_REGISTER_KERNEL(fmin,
+PD_REGISTER_KERNEL(fmin_raw,
                    KPS,
                    ALL_LAYOUT,
-                   phi::FMinKernel,
+                   phi::FMinRawKernel,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc
index b9db43fa03b962b735b38da23c8b198cbeff9c80..c6367c826cfcfe6e0eac16224519af452f56ad2f 100644
--- a/paddle/phi/kernels/onednn/activation_kernel.cc
+++ b/paddle/phi/kernels/onednn/activation_kernel.cc
@@ -154,15 +154,15 @@ DEFINE_ONEDNN_ACTIVATION_KERNEL(Round, RoundOneDNNFunctor)
 DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluOneDNNFunctor, alpha)
 DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, ReluOneDNNFunctor, alpha)
 DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishOneDNNFunctor, threshold)
-DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishOneDNNFunctor, beta)
+DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(SwishRaw, SwishOneDNNFunctor, beta)
 
 template <typename T, typename Context>
-void HardSwishKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     float threshold,
-                     float scale,
-                     float offset,
-                     DenseTensor* out) {
+void HardSwishRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        float threshold,
+                        float scale,
+                        float offset,
+                        DenseTensor* out) {
   HardSwishOneDNNFunctor<T> functor;
   functor(dev_ctx, x, threshold, 0, out);
 }
@@ -182,10 +182,10 @@ void GeluKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void Relu6Kernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 float threshold,
-                 DenseTensor* out) {
+void Relu6RawKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    float threshold,
+                    DenseTensor* out) {
   Relu6OneDNNFunctor<T> functor;
   functor(dev_ctx, x, 0, threshold, out);
 }
@@ -202,12 +202,12 @@ PD_REGISTER_ACTIVATION_KERNEL(abs, AbsKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel)
 PD_REGISTER_ACTIVATION_KERNEL(gelu, GeluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_swish_raw, HardSwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(relu, ReluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(relu6_raw, Relu6RawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
diff --git a/paddle/phi/kernels/sparse/cpu/unary_kernel.cc b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc
index a8fc928108cbd9ebf67a693faa9c82547b98952c..4bbb97936e6e49e3339aa3284a9cd0db7cf6ef4a 100644
--- a/paddle/phi/kernels/sparse/cpu/unary_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc
@@ -95,6 +95,7 @@ PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(pow, Pow)
 PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(scale, Scale)
 PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(expm1, Expm1)
 PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(relu6, Relu6)
+PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(relu6_raw, Relu6Raw)
 PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(leaky_relu, LeakyRelu)
 
 PD_REGISTER_KERNEL(divide_scalar_coo,
diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
index 5ff222720aceb251498498e7595af8479e601450..98a7248b9845a89821d079b5e31f9ef9dfe5ff65 100644
--- a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
@@ -99,6 +99,7 @@ PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(abs, Abs)
 PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(pow, Pow)
 PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(scale, Scale)
 PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(expm1, Expm1)
+PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(relu6_raw, Relu6Raw)
 PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(relu6, Relu6)
 PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(leaky_relu, LeakyRelu)
 
diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h
index a4b89fd813270e23c386d4c6e5b4f69eb235316d..426e580262dbc0759694f6c973f876589b9c42da 100644
--- a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h
+++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h
@@ -89,9 +89,23 @@ DEFINE_SPARSE_UNARY_KERNEL(Relu)
 DEFINE_SPARSE_UNARY_KERNEL(Abs)
 DEFINE_SPARSE_UNARY_KERNEL(Expm1)
 DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Pow, factor)
-DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Relu6, threshold)
+DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Relu6Raw, threshold)
 DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
 
+template <typename T, typename Context>
+void Relu6CooKernel(const Context& dev_ctx,
+                    const SparseCooTensor& x,
+                    SparseCooTensor* out) {
+  Relu6RawCooKernel<T, Context>(dev_ctx, x, 6, out);
+}
+
+template <typename T, typename Context>
+void Relu6CsrKernel(const Context& dev_ctx,
+                    const SparseCsrTensor& x,
+                    SparseCsrTensor* out) {
+  Relu6RawCsrKernel<T, Context>(dev_ctx, x, 6, out);
+}
+
 template <typename T, typename Context>
 void ScaleCooKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
diff --git a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc
index bd4b34892d57f05a90160bc437a58c96237df089..73aae275d6941eee4b30958fbf3c86b90ef1ac9a 100644
--- a/paddle/phi/kernels/xpu/activation_kernel.cc
+++ b/paddle/phi/kernels/xpu/activation_kernel.cc
@@ -356,10 +356,10 @@ struct XPUMishFunctor : public funcs::BaseActivationFunctor<T> {
 };
 
 template <typename T, typename Context>
-void SwishKernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 float beta,
-                 DenseTensor* out) {
+void SwishRawKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    float beta,
+                    DenseTensor* out) {
   using XPUType = typename XPUTypeTrait<T>::Type;
   dev_ctx.template Alloc<T>(out);
   int r = xpu::swish(dev_ctx.x_context(),
@@ -415,7 +415,9 @@ DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, XPUMishFunctor, threshold)
 DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu,
                                             XPULeakyReluFunctor,
                                             alpha)
-DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, XPURelu6Functor, threshold)
+DEFINE_XPU_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6Raw,
+                                            XPURelu6Functor,
+                                            threshold)
 
 DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus,
                                             XPUSoftplusFunctor,
@@ -423,12 +425,12 @@ DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus,
                                             threshold)
 
 template <typename T, typename Context>
-void HardSwishKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     float threshold,
-                     float scale,
-                     float offset,
-                     DenseTensor* out) {
+void HardSwishRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        float threshold,
+                        float scale,
+                        float offset,
+                        DenseTensor* out) {
   XPUHardSwishFunctor<T> functor;
   auto attrs = functor.GetAttrs();
   *(attrs[0].second) = threshold;
@@ -452,13 +454,13 @@ PD_REGISTER_KERNEL(
 PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel)  // no grad
 PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_swish_raw, HardSwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(pow, PowKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
-PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(relu6_raw, Relu6RawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
-PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
 PD_REGISTER_ACTIVATION_KERNEL(square, SquareKernel)
 
diff --git a/paddle/phi/kernels/xpu/compare_kernel.cc b/paddle/phi/kernels/xpu/compare_kernel.cc
index bda9e81c2a1565d7d7c17da996faab8a195b9fd9..7e46b9da647867bde953680122d34a42709674da 100644
--- a/paddle/phi/kernels/xpu/compare_kernel.cc
+++ b/paddle/phi/kernels/xpu/compare_kernel.cc
@@ -52,48 +52,59 @@ void XPUCompareKernelImpl(const Context& dev_ctx,
   PADDLE_ENFORCE_XDNN_SUCCESS(ret, "compare op");
 }
 
-#define DEFINE_XPU_COMPARE_KERNEL(compare_kernel, functor)         \
+#define DEFINE_XPU_COMPARE_KERNEL(name, functor)                   \
   template <typename T, typename Context>                          \
-  void compare_kernel(const Context& dev_ctx,                      \
-                      const DenseTensor& x,                        \
-                      const DenseTensor& y,                        \
-                      int axis,                                    \
-                      DenseTensor* out) {                          \
+  void name##RawKernel(const Context& dev_ctx,                     \
+                       const DenseTensor& x,                       \
+                       const DenseTensor& y,                       \
+                       int axis,                                   \
+                       DenseTensor* out) {                         \
     using XPUType = typename XPUTypeTrait<T>::Type;                \
     XPUCompareKernelImpl<T, XPUType>(dev_ctx, x, y, out, functor); \
+  }                                                                \
+  template <typename T, typename Context>                          \
+  void name##Kernel(const Context& dev_ctx,                        \
+                    const DenseTensor& x,                          \
+                    const DenseTensor& y,                          \
+                    DenseTensor* out) {                            \
+    name##RawKernel<T, Context>(dev_ctx, x, y, -1, out);           \
   }
 
-DEFINE_XPU_COMPARE_KERNEL(EqualKernel, xpu::broadcast_equal<XPUType>)
-DEFINE_XPU_COMPARE_KERNEL(NotEqualKernel, xpu::broadcast_not_equal<XPUType>)
-DEFINE_XPU_COMPARE_KERNEL(LessThanKernel, xpu::broadcast_less_than<XPUType>)
-DEFINE_XPU_COMPARE_KERNEL(LessEqualKernel, xpu::broadcast_less_equal<XPUType>)
-DEFINE_XPU_COMPARE_KERNEL(GreaterThanKernel,
-                          xpu::broadcast_greater_than<XPUType>)
-DEFINE_XPU_COMPARE_KERNEL(GreaterEqualKernel,
-                          xpu::broadcast_greater_equal<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(Equal, xpu::broadcast_equal<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(NotEqual, xpu::broadcast_not_equal<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(LessThan, xpu::broadcast_less_than<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(LessEqual, xpu::broadcast_less_equal<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(GreaterThan, xpu::broadcast_greater_than<XPUType>)
+DEFINE_XPU_COMPARE_KERNEL(GreaterEqual, xpu::broadcast_greater_equal<XPUType>)
+
 #undef DEFINE_XPU_COMPARE_KERNEL
 
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    equal, XPU, ALL_LAYOUT, phi::EqualKernel, float, int, int64_t) {}
-PD_REGISTER_KERNEL(
-    not_equal, XPU, ALL_LAYOUT, phi::NotEqualKernel, float, int, int64_t) {}
-PD_REGISTER_KERNEL(
-    less_than, XPU, ALL_LAYOUT, phi::LessThanKernel, float, int, int64_t) {}
-PD_REGISTER_KERNEL(
-    less_equal, XPU, ALL_LAYOUT, phi::LessEqualKernel, float, int, int64_t) {}
-PD_REGISTER_KERNEL(greater_than,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::GreaterThanKernel,
-                   float,
-                   int,
-                   int64_t) {}
-PD_REGISTER_KERNEL(greater_equal,
+    less_than, XPU, ALL_LAYOUT, phi::LessThanKernel, int, int64_t, float) {}
+
+PD_REGISTER_KERNEL(less_than_raw,
                    XPU,
                    ALL_LAYOUT,
-                   phi::GreaterEqualKernel,
-                   float,
+                   phi::LessThanRawKernel,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   float) {}
+
+#define PD_REGISTER_COMPARE_KERNEL(name, func)                           \
+  PD_REGISTER_KERNEL(                                                    \
+      name, XPU, ALL_LAYOUT, phi::func##Kernel, int, int64_t, float) {}  \
+  PD_REGISTER_KERNEL(name##_raw,                                         \
+                     XPU,                                                \
+                     ALL_LAYOUT,                                         \
+                     phi::func##RawKernel,                               \
+                     int,                                                \
+                     int64_t,                                            \
+                     float) {}
+
+PD_REGISTER_COMPARE_KERNEL(less_equal, LessEqual)
+PD_REGISTER_COMPARE_KERNEL(greater_than, GreaterThan)
+PD_REGISTER_COMPARE_KERNEL(greater_equal, GreaterEqual)
+PD_REGISTER_COMPARE_KERNEL(equal, Equal)
+PD_REGISTER_COMPARE_KERNEL(not_equal, NotEqual)
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index fbff006ee93af69450796308037b5db7a46d4495..da61faf543e1f29dd0905d1c1827af0b36987e5e 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -53,6 +53,19 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(STanh,
 
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold");  // NOLINT
 
+KernelSignature HardSwishOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "hard_swish_raw", {"X"}, {"threshold", "scale", "offset"}, {"Out"});
+}
+
+KernelSignature SwishOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("swish_raw", {"X"}, {"beta"}, {"Out"});
+}
+
+KernelSignature Relu6OpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("relu6_raw", {"X"}, {"threshold"}, {"Out"});
+}
+
 KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) {
   if (ctx.HasInput("FactorTensor")) {
     return KernelSignature("pow", {"X"}, {"FactorTensor"}, {"Out"});
@@ -108,10 +121,12 @@ PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::HardTanhGradOpArgumentMapping);
 
 PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping);
-
+PD_REGISTER_ARG_MAPPING_FN(relu6, phi::Relu6OpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad,
                            phi::HardSwishGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hard_swish, phi::HardSwishOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(swish, phi::SwishOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(pow_double_grad,
                            phi::PowDoubleGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/compare_sig.cc b/paddle/phi/ops/compat/compare_sig.cc
index 10e50cb062cddc414249f01fc226b9527ddf1ec8..d237fc77f99e9f468f3884d5addefdb467db6a70 100644
--- a/paddle/phi/ops/compat/compare_sig.cc
+++ b/paddle/phi/ops/compat/compare_sig.cc
@@ -17,27 +17,27 @@
 namespace phi {
 
 KernelSignature LessThanArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("less_than", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("less_than_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature LessEqualArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("less_equal", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("less_equal_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature GreaterThanArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("greater_than", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("greater_than_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature GreaterEqualArgumentMapping(
     const ArgumentMappingContext& ctx) {
-  return KernelSignature("greater_equal", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("greater_equal_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature EqualArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("equal", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("equal_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature NotEqualArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("not_equal", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("not_equal_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 }  // namespace phi
diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc
index 1d82ceaf1dea3b354e21a83af3daa8ca8ada7e7f..371944aadc789d65d45ee89d3930c90b145b03b4 100644
--- a/paddle/phi/ops/compat/elementwise_sig.cc
+++ b/paddle/phi/ops/compat/elementwise_sig.cc
@@ -181,12 +181,12 @@ KernelSignature ElementwiseMulGradOpArgumentMapping(
 
 KernelSignature ElementwiseFMaxOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
-  return KernelSignature("fmax", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("fmax_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature ElementwiseFMinOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
-  return KernelSignature("fmin", {"X", "Y"}, {"axis"}, {"Out"});
+  return KernelSignature("fmin_raw", {"X", "Y"}, {"axis"}, {"Out"});
 }
 
 KernelSignature ElementwiseFMaxGradOpArgumentMapping(
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 91065258a13ad89808c607928fac6911b36946b6..62523afc9aabcaebfc2f40671218eb65f96902e2 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2075,7 +2075,7 @@ def greater_than(x, y, cond=None, name=None):
     attrs = dict()
 
     if in_dygraph_mode():
-        return _C_ops.greater_than(x, y, -1)
+        return _C_ops.greater_than(x, y)
     else:
         helper.append_op(
             type='greater_than',
@@ -2173,8 +2173,7 @@ def equal(x, y, cond=None, name=None):
           out2 = fluid.layers.equal(x=label_cond,y=limit, cond=out_cond) #out2=[False, True] out_cond=[False, True]
     """
     if in_dygraph_mode():
-        default_axis = -1
-        return _C_ops.equal(x, y, default_axis)
+        return _C_ops.equal(x, y)
 
     check_variable_and_dtype(
         x, "x", ["float32", "float64", "int32", "int64"], "equal"
diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
index 367d1e6399032f77ac06618b9740ef93742f3779..1dd6ef6776750c01fa78b6e6a269fea0df63f33d 100644
--- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
@@ -1,6 +1,6 @@
 if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
   set(PLUGIN_URL https://github.com/PaddlePaddle/PaddleCustomDevice.git)
-  set(PLUGIN_TAG 0698428ddba21e6baecb690579f37c48896f7d56)
+  set(PLUGIN_TAG develop)
   file(
     GLOB
     TEST_OPS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 44879745ba1bb63553d915386f0d2ec3c024ef83..f8eb9d35d58acf03cca7784be88c3feebe3e0675 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -402,7 +402,7 @@ def hardswish(x, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.hard_swish(x)
     if in_dygraph_mode():
-        return _C_ops.hardswish(x, 6, 6, 3)
+        return _C_ops.hardswish(x)
 
     check_variable_and_dtype(
         x, 'x', ['float16', 'float32', 'float64'], 'hardswish'
@@ -893,7 +893,7 @@ def relu6(x, name=None):
     """
     threshold = 6.0
     if in_dygraph_mode():
-        return _C_ops.relu6(x, threshold)
+        return _C_ops.relu6(x)
     if in_dynamic_mode():
         return _legacy_C_ops.relu6(x, 'threshold', threshold)
 
@@ -1388,7 +1388,7 @@ def swish(x, name=None):
             #    [-0.23840584,  0.        ,  0.73105854])
     """
     if in_dygraph_mode():
-        return _C_ops.swish(x, 1.0)
+        return _C_ops.swish(x)
     if _in_legacy_dygraph():
         return _legacy_C_ops.swish(x, 'beta', 1.0)
 
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index bc77dcefa45f931ccd497581201a3ffcdcb6e25b..ebe1ec7e9bcdde4207aa5902daed89949a949f40 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -92,7 +92,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None):
             if isinstance(out_shape, Variable)
            else out_shape
         )
-        return _C_ops.affine_grid(theta, _out_shape, use_cudnn, align_corners)
+        return _C_ops.affine_grid(theta, _out_shape, align_corners, use_cudnn)
     elif in_dynamic_mode():
         _out_shape = (
             out_shape.numpy().tolist()
diff --git a/python/paddle/sparse/nn/functional/activation.py b/python/paddle/sparse/nn/functional/activation.py
index a50a64f5164fe09c17796b0612c1984983214c36..cbe2ddd0d79dbfbfef6c57b516f560cf7cd0e702 100644
--- a/python/paddle/sparse/nn/functional/activation.py
+++ b/python/paddle/sparse/nn/functional/activation.py
@@ -140,7 +140,7 @@ def relu6(x, name=None):
             sparse_x = dense_x.to_sparse_coo(1)
             out = paddle.sparse.nn.functional.relu6(sparse_x)
     """
-    return _C_ops.sparse_relu6(x, 6.0)
+    return _C_ops.sparse_relu6(x)
 
 
 @dygraph_only
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index bd7052153f6258c45ade2c0b10751562a37aa5ba..912c69245c0df8c2a07228e084850ec4bd4a8437 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -445,8 +445,7 @@ def equal(x, y, name=None):
         y = full(shape=[1], dtype=x.dtype, fill_value=y)
 
     if in_dygraph_mode():
-        default_axis = -1
-        return _C_ops.equal(x, y, default_axis)
+        return _C_ops.equal(x, y)
     else:
         if _in_legacy_dygraph():
             return _legacy_C_ops.equal(x, y)
@@ -502,8 +501,7 @@ def greater_equal(x, y, name=None):
           print(result1) # result1 = [True False True]
     """
     if in_dygraph_mode():
-        default_axis = -1
-        return _C_ops.greater_equal(x, y, default_axis)
+        return _C_ops.greater_equal(x, y)
     else:
         if _in_legacy_dygraph():
             return _legacy_C_ops.greater_equal(x, y)
@@ -559,7 +557,7 @@ def greater_than(x, y, name=None):
          print(result1) # result1 = [False False True]
     """
     if in_dygraph_mode():
-        return _C_ops.greater_than(x, y, -1)
+        return _C_ops.greater_than(x, y)
     else:
         if _in_legacy_dygraph():
             return _legacy_C_ops.greater_than(x, y)
@@ -616,8 +614,7 @@ def less_equal(x, y, name=None):
          print(result1) # result1 = [True True False]
     """
     if in_dygraph_mode():
-        axis = -1
-        return _C_ops.less_equal(x, y, axis)
+        return _C_ops.less_equal(x, y)
     else:
        if _in_legacy_dygraph():
             return _legacy_C_ops.less_equal(x, y)
@@ -674,8 +671,7 @@ def less_than(x, y, name=None):
          print(result1) # result1 = [False True False]
     """
     if in_dygraph_mode():
-        default_axis = -1
-        return _C_ops.less_than(x, y, default_axis)
+        return _C_ops.less_than(x, y)
     else:
         if _in_legacy_dygraph():
             return _legacy_C_ops.less_than(x, y)
@@ -732,8 +728,7 @@ def not_equal(x, y, name=None):
          print(result1) # result1 = [False True True]
     """
     if in_dygraph_mode():
-        axis = -1
-        return _C_ops.not_equal(x, y, axis)
+        return _C_ops.not_equal(x, y)
     else:
         if _in_legacy_dygraph():
             return _legacy_C_ops.not_equal(x, y)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index a47ca0ed06b9cba00c72d6e2ad272f3dbe0023f1..cf6ff6633bb6f2cf6201955ba626625ad509afa9 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1168,7 +1168,7 @@ def fmax(x, y, name=None):
     axis = -1
     act = None
     if in_dygraph_mode():
-        return _C_ops.fmax(x, y, axis)
+        return _C_ops.fmax(x, y)
     if _in_legacy_dygraph():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type
@@ -1236,7 +1236,7 @@ def fmin(x, y, name=None):
     axis = -1
     act = None
     if in_dygraph_mode():
-        return _C_ops.fmin(x, y, axis)
+        return _C_ops.fmin(x, y)
     if _in_legacy_dygraph():
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type
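
For readers unfamiliar with the convention this patch applies everywhere: each compare/fmax/fmin/activation kernel keeps a "*RawKernel" variant that still accepts the legacy broadcast axis (or the old activation attributes) for old-IR operators, while the new attribute-free kernel simply forwards a default. A minimal standalone sketch of that idea follows; Tensor, EqualRawKernel, and EqualKernel here are simplified stand-ins, not the real phi types or signatures.

// Standalone sketch of the "Raw kernel + thin default wrapper" pattern.
#include <cstdio>
#include <vector>

struct Tensor { std::vector<float> data; };  // stand-in for phi::DenseTensor

// "Raw" variant keeps the legacy broadcast axis for old-style operators.
void EqualRawKernel(const Tensor& x, const Tensor& y, int axis, Tensor* out) {
  (void)axis;  // axis == -1 means "align trailing dimensions" in the real code
  out->data.resize(x.data.size());
  for (size_t i = 0; i < x.data.size(); ++i)
    out->data[i] = (x.data[i] == y.data[i]) ? 1.f : 0.f;
}

// New entry point: no axis argument, forwards the default (-1).
void EqualKernel(const Tensor& x, const Tensor& y, Tensor* out) {
  EqualRawKernel(x, y, -1, out);
}

int main() {
  Tensor a{{1, 2, 3}}, b{{1, 0, 3}}, out;
  EqualKernel(a, b, &out);
  for (float v : out.data) std::printf("%g ", v);  // prints: 1 0 1
}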
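
The axis kept by CompareRawInferMeta controls how a lower-rank operand is aligned before broadcasting; the default -1 right-aligns it, which is why dropping the argument from the Python API is behavior-preserving. The sketch below is a guess at that shape rule only (the real logic lives in funcs::GetBroadcastDimsArrays); BroadcastDims is a hypothetical helper, not a phi function.

// Hypothetical sketch of right-aligned broadcast shape inference.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> BroadcastDims(std::vector<int> x, std::vector<int> y,
                               int axis) {
  if (x.size() < y.size()) std::swap(x, y);     // let x be the larger rank
  if (axis == -1) axis = static_cast<int>(x.size() - y.size());  // right-align
  std::vector<int> out = x;
  for (size_t i = 0; i < y.size(); ++i)
    // assumes dims match or one of them is 1, as in NumPy-style broadcasting
    out[axis + i] = std::max(x[axis + i], y[i]);
  return out;
}

int main() {
  auto d = BroadcastDims({2, 3, 4}, {3, 4}, -1);
  for (int v : d) std::printf("%d ", v);  // prints: 2 3 4
}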
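
The new PD_REGISTER_COMPARE_KERNEL macros exist so that each op name registers both the default kernel and its "_raw" counterpart in one place, keeping the two dtype lists from drifting apart. A standalone sketch of that paired-registration idea, with a hypothetical REGISTER_PAIR macro and a toy registry rather than phi's kernel registry:

// Standalone sketch: one macro invocation registers "name" and "name_raw".
#include <functional>
#include <iostream>
#include <map>
#include <string>

using Fn = std::function<void()>;
std::map<std::string, Fn>& Registry() {
  static std::map<std::string, Fn> r;
  return r;
}

#define REGISTER_PAIR(name, default_fn, raw_fn) \
  static const bool name##_registered = [] {    \
    Registry()[#name] = default_fn;             \
    Registry()[#name "_raw"] = raw_fn;          \
    return true;                                \
  }();

void EqualDefault() { std::cout << "equal (axis fixed at -1)\n"; }
void EqualRaw() { std::cout << "equal_raw (explicit axis attr)\n"; }
REGISTER_PAIR(equal, EqualDefault, EqualRaw)

int main() {
  Registry()["equal"]();      // dispatch path of the new attribute-free op
  Registry()["equal_raw"]();  // dispatch path of the legacy fluid op
}

The same split shows up in the argument mappings: compare_sig.cc and elementwise_sig.cc route the legacy operators (which still carry an axis attribute) to the "_raw" kernels, while the generated new-style API calls the default ones.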
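
For the activations, the new paddle/phi/kernels/activation_kernel.cc wrappers bake in the former default attribute values (hardswish: threshold=6, scale=6, offset=3; relu6: threshold=6; swish: beta=1.0). A minimal standalone sketch of the hardswish case, with simplified vector types instead of DenseTensor:

// Sketch: attribute-free kernel forwarding the old defaults.
#include <algorithm>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;

// hard_swish(x) = x * min(max(x + offset, 0), threshold) / scale
void HardSwishRaw(const Vec& x, float t, float s, float o, Vec* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    (*out)[i] = x[i] * std::min(std::max(x[i] + o, 0.f), t) / s;
}

// Mirrors the new wrapper: forwards threshold=6, scale=6, offset=3.
void HardSwish(const Vec& x, Vec* out) { HardSwishRaw(x, 6.f, 6.f, 3.f, out); }

int main() {
  Vec out;
  HardSwish({-1.f, 0.f, 4.f}, &out);
  for (float v : out) std::printf("%g ", v);  // prints: -0.333333 0 4
}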