Unverified commit a0f586bc, authored by YuanRisheng, committed by GitHub

[PTen]Separate origin Kernel and add Kernel for C++ API (#39002)

* add kernel for c++ api

* fix compile bugs

* fix kunlun compile bugs

* perfect cmake

* fix compile bugs when run ci-inference

* fix compile bugs

* add non-raw kernel for fluid op

* fix compile bugs

* fix compile bugs

* fix unit test bug
Parent: 854a7ab3
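In short: each kernel that carried fluid-only dispatch arguments (axis for the elementwise ops, reduce_all for the reductions) keeps that signature under a new *_raw name, and a thin wrapper without those arguments is registered under the original name for the new C++ API. A minimal sketch of the pattern, using the add kernel exactly as it appears in the diff below:

template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
                  const DenseTensor& x,
                  const DenseTensor& y,
                  int axis,           // fluid-style broadcast axis
                  DenseTensor* out);  // the original kernel, renamed to *Raw

template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const DenseTensor& x,
               const DenseTensor& y,
               DenseTensor* out) {
  // The C++-API kernel fixes the default axis and forwards to the raw kernel.
  int axis = -1;
  AddRawKernel<T>(dev_ctx, x, y, axis, out);
}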
@@ -103,24 +103,27 @@ function(kernel_library TARGET)
   list(LENGTH gpu_srcs gpu_srcs_len)
   list(LENGTH xpu_srcs xpu_srcs_len)

-  if (${common_srcs_len} GREATER 0)
-    # If the kernel has a device independent public implementation,
-    # we will use this implementation and will not adopt the implementation
-    # under specific devices
+  # Build Target according different src organization
+  if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
+      ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
+    # If the common_srcs depends on specific device srcs, build target using this rule.
     if (WITH_GPU)
-      nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+        nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+      endif()
    elseif (WITH_ROCM)
-      hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+        hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+      endif()
     else()
-      cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-    endif()
-  else()
-    # If the kernel has a header file declaration, but no corresponding
-    # implementation can be found, this is not allowed
-    if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-        ${xpu_srcs_len} EQUAL 0)
-      message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
-    else()
+      if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+        cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+      endif()
+    endif()
+  elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
     if (WITH_GPU)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
         nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -134,6 +137,20 @@ function(kernel_library TARGET)
         cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
       endif()
     endif()
+  else()
+    if (${common_srcs_len} EQUAL 0)
+      message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+    else()
+      # If the kernel has a device independent public implementation,
+      # we will use this implementation and will not adopt the implementation
+      # under specific devices
+      if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      elseif (WITH_ROCM)
+        hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      else()
+        cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      endif()
     endif()
   endif()
......
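With this rule, a kernel that has both a device-independent implementation (common_srcs) and device-specific sources is built in two steps: the device sources go into a ${TARGET}_part library and the common sources link against it, so the *_raw kernels in the cpu/gpu files and the wrappers in the common file land in one target. A hypothetical invocation (target name and DEPS are illustrative, not taken from this diff):

# Hypothetical call: math_kernel has cpu/ and gpu/ srcs holding the raw
# kernels plus a common math_kernel.cc holding the wrappers, so the first
# branch above builds math_kernel_part and links math_kernel against it.
kernel_library(math_kernel DEPS dense_tensor kernel_context)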
@@ -202,7 +202,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       commonterm_for_range(commonterm_functor);
       commonterm_conj = helper.Transpose(commonterm_conj);

-      pten::AddKernel<T>(
+      pten::AddRawKernel<T>(
           static_cast<const typename paddle::framework::ConvertToPtenContext<
               DeviceContext>::TYPE &>(dev_ctx),
           commonterm, commonterm_conj, -1, &commonterm);
......
@@ -61,7 +61,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-      pten::AddKernel<T>(
+      pten::AddRawKernel<T>(
           static_cast<const typename framework::ConvertToPtenContext<
               DeviceContext>::TYPE &>(dev_ctx),
           *pt_x.get(), *pt_y.get(), axis, pt_z.get());
......
@@ -51,7 +51,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-      pten::DivideKernel<T>(
+      pten::DivideRawKernel<T>(
          static_cast<const typename framework::ConvertToPtenContext<
              DeviceContext>::TYPE&>(dev_ctx),
          *pt_x.get(), *pt_y.get(), axis, pt_z.get());
......
@@ -51,7 +51,7 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                              pt_z.get());
+      pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                                 pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
......
@@ -124,7 +124,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(
+      pten::MultiplyRawKernel<T>(
          static_cast<const typename framework::ConvertToPtenContext<
              DeviceContext>::TYPE&>(dev_ctx),
          *pt_x.get(), *pt_y.get(), axis, pt_z.get());
......
@@ -140,26 +140,42 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
+    int axis = ctx.Attr<int>("axis");
     if (Type() == "elementwise_add") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("add", {"X", "Y"}, {"axis"}, {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("add", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
     if (Type() == "elementwise_sub") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("subtract", {"X", "Y"}, {"axis"},
-                                          {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("subtract", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
     if (Type() == "elementwise_div") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("divide", {"X", "Y"}, {"axis"},
-                                          {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
     if (Type() == "elementwise_mul") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("multiply", {"X", "Y"}, {"axis"},
-                                          {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("multiply", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
......
@@ -51,7 +51,7 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-      pten::SubtractKernel<T>(
+      pten::SubtractRawKernel<T>(
          static_cast<const typename framework::ConvertToPtenContext<
              DeviceContext>::TYPE&>(dev_ctx),
          *pt_x.get(), *pt_y.get(), axis, pt_z.get());
......
@@ -221,7 +221,7 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());

-  pten::AddKernel<
+  pten::AddRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),
@@ -234,7 +234,7 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());

-  pten::SubtractKernel<
+  pten::SubtractRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),
......
@@ -551,17 +551,26 @@ class ReduceOp : public framework::OperatorWithKernel {
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext& ctx) const override {
+    bool reduce_all = ctx.Attr<bool>("reduce_all");
     if (Type() == "reduce_sum") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature(
+              "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"});
+        }
         return framework::KernelSignature(
-            "sum", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
+            "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
             {"Out"});
       }
     }
     if (Type() == "reduce_mean") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature("mean", {"X"}, {"dim", "keep_dim"},
+                                            {"Out"});
+        }
         return framework::KernelSignature(
-            "mean", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+            "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
       }
     }
     // TODO(chentianyu03): support other cases after selected rows added
......
@@ -30,7 +30,6 @@ using DeviceContext = paddle::platform::DeviceContext;
 using add_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const DenseTensor&,
-                            int,
                             DenseTensor*);

 using cast_kernel = void (*)(const DeviceContext&,
@@ -46,7 +45,6 @@ using concat_kernel = void (*)(const DeviceContext&,
 using divide_kernel = void (*)(const DeviceContext&,
                                const DenseTensor&,
                                const DenseTensor&,
-                               int,
                                DenseTensor*);

 using dot_kernel = void (*)(const DeviceContext&,
@@ -82,13 +80,11 @@ using mean_kernel = void (*)(const DeviceContext&,
                              const DenseTensor&,
                              const std::vector<int64_t>&,
                              bool,
-                             bool,
                              DenseTensor*);

 using multiply_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);

 using reshape_kernel = void (*)(const DeviceContext&,
@@ -107,14 +103,12 @@ using sum_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const std::vector<int64_t>&,
                             bool,
-                            bool,
                             DataType,
                             DenseTensor*);

 using subtract_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);

 using conj_kernel = void (*)(const DeviceContext&,
......
@@ -20,10 +20,10 @@ namespace pten {
 // the key is kernel_name in fluid, the value is the kernel_name in pten
 // the key is sorted by key's alphabet
 const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
-    {"elementwise_add", "add"},
-    {"elementwise_div", "divide"},
-    {"elementwise_mul", "muliply"},
-    {"elementwise_sub", "subtract"},
+    {"elementwise_add", "add_raw"},
+    {"elementwise_div", "divide_raw"},
+    {"elementwise_mul", "muliply_raw"},
+    {"elementwise_sub", "subtract_raw"},
     {"fill_any_like", "full_like"},
     {"fill_constant", "full"},
     {"flatten_contiguous_range", "flatten"},
@@ -32,8 +32,8 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"matmul_v2_grad", "matmul_grad"},
     {"matmul_v2_grad_grad", "matmul_double_grad"},
     {"matmul_v2_triple_grad", "matmul_triple_grad"},
-    {"reduce_mean", "mean"},
-    {"reduce_sum", "sum"},
+    {"reduce_mean", "mean_raw"},
+    {"reduce_sum", "sum_raw"},
     {"reshape2", "reshape"},
     {"reshape2_grad", "reshape_grad"},
     {"reshape2_grad_grad", "reshape_double_grad"},
......
@@ -32,7 +32,7 @@ namespace pten {
 #define DEFINE_CPU_ELEMENTWISE_OP(name)                                \
   template <typename T, typename Context>                              \
-  void name##Kernel(const Context& dev_ctx,                            \
+  void name##RawKernel(const Context& dev_ctx,                         \
                     const DenseTensor& x,                              \
                     const DenseTensor& y,                              \
                     int axis,                                          \
@@ -55,7 +55,7 @@ namespace pten {
 }

 template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
+void MeanRawKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const std::vector<int64_t>& dims,
                 bool keep_dim,
@@ -67,7 +67,19 @@ void MeanKernel(const Context& dev_ctx,
 }

 template <typename T, typename Context>
-void DivideKernel(const Context& dev_ctx,
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out) {
+  pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& y,
                   int axis,
@@ -90,18 +102,6 @@ void DivideKernel(const Context& dev_ctx,
   }
 }

-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               bool reduce_all,
-               DataType out_dtype,
-               DenseTensor* out) {
-  pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
 // Create the definition of Add
 DEFINE_CPU_ELEMENTWISE_OP(Add)
@@ -118,42 +118,40 @@ using complex128 = ::paddle::platform::complex<double>;
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::paddle::platform::bfloat16;

-PT_REGISTER_KERNEL(
-    mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
-PT_REGISTER_KERNEL(add,
+PT_REGISTER_KERNEL(add_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::AddKernel,
+                   pten::AddRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(subtract,
+PT_REGISTER_KERNEL(subtract_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::SubtractKernel,
+                   pten::SubtractRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(divide,
+PT_REGISTER_KERNEL(divide_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::DivideKernel,
+                   pten::DivideRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(multiply,
+PT_REGISTER_KERNEL(multiply_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::MultiplyKernel,
+                   pten::MultiplyRawKernel,
                    float,
                    double,
                    int,
@@ -161,10 +159,10 @@ PT_REGISTER_KERNEL(multiply,
                    bool,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(sum,
+PT_REGISTER_KERNEL(sum_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::SumKernel,
+                   pten::SumRawKernel,
                    bool,
                    float,
                    double,
@@ -175,3 +173,5 @@ PT_REGISTER_KERNEL(sum,
                    complex128) {
   kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
 }
+
+PT_REGISTER_KERNEL(
+    mean_raw, CPU, ALL_LAYOUT, pten::MeanRawKernel, float, double, bool) {}
@@ -37,7 +37,7 @@ namespace pten {
 #define DEFINE_CUDA_ELEMENTWISE_OP(name)                               \
   template <typename T, typename Context>                              \
-  void name##Kernel(const Context& dev_ctx,                            \
+  void name##RawKernel(const Context& dev_ctx,                         \
                     const DenseTensor& x,                              \
                     const DenseTensor& y,                              \
                     int axis,                                          \
@@ -57,7 +57,7 @@ namespace pten {
 */

 template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
+void MeanRawKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const std::vector<int64_t>& dims,
                 bool keep_dim,
@@ -68,17 +68,8 @@ void MeanKernel(const Context& dev_ctx,
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }

-// Create the definition of Add
-DEFINE_CUDA_ELEMENTWISE_OP(Add)
-// Create the definition of Subtract
-DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
-// Create the definition of Multiply
-DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
-// Create the definition of Divide
-DEFINE_CUDA_ELEMENTWISE_OP(Divide)
-
 template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
+void SumRawKernel(const Context& dev_ctx,
               const DenseTensor& x,
               const std::vector<int64_t>& dims,
               bool keep_dim,
@@ -89,18 +80,25 @@ void SumKernel(const Context& dev_ctx,
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }

+// Create the definition of Add
+DEFINE_CUDA_ELEMENTWISE_OP(Add)
+// Create the definition of Subtract
+DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
+// Create the definition of Multiply
+DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
+// Create the definition of Divide
+DEFINE_CUDA_ELEMENTWISE_OP(Divide)
+
 }  // namespace pten

 using float16 = paddle::platform::float16;
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;

-PT_REGISTER_KERNEL(
-    mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {}
-PT_REGISTER_KERNEL(add,
+PT_REGISTER_KERNEL(add_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::AddKernel,
+                   pten::AddRawKernel,
                    float,
                    double,
                    int,
@@ -108,10 +106,10 @@ PT_REGISTER_KERNEL(add,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(subtract,
+PT_REGISTER_KERNEL(subtract_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::SubtractKernel,
+                   pten::SubtractRawKernel,
                    float,
                    double,
                    int,
@@ -119,10 +117,10 @@ PT_REGISTER_KERNEL(subtract,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(divide,
+PT_REGISTER_KERNEL(divide_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::DivideKernel,
+                   pten::DivideRawKernel,
                    float,
                    double,
                    int,
@@ -130,10 +128,10 @@ PT_REGISTER_KERNEL(divide,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(multiply,
+PT_REGISTER_KERNEL(multiply_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::MultiplyKernel,
+                   pten::MultiplyRawKernel,
                    float,
                    double,
                    int,
@@ -142,10 +140,10 @@ PT_REGISTER_KERNEL(multiply,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(sum,
+PT_REGISTER_KERNEL(sum_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::SumKernel,
+                   pten::SumRawKernel,
                    bool,
                    float,
                    double,
@@ -156,3 +154,12 @@ PT_REGISTER_KERNEL(sum,
                    complex128) {
   kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
 }
+
+PT_REGISTER_KERNEL(mean_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MeanRawKernel,
+                   float,
+                   double,
+                   bool,
+                   float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/kernels/math_kernel.h"
#include "paddle/pten/backends/all_context.h"
#include "paddle/pten/core/kernel_registry.h"
namespace pten {
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DataType out_dtype,
DenseTensor* out) {
bool reduce_all = false;
SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
}
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
AddRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
SubtractRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
DivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MultiplyKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MultiplyRawKernel<T>(dev_ctx, x, y, axis, out);
}
} // namespace pten
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL(
mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
PT_REGISTER_KERNEL(sum,
CPU,
ALL_LAYOUT,
pten::SumKernel,
bool,
float,
double,
paddle::platform::float16,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_KERNEL(add,
CPU,
ALL_LAYOUT,
pten::AddKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(subtract,
CPU,
ALL_LAYOUT,
pten::SubtractKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(divide,
CPU,
ALL_LAYOUT,
pten::DivideKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(multiply,
CPU,
ALL_LAYOUT,
pten::MultiplyKernel,
float,
double,
int,
int64_t,
bool,
complex64,
complex128) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_REGISTER_KERNEL(mean,
GPU,
ALL_LAYOUT,
pten::MeanKernel,
float,
double,
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL(sum,
GPU,
ALL_LAYOUT,
pten::SumKernel,
bool,
float,
double,
paddle::platform::float16,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_KERNEL(add,
GPU,
ALL_LAYOUT,
pten::AddKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(subtract,
GPU,
ALL_LAYOUT,
pten::SubtractKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(divide,
GPU,
ALL_LAYOUT,
pten::DivideKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(multiply,
GPU,
ALL_LAYOUT,
pten::MultiplyKernel,
float,
double,
int,
int64_t,
bool,
paddle::platform::float16,
complex64,
complex128) {}
#endif
@@ -22,16 +22,53 @@ limitations under the License. */

 namespace pten {

 template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& dims,
-                bool keep_dim,
-                bool reduce_all,
-                DenseTensor* out);
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out);
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void SumKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DataType out_dtype,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void AddRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out);

 template <typename T, typename Context>
 void AddKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
-               int axis,
                DenseTensor* out);
@@ -39,6 +76,12 @@ void AddKernel(const Context& dev_ctx,

 template <typename T, typename Context>
+void SubtractRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
+template <typename T, typename Context>
 void SubtractKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& y,
-                    int axis,
                     DenseTensor* out);
@@ -48,78 +91,58 @@ template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis,
+                     DenseTensor* out);
+
+template <typename T, typename Context>
 void DivideKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& y,
-                  int axis,
                   DenseTensor* out);

 template <typename T, typename Context>
-void MultiplyKernel(const Context& dev_ctx,
-                    const DenseTensor& x,
-                    const DenseTensor& y,
-                    int axis,
-                    DenseTensor* out);
+void MultiplyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);

 template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               bool reduce_all,
-               DataType out_dtype,
-               DenseTensor* out);
+void MultiplyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out);

 template <typename T, typename Context>
 DenseTensor Add(const Context& dev_ctx,
                 const DenseTensor& x,
-                const DenseTensor& y,
-                int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  AddKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }

 template <typename T, typename Context>
 DenseTensor Subtract(const Context& dev_ctx,
                      const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  SubtractKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                     const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }

 template <typename T, typename Context>
 DenseTensor Divide(const Context& dev_ctx,
                    const DenseTensor& x,
-                   const DenseTensor& y,
-                   int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  DivideKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                   const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }

 template <typename T, typename Context>
 DenseTensor Multiply(const Context& dev_ctx,
                      const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  MultiplyKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                     const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
@@ -130,8 +153,7 @@ DenseTensor Mean(const Context& dev_ctx,
                  bool keep_dim) {
   auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim);
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
-  bool reduce_all = false;
-  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
   return dense_out;
 }
@@ -144,12 +166,7 @@ DenseTensor Sum(const Context& dev_ctx,
   auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype);
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
-  // The real value of reduce_all will be get in kernel
-  // so use default value(false) is OK.
-  bool reduce_all = false;
-  SumKernel<T, Context>(
-      dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
+  SumKernel<T, Context>(dev_ctx, x, axis, keep_dim, dtype, &dense_out);
   return dense_out;
 }
......
@@ -54,11 +54,10 @@ TEST(DEV_API, add) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;

   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);

   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -101,11 +100,10 @@ TEST(DEV_API, subtract) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;

   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);

   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -148,11 +146,10 @@ TEST(DEV_API, divide) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0 + 1;
   }
-  int axis = 1;

   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);

   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -195,11 +192,10 @@ TEST(DEV_API, multiply) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;

   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);

   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
......
@@ -6,7 +6,6 @@
     param : [x, y, -1]
   kernel :
     func : add
-    param : [x, y, -1]

 - api : cast
   args : (const Tensor& x, DataType out_dtype)
@@ -44,7 +43,6 @@
     param : [x, y, -1]
   kernel :
     func : divide
-    param : [x, y, -1]

 - api : dot
   args : (const Tensor& x, const Tensor& y)
@@ -130,7 +128,6 @@
     param: [x, axis, keep_dim]
   kernel :
     func : mean
-    param : [x, axis, keep_dim, false]

 - api : multiply
   args : (const Tensor& x, const Tensor& y)
@@ -140,7 +137,6 @@
     param : [x, y, -1]
   kernel :
     func : multiply
-    param : [x, y, -1]

 - api : ones_like
   args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED)
@@ -172,7 +168,6 @@
     param : [x, y, -1]
   kernel :
     func : subtract
-    param : [x, y, -1]

 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
@@ -182,7 +177,7 @@
     param: [x, axis, keep_dim, dtype]
   kernel :
     func : sum
-    param : [x, axis, keep_dim, false, DataType::UNDEFINED]
+    param : [x, axis, keep_dim, dtype]
     data_type : x

 - api : zeros_like
......
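After this cleanup, a kernel entry only carries an explicit param list when it differs from the API arguments, as sum still does. Assuming the fields not shown in the hunks above (output, infer_meta func) follow the file's existing conventions, the trimmed add entry would read roughly:

# Sketch of the add entry after this commit; the output field is assumed
# from the file's conventions, the other lines appear in the diff above.
- api : add
  args : (const Tensor& x, const Tensor& y)
  output : Tensor
  infer_meta :
    func : ElementwiseInferMeta
    param : [x, y, -1]
  kernel :
    func : add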