Unverified commit 7a72f7a2, authored by zhangyuqin1998, committed by GitHub

move_elementwise_raw (#53010)

* setup

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc

* fix

* fix

* Update elementwise_kernel.cu

* fix

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc

* Update elementwise_kernel.cc
Parent 06ecc6d2
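The diff below moves the *_raw elementwise kernels into paddle/phi/kernels/legacy/ and keeps the public kernels as thin wrappers that forward to them with the default broadcast axis of -1. A minimal sketch of that wrapper pattern, using names taken from the diff (illustrative only, not the exact file contents):

#include "paddle/phi/kernels/legacy/elementwise_kernel.h"

namespace phi {

// Public kernel: no axis argument. It delegates to the legacy raw kernel,
// which still takes an explicit axis, passing the default axis of -1.
template <typename T, typename Context>
void MaximumKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& y,
                   DenseTensor* out) {
  int axis = -1;
  MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}

}  // namespace phi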
......@@ -111,6 +111,7 @@ file(
"gpu/*.cu.cc"
"gpudnn/*.cu"
"kps/*.cu"
"legacy/kps/*.cu"
"selected_rows/gpu/*.cu"
"sparse/gpu/*.cu"
"strings/gpu/*.cu"
......@@ -152,6 +153,8 @@ if(WITH_MKLDNN)
kernel_cc
"*.cc"
"cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"selected_rows/*.cc"
"selected_rows/cpu/*.cc"
"sparse/*.cc"
......@@ -168,6 +171,8 @@ else()
kernel_cc
"*.cc"
"cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"selected_rows/*.cc"
"selected_rows/cpu/*.cc"
"sparse/*.cc"
......@@ -178,7 +183,8 @@ else()
"fusion/cpu/*.cc")
endif()
file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")
file(GLOB kernel_xpu "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc"
"fusion/xpu/*.cc")
if(WITH_MKLDNN)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
......@@ -201,6 +207,8 @@ elseif(WITH_XPU)
if(WITH_XPU_KP)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/
DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu")
foreach(kernel ${kernel_xpu_kps})
get_filename_component(name ${kernel} NAME_WE)
......@@ -212,6 +220,8 @@ elseif(WITH_XPU)
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*.cc"
"cpu/*.cc"
"legacy/*.cc"
"legacy/cpu/*.cc"
"selected_rows/*.cc"
"selected_rows/cpu/*.cc"
"sparse/*.cc"
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
......@@ -22,84 +23,48 @@
namespace phi {
template <typename T, typename Context>
void MaximumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::MaximumFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::MaximumFunctor<T>(), out);
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MinimumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::MinimumFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::MinimumFunctor<T>(), out);
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void RemainderRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::RemainderFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::RemainderFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::InverseRemainderFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseRemainderFunctor<T>(), out);
}
void RemainderKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
RemainderRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::FloorDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::FloorDivideFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::InverseFloorDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseFloorDivideFunctor<T>(), out);
}
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::ElementwisePowFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::ElementwisePowFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::ElementwiseInversePowFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::ElementwiseInversePowFunctor<T>(), out);
}
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
......@@ -127,42 +92,38 @@ PD_REGISTER_KERNEL(
PD_REGISTER_KERNEL(
fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(maximum_raw,
PD_REGISTER_KERNEL(maximum,
CPU,
ALL_LAYOUT,
phi::MaximumRawKernel,
phi::MaximumKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum_raw,
PD_REGISTER_KERNEL(minimum,
CPU,
ALL_LAYOUT,
phi::MinimumRawKernel,
phi::MinimumKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(remainder_raw,
PD_REGISTER_KERNEL(remainder,
CPU,
ALL_LAYOUT,
phi::RemainderRawKernel,
phi::RemainderKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(floor_divide_raw,
CPU,
ALL_LAYOUT,
phi::FloorDivideRawKernel,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow_raw,
PD_REGISTER_KERNEL(
floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
CPU,
ALL_LAYOUT,
phi::ElementwisePowRawKernel,
phi::ElementwisePowKernel,
float,
double,
int,
......
......@@ -23,51 +23,6 @@
namespace phi {
template <typename T, typename Context>
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void RemainderKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
RemainderRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -105,44 +60,6 @@ void SubtractKernel(const Context& dev_ctx,
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(maximum,
CPU,
ALL_LAYOUT,
phi::MaximumKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum,
CPU,
ALL_LAYOUT,
phi::MinimumKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(remainder,
CPU,
ALL_LAYOUT,
phi::RemainderKernel,
float,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(
floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
CPU,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
double,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(subtract,
CPU,
ALL_LAYOUT,
......@@ -192,52 +109,6 @@ PD_REGISTER_KERNEL(divide,
complex64,
complex128) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(maximum,
KPS,
ALL_LAYOUT,
phi::MaximumKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum,
KPS,
ALL_LAYOUT,
phi::MinimumKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(remainder,
GPU,
ALL_LAYOUT,
phi::RemainderKernel,
float,
double,
int,
int64_t,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(
floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
KPS,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#if defined(PADDLE_WITH_XPU_KP) && defined(PADDLE_WITH_XPU)
PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {}
PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {}
......@@ -329,29 +200,3 @@ PD_REGISTER_KERNEL(subtract,
phi::dtype::float16,
int64_t) {}
#endif
#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
PD_REGISTER_KERNEL(floor_divide,
XPU,
ALL_LAYOUT,
phi::FloorDivideKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(
maximum, XPU, ALL_LAYOUT, phi::MaximumKernel, float, phi::dtype::float16) {}
PD_REGISTER_KERNEL(
minimum, XPU, ALL_LAYOUT, phi::MinimumKernel, float, phi::dtype::float16) {}
PD_REGISTER_KERNEL(remainder,
XPU,
ALL_LAYOUT,
phi::RemainderKernel,
float,
phi::dtype::float16,
int32_t,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
XPU,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
phi::dtype::float16) {}
#endif
......@@ -31,65 +31,30 @@ void FMinKernel(const Context& dev_ctx,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void MaximumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void MinimumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void RemainderRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void RemainderKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
......
......@@ -19,11 +19,10 @@
#endif
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
namespace phi {
// Create the definition of Maximum
DEFINE_CUDA_ELEMENTWISE_OP(Maximum)
template <typename T, typename Context>
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -33,8 +32,6 @@ void MaximumKernel(const Context& dev_ctx,
MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}
// Create the definition of Minimum
DEFINE_CUDA_ELEMENTWISE_OP(Minimum)
template <typename T, typename Context>
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -43,10 +40,16 @@ void MinimumKernel(const Context& dev_ctx,
int axis = -1;
MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
}
// Create the definition of Remainder
DEFINE_CUDA_ELEMENTWISE_OP(Remainder)
// Create the definition of FloorDivide
DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide)
template <typename T, typename Context>
void RemainderKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
RemainderRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -73,8 +76,6 @@ void HeavisideKernel(const Context& dev_ctx,
dev_ctx, inputs, &outputs, -1, funcs::ElementwiseHeavisideFunctor<T>());
}
// Create the definition of Pow
DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow)
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
......@@ -86,101 +87,93 @@ void ElementwisePowKernel(const Context& dev_ctx,
} // namespace phi
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) {
}
PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) {
}
PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
}
PD_REGISTER_KERNEL(
floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {}
PD_REGISTER_KERNEL(
elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
PD_REGISTER_KERNEL(
elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) {
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#else
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(fmax,
PD_REGISTER_KERNEL(maximum,
KPS,
ALL_LAYOUT,
phi::FMaxKernel,
phi::MaximumKernel,
float,
double,
int,
float16,
int64_t) {}
PD_REGISTER_KERNEL(fmin,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum,
KPS,
ALL_LAYOUT,
phi::FMinKernel,
phi::MinimumKernel,
float,
double,
int,
float16,
int64_t) {}
PD_REGISTER_KERNEL(maximum_raw,
KPS,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(remainder,
GPU,
ALL_LAYOUT,
phi::MaximumRawKernel,
phi::RemainderKernel,
float,
double,
int,
int64_t,
float16,
bfloat16) {}
PD_REGISTER_KERNEL(minimum_raw,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(
floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
KPS,
ALL_LAYOUT,
phi::MinimumRawKernel,
phi::ElementwisePowKernel,
float,
double,
int,
int64_t,
float16,
bfloat16) {}
PD_REGISTER_KERNEL(remainder_raw,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
#ifdef PADDLE_WITH_XPU_KP
PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
}
PD_REGISTER_KERNEL(
elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
#else
using float16 = phi::dtype::float16;
using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;
PD_REGISTER_KERNEL(fmax,
KPS,
ALL_LAYOUT,
phi::RemainderRawKernel,
phi::FMaxKernel,
float,
double,
int,
float16,
int64_t) {}
PD_REGISTER_KERNEL(floor_divide_raw,
KPS,
ALL_LAYOUT,
phi::FloorDivideRawKernel,
int,
int64_t) {}
PD_REGISTER_KERNEL(heaviside,
PD_REGISTER_KERNEL(fmin,
KPS,
ALL_LAYOUT,
phi::HeavisideKernel,
phi::FMinKernel,
float,
double,
int,
float16,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow_raw,
PD_REGISTER_KERNEL(heaviside,
KPS,
ALL_LAYOUT,
phi::ElementwisePowRawKernel,
phi::HeavisideKernel,
float,
double,
int,
float16,
bfloat16,
int64_t) {}
#endif
......@@ -143,4 +143,5 @@ PD_REGISTER_KERNEL(elementwise_pow_raw,
float,
double,
int,
int64_t) {}
int64_t,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/binary.h"
namespace phi {
template <typename T, typename Context>
void MaximumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void MinimumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void RemainderRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
} // namespace phi
......@@ -172,5 +172,6 @@ PD_REGISTER_KERNEL(elementwise_pow_raw,
double,
int,
float16,
int64_t) {}
int64_t,
bfloat16) {}
#endif
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
#include "paddle/phi/kernels/xpu/elementwise.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
......@@ -21,68 +22,37 @@
namespace phi {
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_floordiv<XPUType>(ctx, x, y, z, xshape, yshape);
};
XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MaximumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_max<XPUType>(ctx, x, y, z, xshape, yshape);
};
XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
void MaximumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MinimumRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_min<XPUType>(ctx, x, y, z, xshape, yshape);
};
XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
void MinimumKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void RemainderRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
void RemainderKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
......@@ -93,59 +63,41 @@ void RemainderRawKernel(const Context& dev_ctx,
return xpu::broadcast_mod<XPUType>(ctx, x, y, z, xshape, yshape);
};
XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
XPUElementwise<T, XPUType>(dev_ctx, x, y, -1, out, f);
}
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto f = [](xpu::Context* ctx,
const XPUType* x,
const XPUType* y,
XPUType* z,
const std::vector<int>& xshape,
const std::vector<int>& yshape) {
return xpu::broadcast_pow<XPUType>(ctx, x, y, z, xshape, yshape);
};
XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
}
} // namespace phi
PD_REGISTER_KERNEL(floor_divide_raw,
XPU,
ALL_LAYOUT,
phi::FloorDivideRawKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(maximum_raw,
XPU,
ALL_LAYOUT,
phi::MaximumRawKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(minimum_raw,
PD_REGISTER_KERNEL(floor_divide,
XPU,
ALL_LAYOUT,
phi::MinimumRawKernel,
phi::FloorDivideKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(remainder_raw,
PD_REGISTER_KERNEL(
maximum, XPU, ALL_LAYOUT, phi::MaximumKernel, float, phi::dtype::float16) {}
PD_REGISTER_KERNEL(
minimum, XPU, ALL_LAYOUT, phi::MinimumKernel, float, phi::dtype::float16) {}
PD_REGISTER_KERNEL(remainder,
XPU,
ALL_LAYOUT,
phi::RemainderRawKernel,
phi::RemainderKernel,
float,
phi::dtype::float16,
int32_t,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow_raw,
PD_REGISTER_KERNEL(elementwise_pow,
XPU,
ALL_LAYOUT,
phi::ElementwisePowRawKernel,
phi::ElementwisePowKernel,
float,
phi::dtype::float16) {}