diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 9acc8cc8db97bf1a93f0cc214527f011d08aa6a5..de67958d5fe91eb0be0a4e5151b59599c7084162 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -111,6 +111,7 @@ file(
   "gpu/*.cu.cc"
   "gpudnn/*.cu"
   "kps/*.cu"
+  "legacy/kps/*.cu"
   "selected_rows/gpu/*.cu"
   "sparse/gpu/*.cu"
   "strings/gpu/*.cu"
@@ -152,6 +153,8 @@ if(WITH_MKLDNN)
     kernel_cc
     "*.cc"
     "cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
     "selected_rows/*.cc"
     "selected_rows/cpu/*.cc"
     "sparse/*.cc"
@@ -168,6 +171,8 @@ else()
     kernel_cc
     "*.cc"
     "cpu/*.cc"
+    "legacy/*.cc"
+    "legacy/cpu/*.cc"
     "selected_rows/*.cc"
     "selected_rows/cpu/*.cc"
     "sparse/*.cc"
@@ -178,7 +183,8 @@ else()
     "fusion/cpu/*.cc")
 endif()
 
-file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc")
+file(GLOB kernel_xpu "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc"
+     "fusion/xpu/*.cc")
 
 if(WITH_MKLDNN)
   set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} get_kerneltype_forvar_utils)
@@ -201,6 +207,16 @@ elseif(WITH_XPU)
   if(WITH_XPU_KP)
     file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
          DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
+    # legacy/kps reuses file names that already exist in kps/ (both ship
+    # elementwise_kernel.cu), so stage each legacy source under a "legacy_"
+    # prefix instead of letting it overwrite the kps/ copy staged above.
+    file(GLOB legacy_kernel_xpu_kps
+         "${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/*.cu")
+    foreach(kernel ${legacy_kernel_xpu_kps})
+      get_filename_component(name ${kernel} NAME)
+      configure_file(
+        ${kernel} "${CMAKE_CURRENT_BINARY_DIR}/kps/legacy_${name}" COPYONLY)
+    endforeach()
     file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu")
     foreach(kernel ${kernel_xpu_kps})
       get_filename_component(name ${kernel} NAME_WE)
@@ -212,6 +228,8 @@ elseif(WITH_XPU)
       RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
       "*.cc"
       "cpu/*.cc"
+      "legacy/*.cc"
+      "legacy/cpu/*.cc"
       "selected_rows/*.cc"
       "selected_rows/cpu/*.cc"
       "sparse/*.cc"
diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
index 11aac8bbfe3ad37749d1098d81a977db6aaffd2e..9b564679b354e38b157d6b1924aeda4a55d2e6e4 100644
--- a/paddle/phi/kernels/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
@@ -22,84 +23,48 @@
 namespace phi {
 
 template <typename T, typename Context>
-void MaximumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  funcs::ElementwiseCompute<funcs::MaximumFunctor<T>, T>(
-      dev_ctx, x, y, axis, funcs::MaximumFunctor<T>(), out);
+void MaximumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  MaximumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void MinimumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  funcs::ElementwiseCompute<funcs::MinimumFunctor<T>, T>(
-      dev_ctx, x, y, axis, funcs::MinimumFunctor<T>(), out);
+void MinimumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  MinimumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void RemainderRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        const DenseTensor& y,
-                        int axis,
-                        DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  auto x_dims = x.dims();
-  auto y_dims = y.dims();
-  if (x_dims.size() >= y_dims.size()) {
-    funcs::ElementwiseCompute<funcs::RemainderFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::RemainderFunctor<T>(), out);
-  } else {
-    funcs::ElementwiseCompute<funcs::InverseRemainderFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::InverseRemainderFunctor<T>(), out);
-  }
+void RemainderKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  RemainderRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void FloorDivideRawKernel(const Context& dev_ctx,
-                          const DenseTensor& x,
-                          const DenseTensor& y,
-                          int axis,
-                          DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  auto x_dims = x.dims();
-  auto y_dims = y.dims();
-  if (x_dims.size() >= y_dims.size()) {
-    funcs::ElementwiseCompute<funcs::FloorDivideFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::FloorDivideFunctor<T>(), out);
-  } else {
-    funcs::ElementwiseCompute<funcs::InverseFloorDivideFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::InverseFloorDivideFunctor<T>(), out);
-  }
+void FloorDivideKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  FloorDivideRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void ElementwisePowRawKernel(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             int axis,
-                             DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  auto x_dims = x.dims();
-  auto y_dims = y.dims();
-  if (x_dims.size() >= y_dims.size()) {
-    funcs::ElementwiseCompute<funcs::ElementwisePowFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::ElementwisePowFunctor<T>(), out);
-  } else {
-    funcs::ElementwiseCompute<funcs::ElementwiseInversePowFunctor<T>, T>(
-        dev_ctx, x, y, axis, funcs::ElementwiseInversePowFunctor<T>(), out);
-  }
+void ElementwisePowKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& y,
+                          DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  ElementwisePowRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
@@ -127,42 +92,38 @@ PD_REGISTER_KERNEL(
 PD_REGISTER_KERNEL(
     fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {}
 
-PD_REGISTER_KERNEL(maximum_raw,
+PD_REGISTER_KERNEL(maximum,
                    CPU,
                    ALL_LAYOUT,
-                   phi::MaximumRawKernel,
+                   phi::MaximumKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(minimum_raw,
+PD_REGISTER_KERNEL(minimum,
                    CPU,
                    ALL_LAYOUT,
-                   phi::MinimumRawKernel,
+                   phi::MinimumKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(remainder_raw,
+PD_REGISTER_KERNEL(remainder,
                    CPU,
                    ALL_LAYOUT,
-                   phi::RemainderRawKernel,
+                   phi::RemainderKernel,
                    float,
                    double,
                    int,
                    int64_t) {}
-PD_REGISTER_KERNEL(floor_divide_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::FloorDivideRawKernel,
-                   int,
-                   int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow_raw,
+PD_REGISTER_KERNEL(
+    floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
+PD_REGISTER_KERNEL(elementwise_pow,
                    CPU,
                    ALL_LAYOUT,
-                   phi::ElementwisePowRawKernel,
+                   phi::ElementwisePowKernel,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc
index 98d76c2d944f3d4219c3493b3ed4c06405d58359..0a2a15abd3d75e90cb28622751c3f280cb0d1c15 100644
--- a/paddle/phi/kernels/elementwise_kernel.cc
+++ b/paddle/phi/kernels/elementwise_kernel.cc
@@ -23,51 +23,6 @@
 
 namespace phi {
 
-template <typename T, typename Context>
-void MaximumKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   DenseTensor* out) {
-  int axis = -1;
-  MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
-}
-
-template <typename T, typename Context>
-void MinimumKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   DenseTensor* out) {
-  int axis = -1;
-  MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
-}
-
-template <typename T, typename Context>
-void RemainderKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     DenseTensor* out) {
-  int axis = -1;
-  RemainderRawKernel<T>(dev_ctx, x, y, axis, out);
-}
-
-template <typename T, typename Context>
-void FloorDivideKernel(const Context& dev_ctx,
-                       const DenseTensor& x,
-                       const DenseTensor& y,
-                       DenseTensor* out) {
-  int axis = -1;
-  FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
-}
-
-template <typename T, typename Context>
-void ElementwisePowKernel(const Context& dev_ctx,
-                          const DenseTensor& x,
-                          const DenseTensor& y,
-                          DenseTensor* out) {
-  int axis = -1;
-  ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
-}
-
 template <typename T, typename Context>
 void DivideKernel(const Context& dev_ctx,
                   const DenseTensor& x,
@@ -105,44 +60,6 @@ void SubtractKernel(const Context& dev_ctx,
 using complex64 = ::phi::dtype::complex<float>;
 using complex128 = ::phi::dtype::complex<double>;
 
-PD_REGISTER_KERNEL(maximum,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::MaximumKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(minimum,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::MinimumKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(remainder,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::RemainderKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t) {}
-PD_REGISTER_KERNEL(
-    floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::ElementwisePowKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::bfloat16) {}
-
 PD_REGISTER_KERNEL(subtract,
                    CPU,
                    ALL_LAYOUT,
@@ -192,52 +109,6 @@ PD_REGISTER_KERNEL(divide,
                    complex64,
                    complex128) {}
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-PD_REGISTER_KERNEL(maximum,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::MaximumKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(minimum,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::MinimumKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(remainder,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::RemainderKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(
-    floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::ElementwisePowKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
-
-#endif
-
 #if defined(PADDLE_WITH_XPU_KP) && defined(PADDLE_WITH_XPU)
 PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {}
 PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {}
@@ -329,29 +200,3 @@ PD_REGISTER_KERNEL(subtract,
                    phi::dtype::float16,
                    int64_t) {}
 #endif
-#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
-PD_REGISTER_KERNEL(floor_divide,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::FloorDivideKernel,
-                   float,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(
-    maximum, XPU, ALL_LAYOUT, phi::MaximumKernel, float, phi::dtype::float16) {}
-PD_REGISTER_KERNEL(
-    minimum, XPU, ALL_LAYOUT, phi::MinimumKernel, float, phi::dtype::float16) {}
-PD_REGISTER_KERNEL(remainder,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::RemainderKernel,
-                   float,
-                   phi::dtype::float16,
-                   int32_t,
-                   int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::ElementwisePowKernel,
-                   float,
-                   phi::dtype::float16) {}
-#endif
diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h
index 3bc4163d59e71f2c61b0d684e660fdd12bbcf5fb..5c01639dd7cc2e359ade1311f625563f80b5e1d9 100644
--- a/paddle/phi/kernels/elementwise_kernel.h
+++ b/paddle/phi/kernels/elementwise_kernel.h
@@ -31,65 +31,30 @@ void FMinKernel(const Context& dev_ctx,
                 const DenseTensor& y,
                 DenseTensor* out);
 
-template <typename T, typename Context>
-void MaximumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out);
-
 template <typename T, typename Context>
 void MaximumKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& y,
                    DenseTensor* out);
 
-template <typename T, typename Context>
-void MinimumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out);
-
 template <typename T, typename Context>
 void MinimumKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& y,
                    DenseTensor* out);
 
-template <typename T, typename Context>
-void RemainderRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        const DenseTensor& y,
-                        int axis,
-                        DenseTensor* out);
-
 template <typename T, typename Context>
 void RemainderKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y,
                      DenseTensor* out);
 
-template <typename T, typename Context>
-void FloorDivideRawKernel(const Context& dev_ctx,
-                          const DenseTensor& x,
-                          const DenseTensor& y,
-                          int axis,
-                          DenseTensor* out);
-
 template <typename T, typename Context>
 void FloorDivideKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& y,
                        DenseTensor* out);
 
-template <typename T, typename Context>
-void ElementwisePowRawKernel(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             int axis,
-                             DenseTensor* out);
-
 template <typename T, typename Context>
 void ElementwisePowKernel(const Context& dev_ctx,
                           const DenseTensor& x,
diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu
index 245137943d5710141b04d679505ff319624147d1..80a969c4fabb4ee0fdb123303b7321906594af1a 100644
--- a/paddle/phi/kernels/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -19,11 +19,10 @@
 #endif
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
+#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
 
 namespace phi {
 
-// Create the definition of Maximum
-DEFINE_CUDA_ELEMENTWISE_OP(Maximum)
 template <typename T, typename Context>
 void MaximumKernel(const Context& dev_ctx,
                    const DenseTensor& x,
@@ -33,8 +32,6 @@ void MaximumKernel(const Context& dev_ctx,
   MaximumRawKernel<T>(dev_ctx, x, y, axis, out);
 }
 
-// Create the definition of Minimum
-DEFINE_CUDA_ELEMENTWISE_OP(Minimum)
 template <typename T, typename Context>
 void MinimumKernel(const Context& dev_ctx,
                    const DenseTensor& x,
@@ -43,10 +40,16 @@ void MinimumKernel(const Context& dev_ctx,
   int axis = -1;
   MinimumRawKernel<T>(dev_ctx, x, y, axis, out);
 }
-// Create the definition of Remainder
-DEFINE_CUDA_ELEMENTWISE_OP(Remainder)
-// Create the definition of FloorDivide
-DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide)
+
+template <typename T, typename Context>
+void RemainderKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  RemainderRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
+}
+
 template <typename T, typename Context>
 void FloorDivideKernel(const Context& dev_ctx,
                        const DenseTensor& x,
@@ -73,8 +76,6 @@ void HeavisideKernel(const Context& dev_ctx,
       dev_ctx, inputs, &outputs, -1, funcs::ElementwiseHeavisideFunctor<T>());
 }
 
-// Create the definition of Pow
-DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow)
 template <typename T, typename Context>
 void ElementwisePowKernel(const Context& dev_ctx,
                           const DenseTensor& x,
@@ -86,101 +87,93 @@ void ElementwisePowKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-#ifdef PADDLE_WITH_XPU_KP
-PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
-PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) {
-}
-PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
-PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) {
-}
-PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
-}
-PD_REGISTER_KERNEL(
-    floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {}
-PD_REGISTER_KERNEL(
-    elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
-PD_REGISTER_KERNEL(
-    elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) {
-}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#else
-using float16 = phi::dtype::float16;
-using bfloat16 = phi::dtype::bfloat16;
-using complex64 = ::phi::dtype::complex<float>;
-using complex128 = ::phi::dtype::complex<double>;
-
-PD_REGISTER_KERNEL(fmax,
+PD_REGISTER_KERNEL(maximum,
                    KPS,
                    ALL_LAYOUT,
-                   phi::FMaxKernel,
+                   phi::MaximumKernel,
                    float,
                    double,
                    int,
-                   float16,
-                   int64_t) {}
-
-PD_REGISTER_KERNEL(fmin,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(minimum,
                    KPS,
                    ALL_LAYOUT,
-                   phi::FMinKernel,
+                   phi::MinimumKernel,
                    float,
                    double,
                    int,
-                   float16,
-                   int64_t) {}
-
-PD_REGISTER_KERNEL(maximum_raw,
-                   KPS,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(remainder,
+                   GPU,
                    ALL_LAYOUT,
-                   phi::MaximumRawKernel,
+                   phi::RemainderKernel,
                    float,
                    double,
                    int,
                    int64_t,
-                   float16,
-                   bfloat16) {}
-PD_REGISTER_KERNEL(minimum_raw,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(
+    floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
+PD_REGISTER_KERNEL(elementwise_pow,
                    KPS,
                    ALL_LAYOUT,
-                   phi::MinimumRawKernel,
+                   phi::ElementwisePowKernel,
                    float,
                    double,
                    int,
                    int64_t,
-                   float16,
-                   bfloat16) {}
-PD_REGISTER_KERNEL(remainder_raw,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+
+#endif
+
+#ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {}
+PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {}
+PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) {
+}
+PD_REGISTER_KERNEL(
+    elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {}
+
+#else
+using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(fmax,
                    KPS,
                    ALL_LAYOUT,
-                   phi::RemainderRawKernel,
+                   phi::FMaxKernel,
                    float,
                    double,
                    int,
                    float16,
                    int64_t) {}
-PD_REGISTER_KERNEL(floor_divide_raw,
-                   KPS,
-                   ALL_LAYOUT,
-                   phi::FloorDivideRawKernel,
-                   int,
-                   int64_t) {}
-PD_REGISTER_KERNEL(heaviside,
+
+PD_REGISTER_KERNEL(fmin,
                    KPS,
                    ALL_LAYOUT,
-                   phi::HeavisideKernel,
+                   phi::FMinKernel,
                    float,
                    double,
                    int,
                    float16,
                    int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow_raw,
+
+PD_REGISTER_KERNEL(heaviside,
                    KPS,
                    ALL_LAYOUT,
-                   phi::ElementwisePowRawKernel,
+                   phi::HeavisideKernel,
                    float,
                    double,
                    int,
                    float16,
-                   bfloat16,
                    int64_t) {}
 #endif
diff --git a/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc b/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc
index 6d1f8701c3d3daac05a4413fda59fcb396689e6b..a976cb2a0093379f0a65b3d65a764d60afe3ee64 100644
--- a/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/legacy/cpu/elementwise_kernel.cc
@@ -143,4 +143,5 @@ PD_REGISTER_KERNEL(elementwise_pow_raw,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/legacy/elementwise_kernel.h b/paddle/phi/kernels/legacy/elementwise_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b51704da7a6d61a955b3620b4c30a3f71217deeb
--- /dev/null
+++ b/paddle/phi/kernels/legacy/elementwise_kernel.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/binary.h"
+
+namespace phi {
+
+// Declarations of the "raw" elementwise kernels. Unlike the axis-free kernels
+// declared in paddle/phi/kernels/elementwise_kernel.h, every kernel here takes
+// an explicit `axis` argument. This header only declares the templates.
+
+// Raw counterpart of MaximumKernel.
+template <typename T, typename Context>
+void MaximumRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      int axis,
+                      DenseTensor* out);
+
+// Raw counterpart of MinimumKernel.
+template <typename T, typename Context>
+void MinimumRawKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      int axis,
+                      DenseTensor* out);
+
+// Raw counterpart of RemainderKernel.
+template <typename T, typename Context>
+void RemainderRawKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& y,
+                        int axis,
+                        DenseTensor* out);
+
+// Raw counterpart of FloorDivideKernel.
+template <typename T, typename Context>
+void FloorDivideRawKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& y,
+                          int axis,
+                          DenseTensor* out);
+
+// Raw counterpart of ElementwisePowKernel.
+template <typename T, typename Context>
+void ElementwisePowRawKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             int axis,
+                             DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu
index ec856ffa53b094367b45f2c6a0c838aec992dee8..95cf5d4333e8d8f251a5c490be57311ca9a0f3ff 100644
--- a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu
+++ b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu
@@ -172,5 +172,6 @@ PD_REGISTER_KERNEL(elementwise_pow_raw,
                    double,
                    int,
                    float16,
-                   int64_t) {}
+                   int64_t,
+                   bfloat16) {}
 #endif
diff --git a/paddle/phi/kernels/xpu/elementwise_kernel.cc b/paddle/phi/kernels/xpu/elementwise_kernel.cc
index f70f9e743a41147d9ff91cd9ea351aeadeee75af..386ad2e13ff0edffb6174f6d7e5e6e7eacc7a791 100644
--- a/paddle/phi/kernels/xpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/xpu/elementwise_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/elementwise_kernel.h"
+#include "paddle/phi/kernels/legacy/elementwise_kernel.h"
 #include "paddle/phi/kernels/xpu/elementwise.h"
 
 #include "paddle/phi/backends/xpu/xpu_context.h"
@@ -21,68 +22,37 @@
 namespace phi {
 
 template <typename T, typename Context>
-void FloorDivideRawKernel(const Context& dev_ctx,
-                          const DenseTensor& x,
-                          const DenseTensor& y,
-                          int axis,
-                          DenseTensor* out) {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-  auto f = [](xpu::Context* ctx,
-              const XPUType* x,
-              const XPUType* y,
-              XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
-    return xpu::broadcast_floordiv<XPUType>(ctx, x, y, z, xshape, yshape);
-  };
-
-  XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
+void FloorDivideKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  FloorDivideRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void MaximumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out) {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-  auto f = [](xpu::Context* ctx,
-              const XPUType* x,
-              const XPUType* y,
-              XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
-    return xpu::broadcast_max<XPUType>(ctx, x, y, z, xshape, yshape);
-  };
-
-  XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
+void MaximumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  MaximumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void MinimumRawKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& y,
-                      int axis,
-                      DenseTensor* out) {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-  auto f = [](xpu::Context* ctx,
-              const XPUType* x,
-              const XPUType* y,
-              XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
-    return xpu::broadcast_min<XPUType>(ctx, x, y, z, xshape, yshape);
-  };
-
-  XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
+void MinimumKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  MinimumRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 template <typename T, typename Context>
-void RemainderRawKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        const DenseTensor& y,
-                        int axis,
-                        DenseTensor* out) {
+void RemainderKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     DenseTensor* out) {
   using XPUType = typename XPUTypeTrait<T>::Type;
   auto f = [](xpu::Context* ctx,
               const XPUType* x,
@@ -93,59 +63,41 @@ void RemainderRawKernel(const Context& dev_ctx,
     return xpu::broadcast_mod<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
-  XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
+  XPUElementwise<T, XPUType>(dev_ctx, x, y, -1, out, f);
 }
 
 template <typename T, typename Context>
-void ElementwisePowRawKernel(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             int axis,
-                             DenseTensor* out) {
-  using XPUType = typename XPUTypeTrait<T>::Type;
-  auto f = [](xpu::Context* ctx,
-              const XPUType* x,
-              const XPUType* y,
-              XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
-    return xpu::broadcast_pow<XPUType>(ctx, x, y, z, xshape, yshape);
-  };
-
-  XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
+void ElementwisePowKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& y,
+                          DenseTensor* out) {
+  // This entry point has no axis parameter; the raw kernel is given -1.
+  ElementwisePowRawKernel<T>(dev_ctx, x, y, /*axis=*/-1, out);
 }
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(floor_divide_raw,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::FloorDivideRawKernel,
-                   float,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(maximum_raw,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::MaximumRawKernel,
-                   float,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(minimum_raw,
+PD_REGISTER_KERNEL(floor_divide,
                    XPU,
                    ALL_LAYOUT,
-                   phi::MinimumRawKernel,
+                   phi::FloorDivideKernel,
                    float,
                    phi::dtype::float16) {}
-PD_REGISTER_KERNEL(remainder_raw,
+PD_REGISTER_KERNEL(
+    maximum, XPU, ALL_LAYOUT, phi::MaximumKernel, float, phi::dtype::float16) {}
+PD_REGISTER_KERNEL(
+    minimum, XPU, ALL_LAYOUT, phi::MinimumKernel, float, phi::dtype::float16) {}
+PD_REGISTER_KERNEL(remainder,
                    XPU,
                    ALL_LAYOUT,
-                   phi::RemainderRawKernel,
+                   phi::RemainderKernel,
                    float,
                    phi::dtype::float16,
                    int32_t,
                    int64_t) {}
-PD_REGISTER_KERNEL(elementwise_pow_raw,
+PD_REGISTER_KERNEL(elementwise_pow,
                    XPU,
                    ALL_LAYOUT,
-                   phi::ElementwisePowRawKernel,
+                   phi::ElementwisePowKernel,
                    float,
                    phi::dtype::float16) {}