From 2cb19d8fdc9651c2d3f9250163412b1a063c11aa Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Tue, 19 Apr 2022 11:53:16 +0800
Subject: [PATCH] [Phi]Separate AddKernel/DivideKernel/SubtractKernel/MultiplyKernel
 from ElementwiseKernel（Part1） (#41806)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* separate add/div/sub/mul from elementwise

* delete code

* fix compile bugs

* deal with conflict

* fix bugs when compile

* fix windows unit test bug

* fix ci coverage bugs
---
 .../performance_tests/benchmark_eager_cuda.cc |   2 +-
 .../performance_tests/benchmark_fluid_cuda.cc |   2 +-
 .../tests/task_tests/fwd_bwd_joint_test.cc    |   2 +-
 .../new_executor/standalone_executor_test.cc  |   9 +-
 paddle/fluid/imperative/tests/test_tracer.cc  |   1 +
 .../cinn/cinn_instruction_run_op_test.cc      |   2 +-
 .../operators/cinn/cinn_launch_op_test.cc     |   2 +-
 paddle/fluid/operators/determinant_op.h       |   2 +-
 paddle/fluid/operators/eig_op.h               |   4 +-
 .../elementwise/elementwise_add_op.h          |   4 +-
 .../test_elementwise_add_op_inplace.cc        |   3 +
 paddle/fluid/operators/feed_forward_test.cu   |   1 +
 paddle/fluid/operators/lu_op.h                |   3 +-
 .../cpu/elementwise_add_grad_kernel.cc        | 121 +++++++++
 .../phi/kernels/cpu/elementwise_add_kernel.cc |  67 +++++
 .../cpu/elementwise_divide_grad_kernel.cc     |  62 +++++
 .../kernels/cpu/elementwise_divide_kernel.cc  |  85 ++++++
 .../kernels/cpu/elementwise_grad_kernel.cc    | 235 ----------------
 paddle/phi/kernels/cpu/elementwise_kernel.cc  | 101 -------
 .../cpu/elementwise_multiply_grad_kernel.cc   |  79 ++++++
 .../cpu/elementwise_multiply_kernel.cc        |  69 +++++
 .../cpu/elementwise_subtract_grad_kernel.cc   |  75 ++++++
 .../cpu/elementwise_subtract_kernel.cc        |  68 +++++
 .../phi/kernels/cpu/matrix_rank_tol_kernel.cc |   2 +-
 .../phi/kernels/elementwise_add_grad_kernel.h |  49 ++++
 paddle/phi/kernels/elementwise_add_kernel.h   |  45 ++++
 .../kernels/elementwise_divide_grad_kernel.h  |  44 +++
 .../phi/kernels/elementwise_divide_kernel.h   |  46 ++++
 paddle/phi/kernels/elementwise_grad_kernel.h  | 105 --------
 paddle/phi/kernels/elementwise_kernel.cc      | 132 ---------
 paddle/phi/kernels/elementwise_kernel.h       |  96 -------
 .../elementwise_multiply_grad_kernel.h        |  60 +++++
 .../phi/kernels/elementwise_multiply_kernel.h |  46 ++++
 .../elementwise_subtract_grad_kernel.h        |  39 +++
 .../phi/kernels/elementwise_subtract_kernel.h |  46 ++++
 .../gpu/elementwise_add_grad_kernel.cu        | 118 ++++++++
 .../gpu/elementwise_divide_grad_kernel.cu     |  86 ++++++
 .../kernels/gpu/elementwise_grad_kernel.cu    | 254 ------------------
 .../gpu/elementwise_multiply_grad_kernel.cu   |  82 ++++++
 .../gpu/elementwise_subtract_grad_kernel.cu   |  83 ++++++
 .../phi/kernels/gpu/matrix_rank_tol_kernel.cu |   2 +-
 .../impl/cholesky_solve_grad_kernel_impl.h    |   2 +-
 .../impl/determinant_grad_kernel_impl.h       |   2 +-
 .../phi/kernels/impl/eigh_grad_kernel_impl.h  |   4 +-
 .../kernels/impl/elementwise_kernel_impl.h    |  42 +++
 .../phi/kernels/kps/elementwise_add_kernel.cu |  73 +++++
 .../kernels/kps/elementwise_divide_kernel.cu  |  73 +++++
 paddle/phi/kernels/kps/elementwise_kernel.cu  |  85 ------
 .../kps/elementwise_multiply_kernel.cu        |  75 ++++++
 .../kps/elementwise_subtract_kernel.cu        |  75 ++++++
 ...rnel.cc => elementwise_multiply_kernel.cc} |   4 +-
 ...kernel.h => elementwise_multiply_kernel.h} |   0
 paddle/phi/tests/api/test_elementwise_api.cc  |   3 +
 .../tests/kernels/test_elementwise_dev_api.cc |   5 +-
 54 files changed, 1748 insertions(+), 1029 deletions(-)
 create mode 100644 paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_add_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_divide_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc
 create mode 100644 paddle/phi/kernels/elementwise_add_grad_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_add_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_divide_grad_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_divide_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_multiply_grad_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_multiply_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_subtract_grad_kernel.h
 create mode 100644 paddle/phi/kernels/elementwise_subtract_kernel.h
 create mode 100644 paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/kps/elementwise_add_kernel.cu
 create mode 100644 paddle/phi/kernels/kps/elementwise_divide_kernel.cu
 create mode 100644 paddle/phi/kernels/kps/elementwise_multiply_kernel.cu
 create mode 100644 paddle/phi/kernels/kps/elementwise_subtract_kernel.cu
 rename paddle/phi/kernels/selected_rows/{elementwise_kernel.cc => elementwise_multiply_kernel.cc} (96%)
 rename paddle/phi/kernels/selected_rows/{elementwise_kernel.h => elementwise_multiply_kernel.h} (100%)

diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 5e79038981..287d6e770d 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -42,7 +42,7 @@ using namespace egr_utils_api; // NOLINT
 PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index a3e393b039..d9afd7cc96 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -41,7 +41,7 @@
 PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
index d2bef100ca..2d69380cf7 100644
---
a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace egr { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 5efd0fb420..e03277fb31 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -69,14 +69,17 @@ PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP -PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); #else -PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); #endif -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 1c3a04b51a..7bfb3094ba 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -38,6 +38,7 @@ PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 358d0fc6d0..68bc3a0eb5 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 3e363c56eb..4f922945ea 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -36,7 +36,7 @@ DECLARE_bool(enable_pe_launch_cinn); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index a1fe8a2566..702ff3bfd8 100644 --- a/paddle/fluid/operators/determinant_op.h +++ 
b/paddle/fluid/operators/determinant_op.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 6daf05a9d7..fe898a6c41 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,7 +21,9 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index c28abb916b..d77d4ed036 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -26,8 +26,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" // only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ce5c6b701d..f28aae9eed 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -26,6 +26,9 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index e5ebdad1e4..61b80219a2 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -33,6 +33,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif // get paddle matmul op results as baseline diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index d6170b7000..8ef3d60c0d 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,7 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc new file mode 100644 index 0000000000..f8a89b997b --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + ElemwiseExplicitGradCompute, IdentityGrad>( + dev_ctx, + x, + y, + out, + dout, + axis, + dx, + dy, + IdentityGrad(), + IdentityGrad()); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + CPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + CPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + CPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int16_t, + 
int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc new file mode 100644 index 0000000000..6070264547 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc new file mode 100644 index 0000000000..b6541ec0e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, DivGradDY>( + dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + CPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + CPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc new file mode 100644 index 0000000000..d380621818 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index f452d9ffb7..3f5e0b8a4d 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -23,118 +23,6 @@ namespace phi { -template -void AddGradFunc(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseExplicitGradCompute, IdentityGrad>( - dev_ctx, - x, - y, - out, - dout, - axis, - dx, - dy, - IdentityGrad(), - IdentityGrad()); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* out = &dout; - ElementwiseSubGrad(dev_ctx, x, y, *out, dout, 
dx, dy, axis); -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - phi::funcs::ElemwiseGradCompute, DivGradDY>( - dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - auto* out = &dout; // out is not necessary - phi::funcs::ElemwiseGradCompute, MulGradDY>( - dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -163,129 +51,6 @@ void MinimumGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(add_grad, - CPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - CPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - CPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - CPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - CPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - CPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - CPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_triple_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - PD_REGISTER_KERNEL(fmax_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index a91ca1ee32..7478f69d91 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -21,54 +21,6 @@ namespace phi { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ 
- template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -142,14 +94,6 @@ void ElementwisePowRawKernel(const Context& dev_ctx, funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); } -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) } // namespace phi @@ -165,51 +109,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc new file mode 100644 index 0000000000..6055541c80 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + auto* out = &dout; // out is not necessary + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc new file mode 100644 index 0000000000..2424a53301 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc new file mode 100644 index 0000000000..c785eacb9a --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + CPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + CPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc new file mode 100644 index 0000000000..0e97852ac3 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 77c7631710..3bfc07319e 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,7 +17,7 @@ #include #include #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/elementwise_add_grad_kernel.h b/paddle/phi/kernels/elementwise_add_grad_kernel.h new file mode 100644 index 0000000000..9b754cfefe --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout); + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_add_kernel.h b/paddle/phi/kernels/elementwise_add_kernel.h new file mode 100644 index 0000000000..3245c450aa --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h new file mode 100644 index 0000000000..6d29dae99a --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_kernel.h b/paddle/phi/kernels/elementwise_divide_kernel.h new file mode 100644 index 0000000000..5555b69fde --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 0e730fbfbf..6f2f2915ec 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -19,111 +19,6 @@ limitations under the License. 
*/ namespace phi { -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout); - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout); - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void DivideDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dy, - DenseTensor* dout, - DenseTensor* ddout); - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void MultiplyDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* ddout); - -template -void MultiplyTripleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& d_dx, - const DenseTensor& d_dy, - paddle::optional d_ddout, - int axis, - DenseTensor* d_x, - DenseTensor* d_y, - DenseTensor* d_dout, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - template void ElementwiseFMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 6cd602e47b..4cee24d2f8 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -19,42 +19,6 @@ namespace phi { -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - AddRawKernel(dev_ctx, x, y, axis, out); -} - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - SubtractRawKernel(dev_ctx, x, y, axis, out); -} - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - DivideRawKernel(dev_ctx, x, y, axis, out); -} - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - MultiplyRawKernel(dev_ctx, x, y, axis, out); -} - template void MaximumKernel(const Context& dev_ctx, const DenseTensor& x, @@ -105,51 +69,6 @@ void ElementwisePowKernel(const Context& 
dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL(add, - CPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - CPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - CPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - CPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum, CPU, ALL_LAYOUT, @@ -183,57 +102,6 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(add, - GPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - GPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - GPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - GPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} PD_REGISTER_KERNEL(maximum, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index 09b6b02e37..37fe895d40 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -33,58 +33,6 @@ void FMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -150,50 +98,6 @@ void ElementwisePowKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const 
Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h new file mode 100644 index 0000000000..517948a50d --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void MultiplyDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void MultiplyTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + paddle::optional d_ddout, + int axis, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_dout, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_multiply_kernel.h b/paddle/phi/kernels/elementwise_multiply_kernel.h new file mode 100644 index 0000000000..608ae95d2b --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h new file mode 100644 index 0000000000..7be91b4b9f --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_kernel.h b/paddle/phi/kernels/elementwise_subtract_kernel.h new file mode 100644 index 0000000000..1f6c4383df --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu new file mode 100644 index 0000000000..8dd4d0184c --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + GPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + 
phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + GPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + GPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu new file mode 100644 index 0000000000..57bf6da406 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + GPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + GPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index fae7978d3d..c814e7b3bb 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -26,128 +26,6 @@ namespace phi { -template -void AddGradFunc(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - 
DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* out = &dout; - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); - } else { - default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); - } -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - const auto place = dev_ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXAndYOut( - dev_ctx, - place, - axis, - ins, - dout, - dx, - dy, - funcs::DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {&dout, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); - } -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -211,138 +89,6 @@ void MinimumGradKernel(const Context& dev_ctx, } } // namespace phi -PD_REGISTER_KERNEL(add_grad, - GPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - GPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - GPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - 
float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - GPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - GPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - GPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - GPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_triple_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} PD_REGISTER_KERNEL(fmax_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu new file mode 100644 index 0000000000..3442d7f028 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu new file mode 100644 index 0000000000..20f3b73e40 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); + } else { + default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + } +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + GPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + GPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index be6cdc7825..84768866cc 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,7 +23,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 8c37091ef1..371644e643 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,7 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index e4356e9af3..ab1c33d50a 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -17,7 +17,7 @@ #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" 
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5e06435b28..f39786fff2 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,7 +16,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index d5c2c559b2..b126ca9b84 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -22,6 +22,48 @@ #endif namespace phi { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + dev_ctx.template Alloc(out); \ + funcs::BroadcastKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + template void FMaxKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu new file mode 100644 index 0000000000..b5532c6143 --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +DEFINE_CUDA_ELEMENTWISE_OP(Add) + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(add_raw, + KPS, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(add, + KPS, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu new file mode 100644 index 0000000000..852babe29d --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(divide_raw, + KPS, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(divide, + KPS, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 01a34c0f85..5ccd3b1a48 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -22,34 +22,6 @@ namespace phi { -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) // Create the definition of Minimum @@ -64,12 +36,6 @@ DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi #ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} -PD_REGISTER_KERNEL( - subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} -PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} -PD_REGISTER_KERNEL( - multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { @@ -89,57 +55,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, KPS, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - KPS, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - KPS, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(divide_raw, - KPS, - ALL_LAYOUT, - phi::DivideRawKernel, - 
float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - KPS, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu new file mode 100644 index 0000000000..8bede0198c --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(multiply_raw, + KPS, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} +PD_REGISTER_KERNEL(multiply, + KPS, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu new file mode 100644 index 0000000000..757dedb99c --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(subtract_raw, + KPS, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract, + KPS, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + complex64, + complex128, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc similarity index 96% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.cc rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index 7fba3244a6..9fe8eef7ec 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/selected_rows/elementwise_kernel.h" +#include "paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" namespace phi { namespace sr { diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.h b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h similarity index 100% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.h rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h diff --git a/paddle/phi/tests/api/test_elementwise_api.cc b/paddle/phi/tests/api/test_elementwise_api.cc index d4013a788c..fb4c68a87c 100644 --- a/paddle/phi/tests/api/test_elementwise_api.cc +++ b/paddle/phi/tests/api/test_elementwise_api.cc @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(subtract, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, CPU, ALL_LAYOUT); namespace paddle { namespace tests { diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 9552c02976..36b200d4d4 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" -- GitLab