diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 5e790389819f53b250db8797c7a8b3466818abfb..287d6e770dea2cce4387e8aebf53f2826f71725e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -42,7 +42,7 @@ using namespace egr_utils_api; // NOLINT PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index a3e393b039425e506066b485bc8a8688bff20d96..d9afd7cc96523729833ecf3cd72456f07f761586 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -41,7 +41,7 @@ PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index d2bef100ca2b581f230816ca56ebeda97bfc8a0c..2d69380cf78d90c69d1e0760ca77d584708f284c 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace egr { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 5efd0fb42077f0af79579a501fd3a3caaa9a25dd..e03277fb3179980baac980ba503c40b590309bc1 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -69,14 +69,17 @@ PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP -PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); #else -PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); #endif -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc 
b/paddle/fluid/imperative/tests/test_tracer.cc index 1c3a04b51abd036325801af484bb1d800152c328..7bfb3094ba2869269e178d53ec9ed0a834ee9d29 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -38,6 +38,7 @@ PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 358d0fc6d078e124e1629df3eee730d5d5d8b079..68bc3a0eb5c531628b62dc56b0080fb04ee57105 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 3e363c56eb93e52f80375ac70bacd7b65c626144..4f922945eae00a7188d147e21e0bc6216008c39c 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -36,7 +36,7 @@ DECLARE_bool(enable_pe_launch_cinn); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index a1fe8a25665ec84b38a535f541a2cbe33d0a7fcf..702ff3bfd87b0c439483afeca3761d460a8aa862 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 6daf05a9d778dfb194225f59321ffc3eb40235db..fe898a6c41c2a57349ff09d2cc23b7aea2b66cb9 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,7 +21,9 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 
c28abb916b7a7d59d5a1974bed63e43b2f32ef2c..d77d4ed036394e50f1cf6affcfa7045636df6ac0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -26,8 +26,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" // only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ce5c6b701d95894db8e3a84215f537352914706a..f28aae9eed37bc2d0272db18648c55c4d9eb3513 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -26,6 +26,9 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index e5ebdad1e443476baee3f511dc446c40bb59dfd9..61b80219a26b4e352ed52919b69329a0052e9d92 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -33,6 +33,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif // get paddle matmul op results as baseline diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index d6170b7000d635fd306f82efd34074ca0945f476..8ef3d60c0dc0cc4035eeb6ed213a2abb726807eb 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8a89b997b41395a8e8da6bb6c4861b2eb974f29 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + ElemwiseExplicitGradCompute, IdentityGrad>( + dev_ctx, + x, + y, + out, + dout, + axis, + dx, + dy, + IdentityGrad(), + IdentityGrad()); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + CPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + CPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + CPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..607026454724937db841150bdf96d6ea8b976a4c --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6541ec0e681847d2d45da5ff70fba4075562d8e --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, DivGradDY>( + dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + CPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + CPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..d380621818b3595f3df5f08653848ec60eb741ab --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index f452d9ffb7e8950ba2beec7b6cb1b7b09e746851..3f5e0b8a4d8ee6266e96afb2431e8ac38b68e3f9 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -23,118 +23,6 @@ namespace phi { -template -void AddGradFunc(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseExplicitGradCompute, IdentityGrad>( - dev_ctx, - x, - y, - out, - dout, - axis, - dx, - dy, - IdentityGrad(), - IdentityGrad()); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* 
out = &dout; - ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - phi::funcs::ElemwiseGradCompute, DivGradDY>( - dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - auto* out = &dout; // out is not necessary - phi::funcs::ElemwiseGradCompute, MulGradDY>( - dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -163,129 +51,6 @@ void MinimumGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(add_grad, - CPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - CPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - CPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - CPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - CPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - CPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - CPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_triple_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - PD_REGISTER_KERNEL(fmax_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index a91ca1ee3244bdbdde2c9c248317e40d45b3dc17..7478f69d915f1fbfd83363b212da9d39f4718f30 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ 
b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -21,54 +21,6 @@ namespace phi { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -142,14 +94,6 @@ void ElementwisePowRawKernel(const Context& dev_ctx, funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); } -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) } // namespace phi @@ -165,51 +109,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6055541c805f0adef86b50e2944d821ea952dcbc --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + auto* out = &dout; // out is not necessary + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2424a5330109c06d132c5029f1b7b3508aaa022e --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c785eacb9a8bc75ac39ab645cc38f9cee81b58bb --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + CPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + CPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e97852ac33e14b7aa321746e67ee9894f88deea --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 77c763171088cfb049f8ceed170a96e2ba4ad00d..3bfc07319e98dac12fcec00a6172ea113f654b29 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,7 +17,7 @@ #include #include #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/elementwise_add_grad_kernel.h b/paddle/phi/kernels/elementwise_add_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9b754cfefe3657e1c0b5e1ad1528d4b42b023cd1 --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout); + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_add_kernel.h b/paddle/phi/kernels/elementwise_add_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3245c450aaebea72e5d642a39a8e33b95aa8ce9f --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6d29dae99a1314a750faac8479cfab3bd2b56664 --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_kernel.h b/paddle/phi/kernels/elementwise_divide_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5555b69fde1dea84870bb19bd16d9b65fb92786e --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 0e730fbfbfa4de7fddc29d648b8a40d5e3e31951..6f2f2915ecf9e7e1bb6096d3cf1a257d588e826a 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -19,111 +19,6 @@ limitations under the License. 
*/ namespace phi { -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout); - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout); - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void DivideDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dy, - DenseTensor* dout, - DenseTensor* ddout); - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void MultiplyDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* ddout); - -template -void MultiplyTripleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& d_dx, - const DenseTensor& d_dy, - paddle::optional d_ddout, - int axis, - DenseTensor* d_x, - DenseTensor* d_y, - DenseTensor* d_dout, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - template void ElementwiseFMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 6cd602e47b8e6ba2b3085abfad58d3b229f77b7d..4cee24d2f8069d899e2ef84d5802149dc146e68a 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -19,42 +19,6 @@ namespace phi { -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - AddRawKernel(dev_ctx, x, y, axis, out); -} - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - SubtractRawKernel(dev_ctx, x, y, axis, out); -} - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - DivideRawKernel(dev_ctx, x, y, axis, out); -} - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - MultiplyRawKernel(dev_ctx, x, y, axis, out); -} - template void MaximumKernel(const Context& dev_ctx, const DenseTensor& x, @@ 
-105,51 +69,6 @@ void ElementwisePowKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL(add, - CPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - CPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - CPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - CPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum, CPU, ALL_LAYOUT, @@ -183,57 +102,6 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(add, - GPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - GPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - GPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - GPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} PD_REGISTER_KERNEL(maximum, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index 09b6b02e372571095de200b856a2a0103ef05182..37fe895d4051fe3d95a0ac46cf3fd45a7c022ed3 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -33,58 +33,6 @@ void FMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -150,50 +98,6 @@ void ElementwisePowKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, 
&meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..517948a50d1b18380ba29334d0b91d4a32225e88 --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void MultiplyDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void MultiplyTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + paddle::optional d_ddout, + int axis, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_dout, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_multiply_kernel.h b/paddle/phi/kernels/elementwise_multiply_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..608ae95d2ba4b8ee4a3f9b38d3387faafc8589ab --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7be91b4b9f4cdfcfac5e448ef7e803436b937504 --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_kernel.h b/paddle/phi/kernels/elementwise_subtract_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1f6c4383df5d8661a766600a1f969aa6ffb90231 --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8dd4d0184c267613615af7cbf266201785149e72 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + GPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int, + int64_t, + 
phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + GPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + GPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..57bf6da4060d34acdbf44ac80d4577e24247417d --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + GPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + GPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index fae7978d3d2ea0518879224364335eea68b3a831..c814e7b3bb63d659fec5919f7506b2d63bfd3508 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ 
-26,128 +26,6 @@ namespace phi { -template -void AddGradFunc(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* out = &dout; - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); - } else { - default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); - } -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - const auto place = dev_ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXAndYOut( - dev_ctx, - place, - axis, - ins, - dout, - dx, - dy, - funcs::DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {&dout, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); - } -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -211,138 +89,6 @@ void MinimumGradKernel(const Context& dev_ctx, } } // namespace phi -PD_REGISTER_KERNEL(add_grad, - GPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - GPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int, - 
int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - GPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - GPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - GPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - GPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - GPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_triple_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} PD_REGISTER_KERNEL(fmax_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..3442d7f02853913a200c413fe1f7ebe0690e2627 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..20f3b73e4094f651827504b26eb6f1f9cd11af21 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); + } else { + default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + } +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + GPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + GPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index be6cdc7825575857eb4504895a1c1b9ee521895d..84768866cc9e7582b711be274ed79028a89788a2 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,7 +23,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 8c37091ef1b54def8ef81161a4ac0a85fc265967..371644e6434a43bb1a74f7d298a02bc1ebbee760 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,7 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index e4356e9af39372cd330991502078a13520d05586..ab1c33d50a45626a238b25ab6ecbd9cb4944a491 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ 
b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
@@ -17,7 +17,7 @@
 #include "paddle/phi/kernels/determinant_grad_kernel.h"
 
 #include "paddle/phi/kernels/copy_kernel.h"
-#include "paddle/phi/kernels/elementwise_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
index 5e06435b28e2719c2e9fc18de034073f9674a977..f39786fff2665f43b14667f5d9cbea78712bf3ff 100644
--- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
@@ -16,7 +16,9 @@
 #include "paddle/phi/core/dense_tensor.h"
 
 #include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/elementwise_kernel.h"
+#include "paddle/phi/kernels/elementwise_divide_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
+#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
index d5c2c559b2c0690c7bb6c85261353791448f030f..b126ca9b842275610d2dc238a9cd517089d93a7d 100644
--- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h
+++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h
@@ -22,6 +22,48 @@
 #endif
 
 namespace phi {
+
+#define DEFINE_CPU_ELEMENTWISE_OP(name)                                      \
+  template <typename T, typename Context>                                    \
+  void name##RawKernel(const Context& dev_ctx,                               \
+                       const DenseTensor& x,                                 \
+                       const DenseTensor& y,                                 \
+                       int axis,                                             \
+                       DenseTensor* out) {                                   \
+    dev_ctx.template Alloc<T>(out);                                          \
+    if (x.dims() == y.dims()) {                                              \
+      SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()(  \
+          dev_ctx, x, y, out);                                               \
+    } else {                                                                 \
+      auto x_dims = x.dims();                                                \
+      auto y_dims = y.dims();                                                \
+      if (x_dims.size() >= y_dims.size()) {                                  \
+        funcs::ElementwiseCompute<funcs::name##Functor<T>, T>(               \
+            dev_ctx, x, y, axis, funcs::name##Functor<T>(), out);            \
+      } else {                                                               \
+        funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>(      \
+            dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out);   \
+      }                                                                      \
+    }                                                                        \
+  }
+
+#define DEFINE_CUDA_ELEMENTWISE_OP(name)                                     \
+  template <typename T, typename Context>                                    \
+  void name##RawKernel(const Context& dev_ctx,                               \
+                       const DenseTensor& x,                                 \
+                       const DenseTensor& y,                                 \
+                       int axis,                                             \
+                       DenseTensor* out) {                                   \
+    std::vector<const DenseTensor*> inputs;                                  \
+    std::vector<DenseTensor*> outputs;                                       \
+    inputs.emplace_back(&x);                                                 \
+    inputs.emplace_back(&y);                                                 \
+    outputs.emplace_back(out);                                               \
+    dev_ctx.template Alloc<T>(out);                                          \
+    funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(                  \
+        dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>());         \
+  }
+
 template <typename T, typename Context>
 void FMaxKernel(const Context& dev_ctx,
                 const DenseTensor& x,
diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b5532c614314f0064c508555171a7099e6fcdd5f
--- /dev/null
+++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#ifndef PADDLE_WITH_XPU_KP
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/float16.h"
+#endif
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
+
+namespace phi {
+
+DEFINE_CUDA_ELEMENTWISE_OP(Add)
+
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               DenseTensor* out) {
+  int axis = -1;
+  AddRawKernel<T>(dev_ctx, x, y, axis, out);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_XPU_KP
+PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {}
+#else
+
+using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(add_raw,
+                   KPS,
+                   ALL_LAYOUT,
+                   phi::AddRawKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float16,
+                   bfloat16,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(add,
+                   KPS,
+                   ALL_LAYOUT,
+                   phi::AddKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   complex64,
+                   complex128) {}
+#endif
diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..852babe29dbf7a23b5386f40aec97aa5810a659d
--- /dev/null
+++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(divide_raw, + KPS, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(divide, + KPS, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 01a34c0f85eda0302fa581d7c65137c1923677b9..5ccd3b1a482109bfa04a84844b23d95b9e38186e 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -22,34 +22,6 @@ namespace phi { -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) // Create the definition of Minimum @@ -64,12 +36,6 @@ DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi #ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} -PD_REGISTER_KERNEL( - subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} -PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} -PD_REGISTER_KERNEL( - multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { @@ -89,57 +55,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, KPS, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - KPS, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - KPS, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} 
-PD_REGISTER_KERNEL(divide_raw, - KPS, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - KPS, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8bede0198c2fa100b90835e66968a147485558b3 --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(multiply_raw, + KPS, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} +PD_REGISTER_KERNEL(multiply, + KPS, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..757dedb99c9310f3fe1600bd7c204bd5dcda3f5e --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(subtract_raw, + KPS, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract, + KPS, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + complex64, + complex128, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc similarity index 96% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.cc rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index 7fba3244a60eedc032290aa82f99e9eb01e0ff6b..9fe8eef7ec82a3ea51206fda5bb8ddff4718eb98 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/selected_rows/elementwise_kernel.h" +#include "paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" namespace phi { namespace sr { diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.h b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h similarity index 100% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.h rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h diff --git a/paddle/phi/tests/api/test_elementwise_api.cc b/paddle/phi/tests/api/test_elementwise_api.cc index d4013a788c76cb4f049574ee893320088fe7ac2c..fb4c68a87cb25be3c3c154cbb5b334fb0e3702c8 100644 --- a/paddle/phi/tests/api/test_elementwise_api.cc +++ b/paddle/phi/tests/api/test_elementwise_api.cc @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(subtract, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, CPU, ALL_LAYOUT); namespace paddle { namespace tests { diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 9552c02976f30d11601967034815545f94ff1f97..36b200d4d44940b2b7f73e75ea7339517dd5f5e3 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h"