Unverified commit bb3e4e0c, authored by YuanRisheng, committed by GitHub

[PHI]Mv xpu elementwise add kernel to phi (#45473)

* mv elementwise add to xpu , test=kunlun

* fix ci bugs, test=kunlun

* fix ci bugs , test=kunlun
Parent: 632bc1f2
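In short, this commit deletes the fluid-style XPU OpKernels for elementwise_add and re-implements them as PHI functional kernels. A condensed before/after sketch of the registration, using only names that appear in the diff below:

// Before: fluid operator kernels, registered per operator (removed here)
REGISTER_OP_XPU_KERNEL(elementwise_add,
                       ops::ElementwiseAddXPUKernel<float>,
                       ops::ElementwiseAddXPUKernel<paddle::platform::float16>);

// After: PHI functional kernels, registered in the PHI kernel registry
PD_REGISTER_KERNEL(
    add, XPU, ALL_LAYOUT, phi::AddKernel, phi::dtype::float16, float) {}
PD_REGISTER_KERNEL(
    add_grad, XPU, ALL_LAYOUT, phi::AddGradKernel, phi::dtype::float16, float) {}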
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
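// Note: the fluid OpKernels in this file are superseded by the PHI kernels
// added further down in this diff (phi::AddKernel / phi::AddGradKernel); the
// shared broadcast/reduce logic moves into paddle/phi/kernels/xpu/elementwise.h.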
template <typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
XPUElementwise<T, XPUType>(ctx, xpu::broadcast_add<XPUType>);
}
};
template <typename T>
class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
auto* x = ctx.Input<framework::Tensor>("X");
auto* dz = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
const framework::DDim& dz_dims = dz->dims();
int axis = ctx.Attr<int>("axis");
const T* dz_data = dz->data<T>();
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (dx != nullptr) {
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dz_dims) {
if (dx_data != dz_data) {
framework::TensorCopy(
*dz,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
dx);
}
} else {
// Under the inplace strategy, dx shares dz's buffer; writing the reduced dx
// in place would corrupt dz and make the result of dy wrong, so re-allocate dx.
if (dx->IsSharedBufferWith(*dz)) {
dx->clear();
dx->mutable_data<T>(x->dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims = GetReduceDim(dx->dims(), dz_dims, axis);
std::vector<int> dz_vector = phi::vectorize<int>(dz_dims);
int ret =
xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dx->data<T>()),
dz_vector,
reduce_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
}
}
if (dy != nullptr) {
T* dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dz_dims) {
if (dy_data != dz_data) {
framework::TensorCopy(
*dz,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
dy);
}
} else {
std::vector<int> reduce_dims = GetReduceDim(dy->dims(), dz_dims, axis);
std::vector<int> dz_vector = phi::vectorize<int>(dz_dims);
int ret =
xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dy_data),
dz_vector,
reduce_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(elementwise_add,
ops::ElementwiseAddXPUKernel<float>,
ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
REGISTER_OP_XPU_KERNEL(
elementwise_add_grad,
ops::ElementwiseAddGradXPUKernel<float>,
ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
#endif
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/xpu/elementwise.h"
#include "xpu/refactor/math.h"
namespace paddle {
@@ -48,67 +49,11 @@ void XPUElementwise(const framework::ExecutionContext& ctx,
auto x = x_var->Get<framework::LoDTensor>();
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
auto x_dims = x.dims();
auto y_dims = y->dims();
int max_dim = std::max(x_dims.size(), y_dims.size());
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
platform::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
platform::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1);
if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i];
}
} else {
for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i];
}
}
if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i];
}
} else {
for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i];
}
}
const T* x_data = x.data<T>();
const T* y_data = y->data<T>();
T* z_data = z->data<T>();
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
int ret = xpu::SUCCESS;
ret = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<const XPUType*>(y_data),
reinterpret_cast<XPUType*>(z_data),
x_dims_vec,
y_dims_vec);
PADDLE_ENFORCE_EQ(
ret,
xpu::SUCCESS,
platform::errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ",
ret,
XPUAPIErrorMsg[ret]));
phi::XPUElementwise<T, XPUType>(dev_ctx, x, *y, axis, z, func);
}
template <typename T, typename XPUType>
@@ -128,78 +73,12 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
auto* dz = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
auto* z = dz;
int axis = ctx.Attr<int>("axis");
const framework::DDim& x_dims = x->dims();
const framework::DDim& y_dims = y->dims();
int max_dim = std::max(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
platform::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
platform::errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1);
if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i];
}
} else {
for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i];
}
}
if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i];
}
} else {
for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i];
}
}
const T* x_data = use_x_y_data ? x->data<T>() : z->data<T>();
const T* y_data = use_x_y_data ? y->data<T>() : z->data<T>();
const T* z_data = z->data<T>();
const T* dz_data = dz->data<T>();
T* dx_data = nullptr;
T* dy_data = nullptr;
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (dx) {
dx_data = dx->mutable_data<T>(ctx.GetPlace());
}
if (dy) {
dy_data = dy->mutable_data<T>(ctx.GetPlace());
}
int ret = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<const XPUType*>(y_data),
reinterpret_cast<const XPUType*>(z_data),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dy_data),
reinterpret_cast<XPUType*>(dx_data),
x_dims_vec,
y_dims_vec);
PADDLE_ENFORCE_EQ(
ret,
xpu::SUCCESS,
platform::errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ",
ret,
XPUAPIErrorMsg[ret]));
phi::XPUElementwiseGrad<T, XPUType>(
dev_ctx, *x, *y, *dz, axis, dx, dy, func, use_x_y_data);
}
} // namespace operators
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "xpu/refactor/math.h"
namespace phi {
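// Shared forward helper: allocates the output, right-aligns the two input
// shapes according to `axis` for broadcasting, and dispatches to the XPU DNN
// functor passed in as `func` (e.g. xpu::broadcast_add<XPUType> for add).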
template <typename T, typename XPUType>
void XPUElementwise(const XPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* z,
std::function<int(xpu::Context*,
const XPUType*,
const XPUType*,
XPUType*,
const std::vector<int>&,
const std::vector<int>&)> func) {
dev_ctx.template Alloc<T>(z);
auto x_dims = x.dims();
auto y_dims = y.dims();
int max_dim = std::max(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1);
if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i];
}
} else {
for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i];
}
}
if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i];
}
} else {
for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i];
}
}
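// Example: with x_dims = [2, 3, 4, 5], y_dims = [4, 5] and the default axis
// of -1, axis resolves to 2, x_dims_vec stays [2, 3, 4, 5] and y_dims_vec
// becomes [1, 1, 4, 5], which is the padded shape the XPU broadcast functor
// expects.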
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
T* z_data = z->data<T>();
int ret = xpu::SUCCESS;
ret = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<const XPUType*>(y_data),
reinterpret_cast<XPUType*>(z_data),
x_dims_vec,
y_dims_vec);
PADDLE_ENFORCE_EQ(
ret,
xpu::SUCCESS,
errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ",
ret,
XPUAPIErrorMsg[ret]));
}
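// Shared backward helper: performs the same shape alignment as the forward
// helper and then calls the XPU DNN backward functor `func`; dx_data/dy_data
// stay nullptr when the corresponding gradient is not requested.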
template <typename T, typename XPUType>
void XPUElementwiseGrad(const XPUContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dz,
int axis,
DenseTensor* dx,
DenseTensor* dy,
std::function<int(xpu::Context*,
const XPUType*,
const XPUType*,
const XPUType*,
const XPUType*,
XPUType*,
XPUType*,
const std::vector<int>&,
const std::vector<int>&)> func,
bool use_x_y_data) {
auto* z = &dz;
const DDim& x_dims = x.dims();
const DDim& y_dims = y.dims();
int max_dim = std::max(x_dims.size(), y_dims.size());
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
PADDLE_ENFORCE_GE(
axis,
0,
errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(axis,
max_dim,
errors::InvalidArgument(
"Axis should be less than %d, but received axis is %d.",
max_dim,
axis));
std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1);
if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i];
}
} else {
for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i];
}
}
if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i];
}
} else {
for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i];
}
}
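// Backward functors that do not read the forward inputs are called with
// use_x_y_data == false; in that case the x/y (and z) pointers simply alias
// dz's buffer.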
const T* x_data = use_x_y_data ? x.data<T>() : z->data<T>();
const T* y_data = use_x_y_data ? y.data<T>() : z->data<T>();
const T* z_data = z->data<T>();
const T* dz_data = dz.data<T>();
T* dx_data = nullptr;
T* dy_data = nullptr;
if (dx) {
dx_data = dev_ctx.template Alloc<T>(dx);
}
if (dy) {
dy_data = dev_ctx.template Alloc<T>(dy);
}
int ret = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x_data),
reinterpret_cast<const XPUType*>(y_data),
reinterpret_cast<const XPUType*>(z_data),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dy_data),
reinterpret_cast<XPUType*>(dx_data),
x_dims_vec,
y_dims_vec);
PADDLE_ENFORCE_EQ(
ret,
xpu::SUCCESS,
errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ",
ret,
XPUAPIErrorMsg[ret]));
}
} // namespace phi
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/phi/kernels/elementwise_add_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace phi {
template <typename T, typename Context>
void AddGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
funcs::ElementwiseGradPreProcess(dout, dx);
auto* dz = &dout;
const DDim& dz_dims = dz->dims();
const T* dz_data = dz->data<T>();
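// For add, dL/dx and dL/dy are just dL/dout reduced back to each operand's
// shape: a plain copy when the shapes already match, otherwise a reduce_sum
// over the broadcast dimensions.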
if (dx != nullptr) {
T* dx_data = dev_ctx.template Alloc<T>(dx);
if (dx->dims() == dz_dims) {
if (dx_data != dz_data) {
Copy(dev_ctx, *dz, dev_ctx.GetPlace(), false, dx);
}
} else {
// Under the inplace strategy, dx shares dz's buffer; writing the reduced dx
// in place would corrupt dz and make the result of dy wrong, so re-allocate dx.
if (dx->IsSharedBufferWith(*dz)) {
dx->clear();
dx->Resize(x.dims());
dev_ctx.template Alloc<T>(dx);
}
std::vector<int> reduce_dims =
funcs::GetReduceDim(dx->dims(), dz_dims, axis);
std::vector<int> dz_vector = phi::vectorize<int>(dz_dims);
int ret =
xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dx->data<T>()),
dz_vector,
reduce_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
}
}
if (dy != nullptr) {
T* dy_data = dev_ctx.template Alloc<T>(dy);
if (dy->dims() == dz_dims) {
if (dy_data != dz_data) {
Copy(dev_ctx, *dz, dev_ctx.GetPlace(), false, dy);
}
} else {
std::vector<int> reduce_dims =
funcs::GetReduceDim(dy->dims(), dz_dims, axis);
std::vector<int> dz_vector = phi::vectorize<int>(dz_dims);
int ret =
xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(dz_data),
reinterpret_cast<XPUType*>(dy_data),
dz_vector,
reduce_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
add_grad, XPU, ALL_LAYOUT, phi::AddGradKernel, phi::dtype::float16, float) {
}
@@ -12,10 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
#include "paddle/phi/kernels/xpu/elementwise.h"
namespace phi {
@@ -38,6 +47,25 @@ void GradAddXPUKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
}
template <typename T, typename Context>
void AddRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
XPUElementwise<T, XPUType>(
dev_ctx, x, y, axis, out, xpu::broadcast_add<XPUType>);
}
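// AddKernel is the default entry point; it forwards to AddRawKernel with
// axis = -1, i.e. standard right-aligned broadcasting.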
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
AddRawKernel<T, Context>(dev_ctx, x, y, -1, out);
}
} // namespace phi
PD_REGISTER_KERNEL(grad_add,
@@ -46,3 +74,8 @@ PD_REGISTER_KERNEL(grad_add,
phi::GradAddXPUKernel,
phi::dtype::float16,
float) {}
PD_REGISTER_KERNEL(
add_raw, XPU, ALL_LAYOUT, phi::AddRawKernel, phi::dtype::float16, float) {}
PD_REGISTER_KERNEL(
add, XPU, ALL_LAYOUT, phi::AddKernel, phi::dtype::float16, float) {}