Unverified commit 3c2420a3, authored by ykkk2333, committed by GitHub

xpu kernels support api int64 vector inputs, test=kunlun (#49336)

Parent 418edae5
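Every hunk below follows one pattern: a call site that used to pass an XDNN template function (e.g. `xpu::broadcast_add<XPUType>`) straight into a helper such as `XPUElementwise` now wraps it in a lambda whose shape parameters are explicitly `const std::vector<int>&`. Judging by the commit title (an inference, not stated in the diff), the updated SDK adds `std::vector<int64_t>` overloads to these APIs, so the bare function name becomes an overload set that can no longer be deduced as a template argument; the lambda pins one overload. A minimal, self-contained sketch of that failure mode, with hypothetical stand-ins (`broadcast_add`, `RunElementwise`) for the XDNN and phi helpers:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for the XDNN API: suppose the updated SDK ships both
// an int and an int64_t shape-vector overload of each broadcast op.
template <typename T>
int broadcast_add(const T* x, const T* y, T* z,
                  const std::vector<int>& xshape,
                  const std::vector<int>& yshape) {
  z[0] = x[0] + y[0];  // toy body; the real op broadcasts over the shapes
  return 0;            // XDNN-style success code
}

template <typename T>
int broadcast_add(const T* x, const T* y, T* z,
                  const std::vector<int64_t>& xshape,
                  const std::vector<int64_t>& yshape) {
  z[0] = x[0] + y[0];
  return 0;
}

// A caller that accepts the op as a generic callable, the way phi's
// XPUElementwise helper does.
template <typename T, typename F>
int RunElementwise(F func, const T* x, const T* y, T* z) {
  std::vector<int> xshape{1}, yshape{1};
  return func(x, y, z, xshape, yshape);
}

int main() {
  float x = 1.f, y = 2.f, z = 0.f;

  // RunElementwise<float>(broadcast_add<float>, &x, &y, &z);
  // ^ fails to compile once the second overload exists: broadcast_add<float>
  //   names an overload set, so F cannot be deduced.

  // The lambda from the diff pins the std::vector<int> overload:
  auto f = [](const float* a, const float* b, float* c,
              const std::vector<int>& xs, const std::vector<int>& ys) {
    return broadcast_add<float>(a, b, c, xs, ys);
  };
  int r = RunElementwise<float>(f, &x, &y, &z);
  std::cout << "ret=" << r << ", z=" << z << std::endl;  // ret=0, z=3
  return 0;
}
```

Under that assumption the lambdas are behavior-preserving: each kernel still calls the `int`-shape overload it called before the SDK bump.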
......@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
if(NOT DEFINED XPU_BASE_URL)
set(XPU_BASE_URL_WITHOUT_DATE
"https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev")
-set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221215")
+set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221227")
else()
set(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
......
......@@ -52,22 +52,30 @@ void XPUCompareKernelImpl(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "compare op");
}
-#define DEFINE_XPU_COMPARE_KERNEL(name, functor) \
-template <typename T, typename Context> \
-void name##RawKernel(const Context& dev_ctx, \
-const DenseTensor& x, \
-const DenseTensor& y, \
-int axis, \
-DenseTensor* out) { \
-using XPUType = typename XPUTypeTrait<T>::Type; \
-XPUCompareKernelImpl<T, XPUType, Context>(dev_ctx, x, y, out, functor); \
-} \
-template <typename T, typename Context> \
-void name##Kernel(const Context& dev_ctx, \
-const DenseTensor& x, \
-const DenseTensor& y, \
-DenseTensor* out) { \
-name##RawKernel<T, Context>(dev_ctx, x, y, -1, out); \
+#define DEFINE_XPU_COMPARE_KERNEL(name, functor) \
+template <typename T, typename Context> \
+void name##RawKernel(const Context& dev_ctx, \
+const DenseTensor& x, \
+const DenseTensor& y, \
+int axis, \
+DenseTensor* out) { \
+using XPUType = typename XPUTypeTrait<T>::Type; \
+auto f = [](xpu::Context* ctx, \
+const XPUType* x, \
+const XPUType* y, \
+bool* z, \
+const std::vector<int>& xshape, \
+const std::vector<int>& yshape) { \
+return functor(ctx, x, y, z, xshape, yshape); \
+}; \
+XPUCompareKernelImpl<T, XPUType, Context>(dev_ctx, x, y, out, f); \
+} \
+template <typename T, typename Context> \
+void name##Kernel(const Context& dev_ctx, \
+const DenseTensor& x, \
+const DenseTensor& y, \
+DenseTensor* out) { \
+name##RawKernel<T, Context>(dev_ctx, x, y, -1, out); \
}
DEFINE_XPU_COMPARE_KERNEL(Equal, xpu::broadcast_equal<XPUType>)
......
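For the comparison kernels the same wrapper is generated by a macro, and the output pointer is `bool*` rather than `XPUType*`. A self-contained miniature of the `DEFINE_XPU_COMPARE_KERNEL` idea (stand-in types and functors only; the real macro targets phi `DenseTensor`s and `xpu::Context`):

```cpp
#include <iostream>

// Stand-in comparison ops: note the bool* output, unlike the arithmetic ops.
template <typename T>
int broadcast_equal(const T* x, const T* y, bool* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = (x[i] == y[i]);
  return 0;
}
template <typename T>
int broadcast_less_than(const T* x, const T* y, bool* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = (x[i] < y[i]);
  return 0;
}

// Shared impl every generated kernel forwards to, as XPUCompareKernelImpl
// does in the diff.
template <typename T, typename F>
void CompareKernelImpl(const T* x, const T* y, bool* z, int n, F f) {
  f(x, y, z, n);
}

#define DEFINE_MINI_COMPARE_KERNEL(name, functor)             \
  template <typename T>                                       \
  void name##Kernel(const T* x, const T* y, bool* z, int n) { \
    auto f = [](const T* a, const T* b, bool* c, int m) {     \
      return functor(a, b, c, m);                             \
    };                                                        \
    CompareKernelImpl<T>(x, y, z, n, f);                      \
  }

DEFINE_MINI_COMPARE_KERNEL(Equal, broadcast_equal<T>)
DEFINE_MINI_COMPARE_KERNEL(LessThan, broadcast_less_than<T>)

int main() {
  float x[2] = {1.f, 2.f}, y[2] = {1.f, 3.f};
  bool z[2] = {false, false};
  EqualKernel(x, y, z, 2);
  std::cout << std::boolalpha << z[0] << ' ' << z[1] << '\n';  // true false
  LessThanKernel(x, y, z, 2);
  std::cout << std::boolalpha << z[0] << ' ' << z[1] << '\n';  // false true
  return 0;
}
```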
......@@ -54,8 +54,17 @@ void AddRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_add<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_add<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
} // namespace phi
......
......@@ -35,15 +35,21 @@ void DivideGradKernel(const Context& dev_ctx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
funcs::ElementwiseGradPreProcess(dout, dx);
-XPUElementwiseGrad<T, XPUType>(dev_ctx,
-x,
-y,
-dout,
-axis,
-dx,
-dy,
-xpu::broadcast_div_grad<XPUType>,
-true);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+const XPUType* z,
+const XPUType* dz,
+XPUType* dy,
+XPUType* dx,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_div_grad<XPUType>(
+ctx, x, y, z, dz, dy, dx, xshape, yshape);
+};
+XPUElementwiseGrad<T, XPUType>(dev_ctx, x, y, dout, axis, dx, dy, f, true);
}
} // namespace phi
......
......@@ -31,8 +31,16 @@ void DivideRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_div<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_div<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
} // namespace phi
......
......@@ -29,15 +29,21 @@ void MaximumGradKernel(const Context& dev_ctx,
DenseTensor* dx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwiseGrad<T, XPUType>(dev_ctx,
-x,
-y,
-dout,
-axis,
-dx,
-dy,
-xpu::broadcast_max_grad<XPUType>,
-true);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+const XPUType* z,
+const XPUType* dz,
+XPUType* dy,
+XPUType* dx,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_max_grad<XPUType>(
+ctx, x, y, z, dz, dy, dx, xshape, yshape);
+};
+XPUElementwiseGrad<T, XPUType>(dev_ctx, x, y, dout, axis, dx, dy, f, true);
}
template <typename T, typename Context>
......@@ -49,15 +55,21 @@ void MinimumGradKernel(const Context& dev_ctx,
DenseTensor* dx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwiseGrad<T, XPUType>(dev_ctx,
-x,
-y,
-dout,
-axis,
-dx,
-dy,
-xpu::broadcast_min_grad<XPUType>,
-true);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+const XPUType* z,
+const XPUType* dz,
+XPUType* dy,
+XPUType* dx,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_min_grad<XPUType>(
+ctx, x, y, z, dz, dy, dx, xshape, yshape);
+};
+XPUElementwiseGrad<T, XPUType>(dev_ctx, x, y, dout, axis, dx, dy, f, true);
}
} // namespace phi
......
......@@ -27,8 +27,16 @@ void FloorDivideRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_floordiv<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_floordiv<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
template <typename T, typename Context>
......@@ -38,8 +46,16 @@ void MaximumRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_max<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_max<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
template <typename T, typename Context>
......@@ -49,8 +65,16 @@ void MinimumRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_min<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_min<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
template <typename T, typename Context>
......@@ -60,8 +84,16 @@ void RemainderRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_mod<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_mod<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
template <typename T, typename Context>
......@@ -71,8 +103,16 @@ void ElementwisePowRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_pow<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_pow<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
} // namespace phi
......
......@@ -34,15 +34,20 @@ void MultiplyGradKernel(const Context& dev_ctx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
funcs::ElementwiseGradPreProcess(dout, dx);
-XPUElementwiseGrad<T, XPUType>(dev_ctx,
-x,
-y,
-dout,
-axis,
-dx,
-dy,
-xpu::broadcast_mul_grad<XPUType>,
-true);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+const XPUType* z,
+const XPUType* dz,
+XPUType* dy,
+XPUType* dx,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_mul_grad<XPUType>(
+ctx, x, y, z, dz, dy, dx, xshape, yshape);
+};
+XPUElementwiseGrad<T, XPUType>(dev_ctx, x, y, dout, axis, dx, dy, f, true);
}
} // namespace phi
......
......@@ -31,8 +31,16 @@ void MultiplyRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_mul<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_mul<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
} // namespace phi
......
......@@ -28,15 +28,22 @@ void SubtractGradKernel(const Context& dev_ctx,
DenseTensor* dx,
DenseTensor* dy) {
using XPUType = typename XPUTypeTrait<T>::Type;
-phi::XPUElementwiseGrad<T, XPUType>(dev_ctx,
-x,
-y,
-dout,
-axis,
-dx,
-dy,
-xpu::broadcast_sub_grad<XPUType>,
-false);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+const XPUType* z,
+const XPUType* dz,
+XPUType* dy,
+XPUType* dx,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_sub_grad<XPUType>(
+ctx, x, y, z, dz, dy, dx, xshape, yshape);
+};
+phi::XPUElementwiseGrad<T, XPUType>(
+dev_ctx, x, y, dout, axis, dx, dy, f, false);
}
} // namespace phi
......
......@@ -26,8 +26,16 @@ void SubtractRawKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
-phi::XPUElementwise<T, XPUType>(
-dev_ctx, x, y, axis, out, xpu::broadcast_sub<XPUType>);
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+const XPUType* y,
+XPUType* z,
+const std::vector<int>& xshape,
+const std::vector<int>& yshape) {
+return xpu::broadcast_sub<XPUType>(ctx, x, y, z, xshape, yshape);
+};
+phi::XPUElementwise<T, XPUType>(dev_ctx, x, y, axis, out, f);
}
} // namespace phi
......
......@@ -29,13 +29,18 @@ void ProdRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
-int r = XPUReduce<Context, T>(dev_ctx,
-x,
-dims.GetData(),
-keep_dim,
-reduce_all,
-out,
-xpu::reduce_prod<T>);
+using XPUType = typename XPUTypeTrait<T>::Type;
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+XPUType* y,
+const std::vector<int>& xdims,
+const std::vector<int>& reduce_dims) {
+return xpu::reduce_prod<XPUType>(ctx, x, y, xdims, reduce_dims);
+};
+int r = XPUReduce<Context, T>(
+dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_prod");
}
......
......@@ -29,13 +29,17 @@ void MaxRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
-int r = XPUReduce<Context, T>(dev_ctx,
-x,
-dims.GetData(),
-keep_dim,
-reduce_all,
-out,
-xpu::reduce_max<T>);
+using XPUType = typename XPUTypeTrait<T>::Type;
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+XPUType* y,
+const std::vector<int>& xdims,
+const std::vector<int>& reduce_dims) {
+return xpu::reduce_max<XPUType>(ctx, x, y, xdims, reduce_dims);
+};
+int r = XPUReduce<Context, XPUType>(
+dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max");
}
......
......@@ -29,13 +29,18 @@ void MeanRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
-int r = XPUReduce<Context, T>(dev_ctx,
-x,
-dims.GetData(),
-keep_dim,
-reduce_all,
-out,
-xpu::reduce_mean<T>);
+using XPUType = typename XPUTypeTrait<T>::Type;
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+XPUType* y,
+const std::vector<int>& xdims,
+const std::vector<int>& reduce_dims) {
+return xpu::reduce_mean<XPUType>(ctx, x, y, xdims, reduce_dims);
+};
+int r = XPUReduce<Context, XPUType>(
+dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_mean");
}
......
......@@ -29,13 +29,18 @@ void MinRawKernel(const Context& dev_ctx,
bool reduce_all,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
-int r = XPUReduce<Context, T>(dev_ctx,
-x,
-dims.GetData(),
-keep_dim,
-reduce_all,
-out,
-xpu::reduce_min<T>);
+using XPUType = typename XPUTypeTrait<T>::Type;
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+XPUType* y,
+const std::vector<int>& xdims,
+const std::vector<int>& reduce_dims) {
+return xpu::reduce_min<XPUType>(ctx, x, y, xdims, reduce_dims);
+};
+int r = XPUReduce<Context, XPUType>(
+dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min");
}
......
......@@ -30,13 +30,17 @@ void SumRawKernel(const Context& dev_ctx,
DataType out_dtype,
DenseTensor* out) {
reduce_all = recompute_reduce_all(x, dims, reduce_all);
-int r = XPUReduce<Context, T>(dev_ctx,
-x,
-dims.GetData(),
-keep_dim,
-reduce_all,
-out,
-xpu::reduce_sum<T>);
+using XPUType = typename XPUTypeTrait<T>::Type;
+auto f = [](xpu::Context* ctx,
+const XPUType* x,
+XPUType* y,
+const std::vector<int>& xdims,
+const std::vector<int>& reduce_dims) {
+return xpu::reduce_sum<XPUType>(ctx, x, y, xdims, reduce_dims);
+};
+int r = XPUReduce<Context, XPUType>(
+dev_ctx, x, dims.GetData(), keep_dim, reduce_all, out, f);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
}
......
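One detail worth noting in the reduce hunks: besides adding the lambda, they switch the functor instantiation from `xpu::reduce_*<T>` to `xpu::reduce_*<XPUType>`, and most call sites from `XPUReduce<Context, T>` to `XPUReduce<Context, XPUType>`. A plausible reason (an inference; the diff does not spell it out) is that phi's host dtypes such as `phi::dtype::float16` are not the types the XDNN kernels are instantiated for, and `XPUTypeTrait` supplies the mapping. A hedged sketch with stand-in types (`phi_float16`, `xdnn_float16`, `reduce_sum` are all hypothetical):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-ins: phi_float16 plays phi::dtype::float16 (the host dtype) and
// xdnn_float16 plays the device half type the XDNN kernels are built for.
struct phi_float16 { uint16_t bits; };
struct xdnn_float16 { uint16_t bits; };

template <typename T>
struct XPUTypeTrait { using Type = T; };  // most dtypes map to themselves

template <>
struct XPUTypeTrait<phi_float16> { using Type = xdnn_float16; };

// Stand-in for xpu::reduce_sum<XPUType>.
template <typename T>
int reduce_sum(const T* x, T* y, const std::vector<int>& xdims,
               const std::vector<int>& reduce_dims) {
  return 0;  // XDNN-style success code
}

template <typename T>
void SumRawKernel(const T* x, T* y) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  // Building the lambda (and the instantiation) from XPUType, as the diff
  // does, keeps phi-only dtypes out of the device-side template.
  auto f = [](const XPUType* a, XPUType* b, const std::vector<int>& xdims,
              const std::vector<int>& reduce_dims) {
    return reduce_sum<XPUType>(a, b, xdims, reduce_dims);
  };
  int r = f(reinterpret_cast<const XPUType*>(x),
            reinterpret_cast<XPUType*>(y),
            std::vector<int>{1}, std::vector<int>{0});
  std::cout << "ret=" << r << std::endl;
}

int main() {
  phi_float16 x{0x3c00}, y{0};  // 0x3c00 is 1.0 in IEEE half
  SumRawKernel(&x, &y);         // the functor runs on xdnn_float16
  return 0;
}
```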