Unverified commit 04f56338, authored by ykkk2333 and committed by GitHub

add xpu tile and concat kernel int64, test=kunlun (#51349)

Parent 615fc429
@@ -140,6 +140,7 @@ XPUOpMap& get_kl2_ops() {
{"concat",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
phi::DataType::FLOAT64,
phi::DataType::INT64,
phi::DataType::INT32})},
{"conv2d_grad",
@@ -730,6 +731,7 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({phi::DataType::INT32,
phi::DataType::INT64,
phi::DataType::BOOL,
phi::DataType::FLOAT64,
phi::DataType::FLOAT32})},
{"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"transpose2_grad",
@@ -116,6 +116,7 @@ PD_REGISTER_KERNEL(concat,
ALL_LAYOUT,
phi::ConcatKernel,
float,
double,
phi::dtype::float16,
int64_t,
int) {}
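Why the one-line dtype additions above are enough: registration is what makes a dtype dispatchable at runtime, and the kernel itself is already templated. The sketch below illustrates the idea with a hypothetical dtype-to-kernel map (concat_kernels and ConcatKernelStub are made-up names for exposition, not phi's real registration machinery): until double is listed, a float64 tensor simply has no kernel to dispatch to.

#include <cstdio>
#include <functional>
#include <map>
#include <string>

// dtype name -> type-erased kernel entry point.
std::map<std::string, std::function<void()>> concat_kernels;

template <typename T>
void ConcatKernelStub() {
  std::printf("running concat for a %zu-byte element type\n", sizeof(T));
}

int main() {
  // Roughly what the registration list expands to: one entry per dtype.
  concat_kernels["float32"] = ConcatKernelStub<float>;
  concat_kernels["int64"] = ConcatKernelStub<long long>;
  // The one-line diff above is morally this extra entry:
  concat_kernels["float64"] = ConcatKernelStub<double>;

  // Dispatch for a float64 tensor now finds a kernel instead of failing.
  auto it = concat_kernels.find("float64");
  if (it != concat_kernels.end()) {
    it->second();
  } else {
    std::printf("no concat kernel registered for float64\n");
  }
}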
@@ -112,6 +112,33 @@ void Pool2dGradKernel(const Context& ctx,
true);
} else if (pooling_type == "avg") {
// When the output dim is 1 * 1 (1 * 1 * 1 in pool3d), use the scale
// and broadcast kernels to get the same output with better performance.
// Since this dim only occurs in particular models, the path is opt-in:
// enable it with 'export XPU_POOLING_GRAD_SPECIAL=1'.
if (out_h == 1 && out_w == 1 && std::is_same<T, float>::value &&
std::getenv("XPU_POOLING_GRAD_SPECIAL") != nullptr) {
xpu::ctx_guard RAII_GUARD(ctx.x_context());
float scale = 1.0 / (in_h * in_w);
float* scaled_dy = RAII_GUARD.alloc_l3_or_gm<float>(n * c);
r = xpu::scale(ctx.x_context(),
dout.data<float>(),
scaled_dy,
n * c,
true,
scale,
0.0);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
r = xpu::broadcast(ctx.x_context(),
scaled_dy,
dx->data<float>(),
{n, c, 1, 1},
{n, c, in_h, in_w});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
return;
}
r = xpu::adaptive_avg_pool2d_grad<XPUType>(
ctx.x_context(),
reinterpret_cast<const XPUType*>(dout.data<T>()),
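The fast path above rests on a simple identity: when average pooling reduces the whole spatial extent to a single output element, every input element receives gradient dout / (in_h * in_w), so scaling dout once and broadcasting it over {in_h, in_w} reproduces the full gradient. The 3-D variant below is the same trick with divisor in_d * in_h * in_w. A minimal CPU sketch of that check, with illustrative shapes rather than the XPU kernel itself:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int N = 2, C = 3, H = 4, W = 5;
  std::vector<float> dout(N * C);
  for (int i = 0; i < N * C; ++i) dout[i] = 0.1f * (i + 1);

  // Step 1: mirror xpu::scale(dout, scaled_dy, n * c, ..., scale, 0.0).
  const float scale = 1.0f / (H * W);
  std::vector<float> scaled_dy(N * C);
  for (int i = 0; i < N * C; ++i) scaled_dy[i] = dout[i] * scale;

  // Step 2: mirror xpu::broadcast from {N, C, 1, 1} to {N, C, H, W}.
  std::vector<float> dx(N * C * H * W);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          dx[((n * C + c) * H + h) * W + w] = scaled_dy[n * C + c];

  // Compare against the direct definition of the average-pool gradient.
  bool ok = true;
  for (int i = 0; i < N * C * H * W; ++i) {
    float direct = dout[i / (H * W)] / (H * W);
    ok = ok && std::fabs(dx[i] - direct) <= 1e-7f;
  }
  std::printf("fast path matches direct gradient: %s\n", ok ? "yes" : "no");
}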
@@ -267,6 +294,31 @@ void Pool3dGradKernel(const Context& ctx,
!channel_last);
} else if (pooling_type == "avg") {
if (out_d == 1 && out_h == 1 && out_w == 1 &&
std::is_same<T, float>::value &&
std::getenv("XPU_POOLING_GRAD_SPECIAL") != nullptr) {
xpu::ctx_guard RAII_GUARD(ctx.x_context());
float scale = 1.0 / (in_d * in_h * in_w);
float* scaled_dy = RAII_GUARD.alloc_l3_or_gm<float>(n * c);
r = xpu::scale(ctx.x_context(),
dout.data<float>(),
scaled_dy,
n * c,
true,
scale,
0.0);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
r = xpu::broadcast(ctx.x_context(),
scaled_dy,
dx->data<float>(),
{n, c, 1, 1, 1},
{n, c, in_d, in_h, in_w});
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
return;
}
r = xpu::adaptive_avg_pool3d_grad<XPUType>(
ctx.x_context(),
reinterpret_cast<const XPUType*>(dout.data<T>()),
@@ -29,6 +29,7 @@ void TileKernel(const Context& dev_ctx,
const DenseTensor& x,
const IntArray& repeat_times_arr,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto rank = x.dims().size();
PADDLE_ENFORCE_GE(
rank,
@@ -104,12 +105,21 @@ void TileKernel(const Context& dev_ctx,
if (repeat_times == temp) {
out->Resize(x.dims());
dev_ctx.template Alloc<T>(out);
int r =
xpu::copy(dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
if (std::is_same<T, double>::value) {
int r = xpu::copy(dev_ctx.x_context(),
reinterpret_cast<const int8_t*>(x.data<double>()),
reinterpret_cast<int8_t*>(out->data<double>()),
8 * x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
} else {
int r = xpu::copy(
dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
}
return;
}
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
int ret = XPU_SUCCESS;
if (std::is_same<T, bool>::value) {
ret = xpu::broadcast<int8_t>(dev_ctx.x_context(),
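The same-shape early-out above handles double separately, presumably because xpu::copy is not instantiated for double: it reinterprets both buffers as int8_t and copies 8 * numel bytes, 8 being sizeof(double). A CPU sketch of the trick, where copy_i8 is a hypothetical stand-in for the XPU byte-copy primitive:

#include <cstdint>
#include <cstdio>
#include <vector>

void copy_i8(const int8_t* src, int8_t* dst, int64_t n) {
  for (int64_t i = 0; i < n; ++i) dst[i] = src[i];  // bit-exact byte copy
}

int main() {
  std::vector<double> x = {1.5, -2.25, 3.0e100};
  std::vector<double> out(x.size());
  copy_i8(reinterpret_cast<const int8_t*>(x.data()),
          reinterpret_cast<int8_t*>(out.data()),
          static_cast<int64_t>(sizeof(double) * x.size()));  // the "8 * numel"
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // values survive exactly
}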
@@ -118,6 +128,24 @@ void TileKernel(const Context& dev_ctx,
vec_in_dims,
vec_out_dims);
} else if (std::is_same<T, double>::value) {
float* x_t = RAII_GUARD.alloc_l3_or_gm<float>(x.numel());
float* y_t = RAII_GUARD.alloc_l3_or_gm<float>(out->numel());
int r =
xpu::cast<XPUType, float>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
x_t,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
ret = xpu::broadcast<float>(
dev_ctx.x_context(), x_t, y_t, vec_in_dims, vec_out_dims);
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast");
r = xpu::cast<float, XPUType>(dev_ctx.x_context(),
y_t,
reinterpret_cast<XPUType*>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
ret = xpu::broadcast<T>(dev_ctx.x_context(),
x.data<T>(),
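For the general broadcast, the double path instead stages through float: cast down, broadcast in float, cast back up. Unlike the byte copy above, this roundtrip is not bit-exact, since any double needing more than float's 24-bit significand gets rounded. A small sketch of that caveat, with plain loops standing in for xpu::cast and xpu::broadcast:

#include <cstdio>
#include <vector>

int main() {
  double x = 0.1;  // not exactly representable; rounds again in float
  float x_t = static_cast<float>(x);       // xpu::cast<double, float>
  std::vector<float> y_t(3, x_t);          // xpu::broadcast<float> (1 -> 3)
  std::vector<double> out(3);
  for (int i = 0; i < 3; ++i)
    out[i] = static_cast<double>(y_t[i]);  // xpu::cast<float, double>
  std::printf("in  = %.17g\nout = %.17g\n", x, out[0]);
  // The low bits differ: this fallback trades exactness for dtype
  // coverage, unlike the bit-exact byte copy in the same-shape path.
}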
@@ -131,4 +159,5 @@ void TileKernel(const Context& dev_ctx,
} // namespace phi
PD_REGISTER_KERNEL(
tile, XPU, ALL_LAYOUT, phi::TileKernel, bool, float, int, int64_t) {}
tile, XPU, ALL_LAYOUT, phi::TileKernel, bool, float, double, int, int64_t) {
}