From 04f56338fe26cab17722aeee805dafecd1e37bd5 Mon Sep 17 00:00:00 2001
From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com>
Date: Fri, 10 Mar 2023 10:44:36 +0800
Subject: [PATCH] add xpu tile and concat kernel int64, test=kunlun (#51349)

---
 paddle/phi/backends/xpu/xpu2_op_list.cc    |  2 +
 paddle/phi/kernels/xpu/concat_kernel.cc    |  1 +
 paddle/phi/kernels/xpu/pool_grad_kernel.cc | 52 ++++++++++++++++++++++
 paddle/phi/kernels/xpu/tile_kernel.cc      | 37 +++++++++++++++++++++++++++++++++----
 4 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index d0814e79444..77bff35611a 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -140,6 +140,7 @@ XPUOpMap& get_kl2_ops() {
     {"concat",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::FLOAT16,
+                   phi::DataType::FLOAT64,
                    phi::DataType::INT64,
                    phi::DataType::INT32})},
     {"conv2d_grad",
@@ -730,6 +731,7 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({phi::DataType::INT32,
                    phi::DataType::INT64,
                    phi::DataType::BOOL,
+                   phi::DataType::FLOAT64,
                    phi::DataType::FLOAT32})},
     {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})},
     {"transpose2_grad",
diff --git a/paddle/phi/kernels/xpu/concat_kernel.cc b/paddle/phi/kernels/xpu/concat_kernel.cc
index 0bd180b692b..f1fac997061 100644
--- a/paddle/phi/kernels/xpu/concat_kernel.cc
+++ b/paddle/phi/kernels/xpu/concat_kernel.cc
@@ -116,6 +116,7 @@ PD_REGISTER_KERNEL(concat,
                    ALL_LAYOUT,
                    phi::ConcatKernel,
                    float,
+                   double,
                    phi::dtype::float16,
                    int64_t,
                    int) {}
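These registrations only declare the new dtypes; the kernel changes below do the actual work. A minimal usage sketch of the newly registered float64 concat path (assumptions: a PaddlePaddle build with XPU/KL2 support and an XPU device visible to the runtime; the shapes and values are illustrative):

    import paddle

    # Assumes an XPU (Kunlun) build of PaddlePaddle with a device available.
    paddle.set_device("xpu")

    a = paddle.ones([2, 3], dtype="float64")
    b = paddle.zeros([2, 3], dtype="float64")

    # With the FLOAT64 entry registered above, this dispatches to
    # phi::ConcatKernel<double> on the XPU backend.
    out = paddle.concat([a, b], axis=0)
    print(out.shape)  # [4, 3]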
diff --git a/paddle/phi/kernels/xpu/pool_grad_kernel.cc b/paddle/phi/kernels/xpu/pool_grad_kernel.cc
index 5be41599429..6f937b93e19 100644
--- a/paddle/phi/kernels/xpu/pool_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/pool_grad_kernel.cc
@@ -112,6 +112,33 @@ void Pool2dGradKernel(const Context& ctx,
           true);
 
   } else if (pooling_type == "avg") {
+    // When the output dim is 1 * 1 (1 * 1 * 1 in pool_3d), use the scale
+    // and broadcast kernels to get the same output with better performance.
+    // Since this dim only occurs in particular models, use
+    // 'export XPU_POOLING_GRAD_SPECIAL=1' to enable this path.
+    if (out_h == 1 && out_w == 1 && std::is_same<T, float>::value &&
+        std::getenv("XPU_POOLING_GRAD_SPECIAL") != nullptr) {
+      xpu::ctx_guard RAII_GUARD(ctx.x_context());
+      float scale = 1.0 / (in_h * in_w);
+      float* scaled_dy = RAII_GUARD.alloc_l3_or_gm<float>(n * c);
+      r = xpu::scale(ctx.x_context(),
+                     dout.data<float>(),
+                     scaled_dy,
+                     n * c,
+                     true,
+                     scale,
+                     0.0);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+
+      r = xpu::broadcast<float>(ctx.x_context(),
+                                scaled_dy,
+                                dx->data<float>(),
+                                {n, c, 1, 1},
+                                {n, c, in_h, in_w});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+
+      return;
+    }
     r = xpu::adaptive_avg_pool2d_grad(
         ctx.x_context(),
         reinterpret_cast<const XPUType*>(dout.data<T>()),
@@ -267,6 +294,31 @@ void Pool3dGradKernel(const Context& ctx,
           !channel_last);
 
   } else if (pooling_type == "avg") {
+    if (out_d == 1 && out_h == 1 && out_w == 1 &&
+        std::is_same<T, float>::value &&
+        std::getenv("XPU_POOLING_GRAD_SPECIAL") != nullptr) {
+      xpu::ctx_guard RAII_GUARD(ctx.x_context());
+      float scale = 1.0 / (in_d * in_h * in_w);
+      float* scaled_dy = RAII_GUARD.alloc_l3_or_gm<float>(n * c);
+      r = xpu::scale(ctx.x_context(),
+                     dout.data<float>(),
+                     scaled_dy,
+                     n * c,
+                     true,
+                     scale,
+                     0.0);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+
+      r = xpu::broadcast<float>(ctx.x_context(),
+                                scaled_dy,
+                                dx->data<float>(),
+                                {n, c, 1, 1, 1},
+                                {n, c, in_d, in_h, in_w});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast");
+
+      return;
+    }
+
     r = xpu::adaptive_avg_pool3d_grad(
         ctx.x_context(),
         reinterpret_cast<const XPUType*>(dout.data<T>()),
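When the pooled output is 1 x 1, every input position contributes equally to the output, so the backward pass collapses to dx[n, c, h, w] = dy[n, c, 0, 0] / (in_h * in_w): one xpu::scale over the n * c gradient values followed by one xpu::broadcast back to the input shape, which is cheaper than the general adaptive pooling gradient. A hedged sketch of a workload that hits this gate (assuming an XPU build; the env var must be exported before launch, and the script name is illustrative):

    # Run as: XPU_POOLING_GRAD_SPECIAL=1 python pool_grad_demo.py
    import paddle
    import paddle.nn.functional as F

    paddle.set_device("xpu")

    x = paddle.randn([4, 8, 32, 32], dtype="float32")
    x.stop_gradient = False

    # Global average pooling: out_h == out_w == 1, the shape the gate targets.
    y = F.adaptive_avg_pool2d(x, output_size=1)
    y.sum().backward()

    # Since d(sum(y))/dy == 1 everywhere, x.grad is the constant 1 / (32 * 32).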
diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc
index b9383f108eb..419ff72e640 100644
--- a/paddle/phi/kernels/xpu/tile_kernel.cc
+++ b/paddle/phi/kernels/xpu/tile_kernel.cc
@@ -29,6 +29,7 @@ void TileKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const IntArray& repeat_times_arr,
                 DenseTensor* out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   auto rank = x.dims().size();
   PADDLE_ENFORCE_GE(
       rank,
@@ -104,12 +105,21 @@ void TileKernel(const Context& dev_ctx,
   if (repeat_times == temp) {
     out->Resize(x.dims());
     dev_ctx.template Alloc<T>(out);
-    int r =
-        xpu::copy(dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+    if (std::is_same<T, double>::value) {
+      int r = xpu::copy(dev_ctx.x_context(),
+                        reinterpret_cast<const int8_t*>(x.data<T>()),
+                        reinterpret_cast<int8_t*>(out->data<T>()),
+                        8 * x.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+    } else {
+      int r = xpu::copy(
+          dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+    }
     return;
   }
 
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
   int ret = XPU_SUCCESS;
   if (std::is_same<T, bool>::value) {
     ret = xpu::broadcast<int8_t>(dev_ctx.x_context(),
@@ -118,16 +128,35 @@ void TileKernel(const Context& dev_ctx,
                                  vec_in_dims,
                                  vec_out_dims);
+  } else if (std::is_same<T, double>::value) {
+    float* x_t = RAII_GUARD.alloc_l3_or_gm<float>(x.numel());
+    float* y_t = RAII_GUARD.alloc_l3_or_gm<float>(out->numel());
+    int r =
+        xpu::cast<XPUType, float>(dev_ctx.x_context(),
+                                  reinterpret_cast<const XPUType*>(x.data<T>()),
+                                  x_t,
+                                  x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    ret = xpu::broadcast<float>(
+        dev_ctx.x_context(), x_t, y_t, vec_in_dims, vec_out_dims);
+    PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast");
+    r = xpu::cast<float, XPUType>(dev_ctx.x_context(),
+                                  y_t,
+                                  reinterpret_cast<XPUType*>(out->data<T>()),
+                                  out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+
   } else {
     ret = xpu::broadcast<XPUType>(dev_ctx.x_context(),
                                   reinterpret_cast<const XPUType*>(x.data<T>()),
                                   reinterpret_cast<XPUType*>(out->data<T>()),
                                   vec_in_dims,
                                   vec_out_dims);
   }
   PADDLE_ENFORCE_XDNN_SUCCESS(ret, "broadcast");
 }
 
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    tile, XPU, ALL_LAYOUT, phi::TileKernel, bool, float, int, int64_t) {}
+    tile, XPU, ALL_LAYOUT, phi::TileKernel, bool, float, double, int, int64_t) {
+}
--
GitLab
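A hedged sketch of the new float64 tile path (same XPU-build assumption as above). Per the hunks above, when repeat_times is all ones the kernel copies the double tensor bitwise as 8 * numel() int8 elements; otherwise it casts to float32, broadcasts, and casts back, so float64 values beyond float32 precision can lose accuracy on this backend:

    import paddle

    paddle.set_device("xpu")

    x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]], dtype="float64")

    # Dispatches to the double branch: cast -> broadcast -> cast back.
    out = paddle.tile(x, repeat_times=[2, 3])
    print(out.shape)  # [4, 6]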