From 61fe2198185bb92e43eb846617b5faf5d0e40eb9 Mon Sep 17 00:00:00 2001 From: Zero Rains Date: Mon, 10 Apr 2023 11:25:20 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204=20No.36?= =?UTF-8?q?=E3=80=91=E4=B8=BA=20Paddle=20=E4=BC=98=E5=8C=96=20tile=20op=20?= =?UTF-8?q?=E5=9C=A8=20GPU=20=E4=B8=8A=E7=9A=84=E8=AE=A1=E7=AE=97=E6=80=A7?= =?UTF-8?q?=E8=83=BD=20(#52482)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix divide zero bug for softmax_with_cross_entropy * change the single test way * can run but slow. the most important thing is that I do not know why it is slow * remove some useless comments * change the copyright to be correct * remove some useless changes * if repeat_times == 1, we will not use BroadcastKernel --- paddle/phi/kernels/gpu/tile_kernel.cu | 84 ++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index ba598862f59..be825eea499 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -13,10 +13,90 @@ // limitations under the License.
#include "paddle/phi/kernels/tile_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"

namespace phi {

// GPU implementation of the `tile` op: repeats tensor `x` along each
// dimension the number of times given in `repeat_times`.  Rather than one
// generic expand, the tensor is grown one dimension at a time with
// BroadcastKernel, and dimensions whose repeat factor is 1 are skipped
// entirely (the performance optimization this patch introduces).
//
// NOTE(review): this block was reconstructed from an HTML-mangled patch in
// which every template-argument list (`<...>`) was stripped.  The restored
// template arguments (`<typename T, typename Context>`,
// `BroadcastKernel<ElementwiseType::kUnary, T, T>`, `Alloc<T>`,
// `IdentityFunctor<T>`, vector element types) follow the phi API of this
// code's era — verify against the original commit before merging.
template <typename T, typename Context>
void TileKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const IntArray& repeat_times,
                DenseTensor* out) {
  auto x_dims = x.dims();
  auto rank = x_dims.size();
  auto repeat_times_data = repeat_times.GetData();
  int repeat_times_size = repeat_times_data.size();
  // The working rank is the larger of the input rank and the number of
  // repeat factors.
  rank = std::max(rank, repeat_times_size);

  // Rank 0 (0-D tensor, no repeat factors): tiling degenerates to a copy.
  if (rank == 0) {
    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
    return;
  }

  // Every repeat factor must be a positive integer.
  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
    PADDLE_ENFORCE_GT(
        repeat_times_data[i],
        0,
        errors::InvalidArgument(
            "All elements of the input 'repeat_times' for tile op must "
            "be positive integers, but the value received is %d.",
            repeat_times_data[i]));
  }

  // Promote the shorter of (x's shape, repeat_times) by padding leading 1s
  // so the two vectors have equal length.
  auto vec_x_dims = phi::vectorize(x_dims);
  if (repeat_times_data.size() < vec_x_dims.size()) {
    int diff = vec_x_dims.size() - repeat_times_data.size();
    repeat_times_data.insert(repeat_times_data.begin(), diff, 1);
  } else {
    int diff = repeat_times_data.size() - vec_x_dims.size();
    vec_x_dims.insert(vec_x_dims.begin(), diff, 1);
  }

  PADDLE_ENFORCE_EQ(
      repeat_times_data.size(),
      vec_x_dims.size(),
      errors::InvalidArgument(
          "The rank (%d) of the input 'x' and the rank (%d) of the input "
          "'repeat_times' for tile op must match after promotion.",
          vec_x_dims.size(),
          repeat_times_data.size()));

  DDim new_x_dims = make_ddim(vec_x_dims);
  DDim out_dims(new_x_dims);
  DenseTensor new_x = x;
  // Prepend a virtual leading 1 so dimension i can be repeated by
  // broadcasting over the axis in front of it.
  vec_x_dims.insert(vec_x_dims.begin(), 1, 1);
  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
    out_dims[i] *= repeat_times_data[i];
    new_x.Resize(make_ddim(vec_x_dims));
    std::vector<const DenseTensor*> ins = {&new_x};
    vec_x_dims[i] *= repeat_times_data[i];
    if (i != repeat_times_data.size() - 1) {
      // Intermediate dimension: launch a broadcast only when the repeat
      // factor differs from 1 (a factor of 1 leaves the data unchanged).
      if (repeat_times_data[i] != 1) {
        DenseTensor tmp_out;
        tmp_out.Resize(make_ddim(vec_x_dims));
        dev_ctx.template Alloc<T>(&tmp_out);
        std::vector<DenseTensor*> outs = {&tmp_out};
        phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
            dev_ctx, ins, &outs, i, kps::IdentityFunctor<T>());
        tmp_out.Resize(out_dims);
        new_x = tmp_out;
      }
      // Fold the just-processed axis into the next one so the following
      // iteration broadcasts over a flattened shape.
      vec_x_dims[i] *= vec_x_dims[i + 1];
      vec_x_dims[i + 1] = 1;
    } else {
      // Last dimension: broadcast straight into `out`, then restore the
      // final output shape.
      out->Resize(make_ddim(vec_x_dims));
      dev_ctx.template Alloc<T>(out);
      std::vector<DenseTensor*> outs = {out};
      phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
          dev_ctx, ins, &outs, i, kps::IdentityFunctor<T>());
      out->Resize(out_dims);
    }
  }
}

}  // namespace phi

// (truncated patch trailer — the kernel registration is cut off in this chunk)
PD_REGISTER_KERNEL(tile, GPU, -- GitLab