From 61fe2198185bb92e43eb846617b5faf5d0e40eb9 Mon Sep 17 00:00:00 2001 From: Zero Rains Date: Mon, 10 Apr 2023 11:25:20 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204=20No.36?= =?UTF-8?q?=E3=80=91=E4=B8=BA=20Paddle=20=E4=BC=98=E5=8C=96=20tile=20op=20?= =?UTF-8?q?=E5=9C=A8=20GPU=20=E4=B8=8A=E7=9A=84=E8=AE=A1=E7=AE=97=E6=80=A7?= =?UTF-8?q?=E8=83=BD=20(#52482)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix divide zero bug for softmax_with_cross_entropy * change the single test way * can run but slow. the most important thing is that I do not know why it is slow * remove some useless comments * change the copyright to be correct * remove some useless changes * if repeat_times == 1, we will not use BroadcastKernel --- paddle/phi/kernels/gpu/tile_kernel.cu | 84 ++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index ba598862f59..be825eea499 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -13,10 +13,90 @@ // limitations under the License.
#include "paddle/phi/kernels/tile_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"

namespace phi {

// GPU implementation of the `tile` op: repeats tensor `x` along each
// dimension the number of times given in `repeat_times`.  Rather than one
// generic expand, the tensor is grown one dimension at a time with
// BroadcastKernel, and dimensions whose repeat factor is 1 are skipped
// entirely (the performance optimization this patch introduces).
//
// NOTE(review): this block was reconstructed from an HTML-mangled patch in
// which every template-argument list (`<...>`) was stripped.  The restored
// template arguments (`<typename T, typename Context>`,
// `BroadcastKernel<ElementwiseType::kUnary, T, T>`, `Alloc<T>`,
// `IdentityFunctor<T>`, vector element types) follow the phi API of this
// code's era — verify against the original commit before merging.
template <typename T, typename Context>
void TileKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const IntArray& repeat_times,
                DenseTensor* out) {
  auto x_dims = x.dims();
  auto rank = x_dims.size();
  auto repeat_times_data = repeat_times.GetData();
  int repeat_times_size = repeat_times_data.size();
  // The working rank is the larger of the input rank and the number of
  // repeat factors.
  rank = std::max(rank, repeat_times_size);

  // Rank 0 (0-D tensor, no repeat factors): tiling degenerates to a copy.
  if (rank == 0) {
    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
    return;
  }

  // Every repeat factor must be a positive integer.
  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
    PADDLE_ENFORCE_GT(
        repeat_times_data[i],
        0,
        errors::InvalidArgument(
            "All elements of the input 'repeat_times' for tile op must "
            "be positive integers, but the value received is %d.",
            repeat_times_data[i]));
  }

  // Promote the shorter of (x's shape, repeat_times) by padding leading 1s
  // so the two vectors have equal length.
  auto vec_x_dims = phi::vectorize(x_dims);
  if (repeat_times_data.size() < vec_x_dims.size()) {
    int diff = vec_x_dims.size() - repeat_times_data.size();
    repeat_times_data.insert(repeat_times_data.begin(), diff, 1);
  } else {
    int diff = repeat_times_data.size() - vec_x_dims.size();
    vec_x_dims.insert(vec_x_dims.begin(), diff, 1);
  }

  PADDLE_ENFORCE_EQ(
      repeat_times_data.size(),
      vec_x_dims.size(),
      errors::InvalidArgument(
          "The rank (%d) of the input 'x' and the rank (%d) of the input "
          "'repeat_times' for tile op must match after promotion.",
          vec_x_dims.size(),
          repeat_times_data.size()));

  DDim new_x_dims = make_ddim(vec_x_dims);
  DDim out_dims(new_x_dims);
  DenseTensor new_x = x;
  // Prepend a virtual leading 1 so dimension i can be repeated by
  // broadcasting over the axis in front of it.
  vec_x_dims.insert(vec_x_dims.begin(), 1, 1);
  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
    out_dims[i] *= repeat_times_data[i];
    new_x.Resize(make_ddim(vec_x_dims));
    std::vector<const DenseTensor*> ins = {&new_x};
    vec_x_dims[i] *= repeat_times_data[i];
    if (i != repeat_times_data.size() - 1) {
      // Intermediate dimension: launch a broadcast only when the repeat
      // factor differs from 1 (a factor of 1 leaves the data unchanged).
      if (repeat_times_data[i] != 1) {
        DenseTensor tmp_out;
        tmp_out.Resize(make_ddim(vec_x_dims));
        dev_ctx.template Alloc<T>(&tmp_out);
        std::vector<DenseTensor*> outs = {&tmp_out};
        phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
            dev_ctx, ins, &outs, i, kps::IdentityFunctor<T>());
        tmp_out.Resize(out_dims);
        new_x = tmp_out;
      }
      // Fold the just-processed axis into the next one so the following
      // iteration broadcasts over a flattened shape.
      vec_x_dims[i] *= vec_x_dims[i + 1];
      vec_x_dims[i + 1] = 1;
    } else {
      // Last dimension: broadcast straight into `out`, then restore the
      // final output shape.
      out->Resize(make_ddim(vec_x_dims));
      dev_ctx.template Alloc<T>(out);
      std::vector<DenseTensor*> outs = {out};
      phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
          dev_ctx, ins, &outs, i, kps::IdentityFunctor<T>());
      out->Resize(out_dims);
    }
  }
}

}  // namespace phi

// (truncated patch trailer — the kernel registration is cut off in this chunk)
PD_REGISTER_KERNEL(tile, GPU, -- GitLab