migrate sigmoid with cross entropy, and tile xpu kernels to phi, test=kunlun (#45621)

65e9bd90 · ykkk2333 · GitHub · 0b9d4c56 · 0b9d4c56 · 0b9d4c56
5 changed files
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifdef PADDLE_WITH_XPU
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/device_wrapper.h"
-#include "paddle/fluid/platform/device/xpu/xpu_header.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()),
-        true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    // input and output data
-    auto* input = context.Input<Tensor>("X");
-    auto* label = context.Input<Tensor>("Label");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    // attrs
-    int ignore_index = context.Attr<int>("ignore_index");
-    bool normalize = context.Attr<bool>("normalize");
-    // allocate temp memory
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    int* hit = RAII_GUARD.alloc_l3_or_gm<int>(input->numel());
-    PADDLE_ENFORCE_NOT_NULL(
-        hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
-    int r = xpu::sigmoid_cross_entropy_with_logits(
-        dev_ctx.x_context(),
-        reinterpret_cast<const XPUType*>(input->data<T>()),
-        reinterpret_cast<const XPUType*>(label->data<T>()),
-        reinterpret_cast<XPUType*>(output->data<T>()),
-        1,
-        input->numel(),
-        hit,
-        ignore_index);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits");
-    if (normalize) {
-      int* non_zero = RAII_GUARD.alloc_l3_or_gm<int>(1);
-      PADDLE_ENFORCE_NOT_NULL(
-          non_zero,
-          platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
-      int r = xpu::nonzero_count(dev_ctx.x_context(),
-                                 reinterpret_cast<const XPUType*>(hit),
-                                 non_zero,
-                                 input->numel());
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
-      int non_zero_cpu = 0;
-      memory::Copy(platform::CPUPlace(),
-                   static_cast<void*>(&non_zero_cpu),
-                   context.GetPlace(),
-                   static_cast<void*>(non_zero),
-                   sizeof(int));
-      r = xpu::scale(dev_ctx.x_context(),
-                     reinterpret_cast<const XPUType*>(output->data<T>()),
-                     reinterpret_cast<XPUType*>(output->data<T>()),
-                     input->numel(),
-                     false,
-                     1.0f / static_cast<float>(non_zero_cpu),
-                     0.0f);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
-    }
-  }
-};
-template <typename DeviceContext, typename T>
-class SigmoidCrossEntropyWithLogitsGradXPUKernel
-    : public framework::OpKernel<T> {
-  using XPUType = typename XPUTypeTrait<T>::Type;
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_xpu_place(context.GetPlace()),
-        true,
-        platform::errors::Unavailable("This kernel only runs on XPU."));
-    // input and output data
-    auto* input = context.Input<Tensor>("X");
-    auto* label = context.Input<Tensor>("Label");
-    auto* dy = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    // attrs
-    int ignore_index = context.Attr<int>("ignore_index");
-    bool normalize = context.Attr<bool>("normalize");
-    // allocate temp memory
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    int* hit = RAII_GUARD.alloc_l3_or_gm<int>(input->numel());
-    PADDLE_ENFORCE_NOT_NULL(
-        hit, platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
-    int r = xpu::sigmoid_cross_entropy_with_logits_grad(
-        dev_ctx.x_context(),
-        reinterpret_cast<const XPUType*>(input->data<T>()),
-        reinterpret_cast<const XPUType*>(label->data<T>()),
-        reinterpret_cast<const XPUType*>(dy->data<T>()),
-        reinterpret_cast<XPUType*>(dx->data<T>()),
-        1,
-        input->numel(),
-        hit,
-        ignore_index);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits");
-    if (normalize) {
-      int* non_zero = RAII_GUARD.alloc_l3_or_gm<int>(1);
-      PADDLE_ENFORCE_NOT_NULL(
-          non_zero,
-          platform::errors::External("XPU alloc_l3_or_gm returns nullptr"));
-      int r = xpu::nonzero_count(dev_ctx.x_context(),
-                                 reinterpret_cast<const XPUType*>(hit),
-                                 non_zero,
-                                 input->numel());
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
-      int non_zero_cpu = 0;
-      memory::Copy(platform::CPUPlace(),
-                   static_cast<void*>(&non_zero_cpu),
-                   context.GetPlace(),
-                   static_cast<void*>(non_zero),
-                   sizeof(int));
-      r = xpu::scale(dev_ctx.x_context(),
-                     reinterpret_cast<const XPUType*>(dx->data<T>()),
-                     reinterpret_cast<XPUType*>(dx->data<T>()),
-                     input->numel(),
-                     false,
-                     1.0f / static_cast<float>(non_zero_cpu),
-                     0.0f);
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(sigmoid_cross_entropy_with_logits,
-                       ops::SigmoidCrossEntropyWithLogitsXPUKernel<
-                           paddle::platform::XPUDeviceContext,
-                           float>);
-REGISTER_OP_XPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                       ops::SigmoidCrossEntropyWithLogitsGradXPUKernel<
-                           paddle::platform::XPUDeviceContext,
-                           float>);
-#endif
--- a/paddle/fluid/operators/tile_op_xpu.cc
+++ b/paddle/fluid/operators/tile_op_xpu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/tile_op_functor.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-template <typename T>
-class TileXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
-    PADDLE_ENFORCE_GE(
-        rank,
-        1,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'x' for tile op must be a positive "
-            "integer, but the value received is %d.",
-            rank));
-    PADDLE_ENFORCE_LE(
-        rank,
-        MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The rank of the input 'x' for tile op "
-            "must be less than or equal to %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED,
-            rank));
-    auto repeat_times = get_repeat_times(context);
-    int repeat_times_size = repeat_times.size();
-    PADDLE_ENFORCE_GE(
-        repeat_times_size,
-        1,
-        platform::errors::InvalidArgument(
-            "The number of elements of the input 'repeat_times' for tile "
-            "op must be positive, but the value received is %d.",
-            repeat_times_size));
-    PADDLE_ENFORCE_LE(
-        repeat_times_size,
-        MAX_RANK_SUPPORTED,
-        platform::errors::InvalidArgument(
-            "The number of elements of the input 'repeat_times' for tile op "
-            "must be less than or equal to %d, but the value received is %d.",
-            MAX_RANK_SUPPORTED,
-            repeat_times_size));
-    auto* in0 = context.Input<framework::Tensor>("X");
-    auto in_dims = in0->dims();
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      PADDLE_ENFORCE_GT(
-          repeat_times[i],
-          0,
-          platform::errors::InvalidArgument(
-              "All elements of the input 'repeat_times' for tile op must "
-              "be positive integers, but the value received is %d.",
-              repeat_times[i]));
-    }
-    auto vec_in_dims = phi::vectorize<int>(in_dims);
-    if (repeat_times.size() < vec_in_dims.size()) {
-      int diff = vec_in_dims.size() - repeat_times.size();
-      repeat_times.insert(repeat_times.begin(), diff, 1);
-    } else {
-      int diff = repeat_times.size() - vec_in_dims.size();
-      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
-    }
-    PADDLE_ENFORCE_EQ(
-        repeat_times.size(),
-        vec_in_dims.size(),
-        platform::errors::InvalidArgument(
-            "The rank (%d) of the input 'x' and the rank (%d) of the input "
-            "'repeat_times' for tile op must match after promotion.",
-            vec_in_dims.size(),
-            repeat_times.size()));
-    auto* out0 = context.Output<framework::Tensor>("Out");
-    framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
-    framework::DDim out_dims(new_in_dims);
-    for (size_t i = 0; i < repeat_times.size(); ++i) {
-      out_dims[i] *= repeat_times[i];
-    }
-    auto vec_out_dims = phi::vectorize<int>(out_dims);
-    out0->Resize(out_dims);
-    out0->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx =
-        context.template device_context<paddle::platform::XPUDeviceContext>();
-    std::vector<int> temp(repeat_times.size(), 1);
-    if (repeat_times == temp) {
-      framework::TensorCopy(*in0, context.GetPlace(), dev_ctx, out0);
-      return;
-    }
-    int ret = XPU_SUCCESS;
-    if (std::is_same<T, bool>::value) {
-      ret = xpu::broadcast<int8_t>(
-          dev_ctx.x_context(),
-          reinterpret_cast<const int8_t*>(in0->data<T>()),
-          reinterpret_cast<int8_t*>(out0->data<T>()),
-          vec_in_dims,
-          vec_out_dims);
-    } else {
-      ret = xpu::broadcast<T>(dev_ctx.x_context(),
-                              in0->data<T>(),
-                              out0->data<T>(),
-                              vec_in_dims,
-                              vec_out_dims);
-    }
-    PADDLE_ENFORCE_EQ(
-        ret,
-        XPU_SUCCESS,
-        platform::errors::External("XPU tile kernel return wrong value[%d %s]",
-                                   ret,
-                                   XPUAPIErrorMsg[ret]));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(tile,
-                       ops::TileXPUKernel<bool>,
-                       ops::TileXPUKernel<int>,
-                       ops::TileXPUKernel<int64_t>,
-                       ops::TileXPUKernel<float>);
-#endif
--- a/paddle/phi/kernels/xpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <memory>
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+namespace phi {
+// XPU backward kernel for sigmoid_cross_entropy_with_logits.
+//
+// Writes the gradient with respect to `x` into `in_grad` by calling
+// xpu::sigmoid_cross_entropy_with_logits_grad on `x`, `label` and `out_grad`.
+// When `normalize` is true the gradient is additionally divided by the number
+// of non-zero entries that the XDNN call left in its scratch `hit` buffer.
+//
+// Params:
+//   dev_ctx       XPU device context providing the XDNN context and allocator.
+//   x             logits tensor; must be placed on XPU.
+//   label         label tensor, read with the same element type T as `x`.
+//   out_grad      upstream gradient, read with element type T.
+//   normalize     if true, scale the result by 1 / count as described above.
+//   ignore_index  forwarded unchanged to the XDNN call.
+//   in_grad       output tensor; allocated here for element type T.
+//
+// Raises a Paddle enforce error if `x` is not on XPU, if scratch allocation
+// returns nullptr, or if any XDNN call reports a failure.
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
+                                             const DenseTensor& x,
+                                             const DenseTensor& label,
+                                             const DenseTensor& out_grad,
+                                             bool normalize,
+                                             int ignore_index,
+                                             DenseTensor* in_grad) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  PADDLE_ENFORCE_EQ(x.place().GetType() == phi::AllocationType::XPU,
+                    true,
+                    errors::Unavailable("This kernel only runs on XPU."));
+  dev_ctx.template Alloc<T>(in_grad);
+  // Scratch buffer with one int per element of x; released together with
+  // RAII_GUARD when this function returns.
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int* hit = RAII_GUARD.alloc_l3_or_gm<int>(x.numel());
+  PADDLE_ENFORCE_NOT_NULL(
+      hit, errors::External("XPU alloc_l3_or_gm returns nullptr"));
+  int r = xpu::sigmoid_cross_entropy_with_logits_grad(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(x.data<T>()),
+      reinterpret_cast<const XPUType*>(label.data<T>()),
+      reinterpret_cast<const XPUType*>(out_grad.data<T>()),
+      reinterpret_cast<XPUType*>(in_grad->data<T>()),
+      1,
+      x.numel(),
+      hit,
+      ignore_index);
+  // Report the failure under the name of the primitive that was actually
+  // called, so it is not mistaken for a forward-kernel failure.
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits_grad");
+  if (normalize) {
+    int* non_zero = RAII_GUARD.alloc_l3_or_gm<int>(1);
+    PADDLE_ENFORCE_NOT_NULL(
+        non_zero, errors::External("XPU alloc_l3_or_gm returns nullptr"));
+    // NOTE(review): `hit` is an int buffer viewed as XPUType here; this
+    // presumably relies on nonzero_count only testing for non-zero values.
+    // Confirm against the XDNN API.
+    r = xpu::nonzero_count(dev_ctx.x_context(),
+                           reinterpret_cast<const XPUType*>(hit),
+                           non_zero,
+                           x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+    // Bring the single-int count back to the host to build the scale factor.
+    int non_zero_cpu = 0;
+    paddle::memory::Copy(CPUPlace(),
+                         static_cast<void*>(&non_zero_cpu),
+                         dev_ctx.GetPlace(),
+                         static_cast<void*>(non_zero),
+                         sizeof(int));
+    // A count of zero would make the scale factor infinite and could turn
+    // the gradient into inf/NaN, so floor the denominator.
+    // NOTE(review): the 1e-5 floor is meant to mirror the clamp used by the
+    // CPU reference kernel - confirm.
+    const float norm =
+        non_zero_cpu > 0 ? static_cast<float>(non_zero_cpu) : 1e-5f;
+    r = xpu::scale(dev_ctx.x_context(),
+                   reinterpret_cast<const XPUType*>(in_grad->data<T>()),
+                   reinterpret_cast<XPUType*>(in_grad->data<T>()),
+                   x.numel(),
+                   false,
+                   1.0f / norm,
+                   0.0f);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+  }
+}
+}  // namespace phi
+// Registers phi::SigmoidCrossEntropyWithLogitsGradKernel as the XPU
+// implementation of `sigmoid_cross_entropy_with_logits_grad` for all layouts.
+// float is the only element type registered here.
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsGradKernel,
+                   float) {}
--- a/paddle/phi/kernels/xpu/sigmoid_cross_entropy_with_logits_kernel.cc
+++ b/paddle/phi/kernels/xpu/sigmoid_cross_entropy_with_logits_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <memory>
+#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+namespace phi {
+// XPU forward kernel for sigmoid_cross_entropy_with_logits.
+//
+// Writes the loss into `out` by calling
+// xpu::sigmoid_cross_entropy_with_logits on `x` and `label`. When `normalize`
+// is true the result is additionally divided by the number of non-zero
+// entries that the XDNN call left in its scratch `hit` buffer.
+//
+// Params:
+//   dev_ctx       XPU device context providing the XDNN context and allocator.
+//   x             logits tensor; must be placed on XPU.
+//   label         label tensor, read with the same element type T as `x`.
+//   normalize     if true, scale the result by 1 / count as described above.
+//   ignore_index  forwarded unchanged to the XDNN call.
+//   out           output tensor; allocated here for element type T.
+//
+// Raises a Paddle enforce error if `x` is not on XPU, if scratch allocation
+// returns nullptr, or if any XDNN call reports a failure.
+template <typename T, typename Context>
+void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
+                                         const DenseTensor& x,
+                                         const DenseTensor& label,
+                                         bool normalize,
+                                         int ignore_index,
+                                         DenseTensor* out) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  PADDLE_ENFORCE_EQ(x.place().GetType() == phi::AllocationType::XPU,
+                    true,
+                    errors::Unavailable("This kernel only runs on XPU."));
+  dev_ctx.template Alloc<T>(out);
+  // Scratch buffer with one int per element of x; released together with
+  // RAII_GUARD when this function returns.
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int* hit = RAII_GUARD.alloc_l3_or_gm<int>(x.numel());
+  PADDLE_ENFORCE_NOT_NULL(
+      hit, errors::External("XPU alloc_l3_or_gm returns nullptr"));
+  int r = xpu::sigmoid_cross_entropy_with_logits(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(x.data<T>()),
+      reinterpret_cast<const XPUType*>(label.data<T>()),
+      reinterpret_cast<XPUType*>(out->data<T>()),
+      1,
+      x.numel(),
+      hit,
+      ignore_index);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid_cross_entropy_with_logits");
+  if (normalize) {
+    int* non_zero = RAII_GUARD.alloc_l3_or_gm<int>(1);
+    PADDLE_ENFORCE_NOT_NULL(
+        non_zero, errors::External("XPU alloc_l3_or_gm returns nullptr"));
+    // NOTE(review): `hit` is an int buffer viewed as XPUType here; this
+    // presumably relies on nonzero_count only testing for non-zero values.
+    // Confirm against the XDNN API.
+    r = xpu::nonzero_count(dev_ctx.x_context(),
+                           reinterpret_cast<const XPUType*>(hit),
+                           non_zero,
+                           x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+    // Bring the single-int count back to the host to build the scale factor.
+    int non_zero_cpu = 0;
+    paddle::memory::Copy(CPUPlace(),
+                         static_cast<void*>(&non_zero_cpu),
+                         dev_ctx.GetPlace(),
+                         static_cast<void*>(non_zero),
+                         sizeof(int));
+    // A count of zero would make the scale factor infinite and could turn
+    // the loss into inf/NaN, so floor the denominator.
+    // NOTE(review): the 1e-5 floor is meant to mirror the clamp used by the
+    // CPU reference kernel - confirm.
+    const float norm =
+        non_zero_cpu > 0 ? static_cast<float>(non_zero_cpu) : 1e-5f;
+    r = xpu::scale(dev_ctx.x_context(),
+                   reinterpret_cast<const XPUType*>(out->data<T>()),
+                   reinterpret_cast<XPUType*>(out->data<T>()),
+                   x.numel(),
+                   false,
+                   1.0f / norm,
+                   0.0f);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
+  }
+}
+}  // namespace phi
+// Registers phi::SigmoidCrossEntropyWithLogitsKernel as the XPU
+// implementation of `sigmoid_cross_entropy_with_logits` for all layouts.
+// float is the only element type registered here.
+PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::SigmoidCrossEntropyWithLogitsKernel,
+                   float) {}
--- a/paddle/phi/kernels/xpu/tile_kernel.cc
+++ b/paddle/phi/kernels/xpu/tile_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <type_traits>
+#include <vector>
+#include "paddle/phi/kernels/tile_kernel.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+namespace phi {
+// XPU implementation of the tile op.
+//
+// Steps, in order:
+//   1. Check that the rank of `x` and the length of `repeat_times_arr` both
+//      lie in [1, MAX_RANK_SUPPORTED] and that every repeat count is > 0.
+//   2. Left-pad whichever of (shape of x, repeat counts) is shorter with 1s
+//      so both have the same length.
+//   3. Resize and allocate `out` with extent shape[i] * repeat[i] per axis.
+//   4. If every repeat count is 1, copy `x` into `out`; otherwise expand it
+//      with xpu::broadcast.
+//
+// Raises a Paddle enforce error when a check in step 1 fails or when
+// xpu::broadcast reports a failure.
+template <typename T, typename Context>
+void TileKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const IntArray& repeat_times_arr,
+                DenseTensor* out) {
+  // --- Validate the rank of the input. ---
+  auto x_rank = x.dims().size();
+  PADDLE_ENFORCE_GE(
+      x_rank,
+      1,
+      errors::InvalidArgument(
+          "The rank of the input 'x' for tile op must be a positive "
+          "integer, but the value received is %d.",
+          x_rank));
+  PADDLE_ENFORCE_LE(
+      x_rank,
+      MAX_RANK_SUPPORTED,
+      errors::InvalidArgument(
+          "The rank of the input 'x' for tile op "
+          "must be less than or equal to %d, but the value received is %d.",
+          MAX_RANK_SUPPORTED,
+          x_rank));
+
+  // --- Validate the repeat counts (a mutable copy, padded further down). ---
+  std::vector<int64_t> repeat_times = repeat_times_arr.GetData();
+  int num_repeats = repeat_times.size();
+  PADDLE_ENFORCE_GE(
+      num_repeats,
+      1,
+      errors::InvalidArgument(
+          "The number of elements of the input 'repeat_times' for tile "
+          "op must be positive, but the value received is %d.",
+          num_repeats));
+  PADDLE_ENFORCE_LE(
+      num_repeats,
+      MAX_RANK_SUPPORTED,
+      errors::InvalidArgument(
+          "The number of elements of the input 'repeat_times' for tile op "
+          "must be less than or equal to %d, but the value received is %d.",
+          MAX_RANK_SUPPORTED,
+          num_repeats));
+  for (int64_t times : repeat_times) {
+    PADDLE_ENFORCE_GT(
+        times,
+        0,
+        errors::InvalidArgument(
+            "All elements of the input 'repeat_times' for tile op must "
+            "be positive integers, but the value received is %d.",
+            times));
+  }
+
+  // --- Bring shape and repeat counts to a common length by prepending 1s. ---
+  auto x_shape = phi::vectorize<int>(x.dims());
+  if (repeat_times.size() >= x_shape.size()) {
+    int pad = repeat_times.size() - x_shape.size();
+    x_shape.insert(x_shape.begin(), pad, 1);
+  } else {
+    int pad = x_shape.size() - repeat_times.size();
+    repeat_times.insert(repeat_times.begin(), pad, 1);
+  }
+  PADDLE_ENFORCE_EQ(
+      repeat_times.size(),
+      x_shape.size(),
+      errors::InvalidArgument(
+          "The rank (%d) of the input 'x' and the rank (%d) of the input "
+          "'repeat_times' for tile op must match after promotion.",
+          x_shape.size(),
+          repeat_times.size()));
+
+  // --- Output extent per axis = padded input extent * repeat count. ---
+  DDim out_dims = phi::make_ddim(x_shape);
+  for (size_t axis = 0; axis < repeat_times.size(); ++axis) {
+    out_dims[axis] *= repeat_times[axis];
+  }
+  auto out_shape = phi::vectorize<int>(out_dims);
+  out->Resize(out_dims);
+  dev_ctx.template Alloc<T>(out);
+
+  // --- Nothing to repeat: a plain copy of the input is enough. ---
+  std::vector<int64_t> all_ones(repeat_times.size(), 1);
+  if (repeat_times == all_ones) {
+    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+    return;
+  }
+
+  // --- General case: expand through xpu::broadcast. bool data is handed ---
+  // --- over as int8_t; every other element type is passed through as T. ---
+  int status = XPU_SUCCESS;
+  if (!std::is_same<T, bool>::value) {
+    status = xpu::broadcast<T>(dev_ctx.x_context(),
+                               x.data<T>(),
+                               out->data<T>(),
+                               x_shape,
+                               out_shape);
+  } else {
+    status =
+        xpu::broadcast<int8_t>(dev_ctx.x_context(),
+                               reinterpret_cast<const int8_t*>(x.data<T>()),
+                               reinterpret_cast<int8_t*>(out->data<T>()),
+                               x_shape,
+                               out_shape);
+  }
+  PADDLE_ENFORCE_XDNN_SUCCESS(status, "broadcast");
+}
+}  // namespace phi
+// Registers phi::TileKernel as the XPU implementation of `tile` for all
+// layouts, for the element types bool, float, int and int64_t.
+PD_REGISTER_KERNEL(
+    tile, XPU, ALL_LAYOUT, phi::TileKernel, bool, float, int, int64_t) {}