未验证 提交 819b9589 编写于 作者: T TTerror 提交者: GitHub

add gather_nd/tile op for kunlun (#37029)

上级 655f4e3f
......@@ -35,8 +35,7 @@ ELSE ()
ENDIF()
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211029")
#SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211107")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/gather_nd_op.h"
namespace paddle {
namespace operators {
// XPU (Kunlun) kernel for gather_nd: gathers slices of X addressed by the
// trailing dimension of Index into Out.
template <typename T>
class GatherNdXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *x = ctx.Input<Tensor>("X");
    auto *index = ctx.Input<Tensor>("Index");
    auto *out = ctx.Output<Tensor>("Out");
    // Allocate the output on the XPU place before any early return.
    out->template mutable_data<T>(ctx.GetPlace());
    if (x->numel() == 0) return;
    // Empty index means "copy the whole input tensor".
    // NOTE(review): TensorCopy also adopts x's dims for out here — confirm
    // this matches the inferred output shape for an empty-index Index.
    if (index->numel() == 0) {
      framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out);
      return;
    }
    // Only int32/int64 indices are supported by the device API.
    const auto &index_type = index->type();
    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
                            index_type == framework::proto::VarType::INT64;
    PADDLE_ENFORCE_EQ(index_type_match, true,
                      platform::errors::InvalidArgument(
                          "Index holds the wrong type, it holds [%s],"
                          "but desires to be [%s] or [%s]",
                          paddle::framework::DataTypeToString(index_type),
                          paddle::framework::DataTypeToString(
                              framework::proto::VarType::INT32),
                          paddle::framework::DataTypeToString(
                              framework::proto::VarType::INT64)));
    // Shape metadata passed to the device API as host-side int vectors.
    auto x_shape = paddle::framework::vectorize<int>(x->dims());
    auto index_shape = paddle::framework::vectorize<int>(index->dims());
    xpu::VectorParam<int> x_vec = {x_shape.data(),
                                   static_cast<int>(x_shape.size()), nullptr};
    auto &dev_ctx =
        ctx.template device_context<paddle::platform::XPUDeviceContext>();
    int ret = XPU_SUCCESS;
    // Dispatch on the runtime index dtype; the payload type T is fixed.
    if (index_type == framework::proto::VarType::INT32) {
      ret = xpu::gather_nd<T, int>(dev_ctx.x_context(), x->data<T>(),
                                   index->data<int>(), out->data<T>(), x_vec,
                                   index_shape);
    } else {
      ret = xpu::gather_nd<T, int64_t>(dev_ctx.x_context(), x->data<T>(),
                                       index->data<int64_t>(), out->data<T>(),
                                       x_vec, index_shape);
    }
    // Surface any device-side failure with the decoded XPU error message.
    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                      platform::errors::External(
                          "XPU gather_nd kernel return wrong value[%d %s]", ret,
                          XPUAPIErrorMsg[ret]));
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the XPU gather_nd kernel for the dtypes supported by the device.
REGISTER_OP_XPU_KERNEL(gather_nd, ops::GatherNdXPUKernel<int>,
                       ops::GatherNdXPUKernel<int64_t>,
                       ops::GatherNdXPUKernel<float>);
#endif
......@@ -33,6 +33,7 @@ inline std::vector<int> get_repeat_times(
auto* repeat_data = repeat_tensor->data<int>();
framework::Tensor cpu_repeat_tensor;
if (platform::is_gpu_place(repeat_tensor->place()) ||
platform::is_xpu_place(repeat_tensor->place()) ||
platform::is_npu_place(repeat_tensor->place())) {
TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor);
repeat_data = cpu_repeat_tensor.data<int>();
......@@ -50,6 +51,7 @@ inline std::vector<int> get_repeat_times(
for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
auto tensor = list_repeat_times_tensor[i];
if (platform::is_gpu_place(tensor->place()) ||
platform::is_xpu_place(tensor->place()) ||
platform::is_npu_place(tensor->place())) {
framework::Tensor temp;
TensorCopySync(*tensor, platform::CPUPlace(), &temp);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/tile_op.h"
namespace paddle {
namespace operators {
// XPU (Kunlun) kernel for tile: repeats X along each dimension according to
// repeat_times (attribute, tensor list, or tensor), implemented via
// xpu::broadcast on the promoted shapes.
template <typename T>
class TileXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto rank = context.Input<Tensor>("X")->dims().size();
    PADDLE_ENFORCE_GE(
        rank, 1, platform::errors::InvalidArgument(
                     "The rank of the input 'x' for tile op must be a positive "
                     "integer, but the value received is %d.",
                     rank));
    PADDLE_ENFORCE_LE(
        rank, MAX_RANK_SUPPORTED,
        platform::errors::InvalidArgument(
            "The rank of the input 'x' for tile op "
            "must be less than or equal to %d, but the value received is %d.",
            MAX_RANK_SUPPORTED, rank));
    // Resolves repeat_times from attribute / tensor list / tensor input.
    auto repeat_times = get_repeat_times(context);
    int repeat_times_size = repeat_times.size();
    PADDLE_ENFORCE_GE(
        repeat_times_size, 1,
        platform::errors::InvalidArgument(
            "The number of elements of the input 'repeat_times' for tile "
            "op must be positive, but the value received is %d.",
            repeat_times_size));
    PADDLE_ENFORCE_LE(
        repeat_times_size, MAX_RANK_SUPPORTED,
        platform::errors::InvalidArgument(
            "The number of elements of the input 'repeat_times' for tile op "
            "must be less than or equal to %d, but the value received is %d.",
            MAX_RANK_SUPPORTED, repeat_times_size));
    auto* in0 = context.Input<framework::Tensor>("X");
    auto in_dims = in0->dims();
    // Every repeat factor must be a positive integer.
    for (size_t i = 0; i < repeat_times.size(); ++i) {
      PADDLE_ENFORCE_GT(
          repeat_times[i], 0,
          platform::errors::InvalidArgument(
              "All elements of the input 'repeat_times' for tile op must "
              "be positive integers, but the value received is %d.",
              repeat_times[i]));
    }
    // Promote the shorter of (x dims, repeat_times) by prepending 1s so both
    // have the same rank, mirroring numpy.tile semantics.
    auto vec_in_dims = framework::vectorize<int>(in_dims);
    if (repeat_times.size() < vec_in_dims.size()) {
      int diff = vec_in_dims.size() - repeat_times.size();
      repeat_times.insert(repeat_times.begin(), diff, 1);
    } else {
      int diff = repeat_times.size() - vec_in_dims.size();
      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
    }
    PADDLE_ENFORCE_EQ(
        repeat_times.size(), vec_in_dims.size(),
        platform::errors::InvalidArgument(
            "The rank (%d) of the input 'x' and the rank (%d) of the input "
            "'repeat_times' for tile op must match after promotion.",
            vec_in_dims.size(), repeat_times.size()));
    auto* out0 = context.Output<framework::Tensor>("Out");
    // Output dim i = promoted input dim i * repeat_times[i].
    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
    framework::DDim out_dims(new_in_dims);
    for (size_t i = 0; i < repeat_times.size(); ++i) {
      out_dims[i] *= repeat_times[i];
    }
    auto vec_out_dims = framework::vectorize<int>(out_dims);
    out0->Resize(out_dims);
    out0->mutable_data<T>(context.GetPlace());
    auto& dev_ctx =
        context.template device_context<paddle::platform::XPUDeviceContext>();
    // Fast path: all repeat factors are 1, so tiling is a plain copy.
    std::vector<int> temp(repeat_times.size(), 1);
    if (repeat_times == temp) {
      framework::TensorCopy(*in0, context.GetPlace(), dev_ctx, out0);
      return;
    }
    int ret = XPU_SUCCESS;
    // bool is routed through int8_t because the device broadcast API has no
    // bool instantiation; the two types share size and bit pattern here.
    if (std::is_same<T, bool>::value) {
      ret = xpu::broadcast<int8_t>(
          dev_ctx.x_context(), reinterpret_cast<const int8_t*>(in0->data<T>()),
          reinterpret_cast<int8_t*>(out0->data<T>()), vec_in_dims,
          vec_out_dims);
    } else {
      ret = xpu::broadcast<T>(dev_ctx.x_context(), in0->data<T>(),
                              out0->data<T>(), vec_in_dims, vec_out_dims);
    }
    // Surface any device-side failure with the decoded XPU error message.
    PADDLE_ENFORCE_EQ(
        ret, XPU_SUCCESS,
        platform::errors::External("XPU tile kernel return wrong value[%d %s]",
                                   ret, XPUAPIErrorMsg[ret]));
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the XPU tile kernel for the dtypes supported by the device.
REGISTER_OP_XPU_KERNEL(tile, ops::TileXPUKernel<bool>, ops::TileXPUKernel<int>,
                       ops::TileXPUKernel<int64_t>, ops::TileXPUKernel<float>);
#endif
......@@ -252,8 +252,16 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::COMPLEX128, XPUPlace())})},
{"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})}
{"softmax_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::BOOL, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})}
// AddMore
};
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest
from op_test_xpu import XPUOpTest
import paddle.fluid as fluid
import paddle
def gather_nd_grad(x, index):
    """Reference gradient of gather_nd w.r.t. x.

    Assumes the upstream gradient is uniform (1 / numel of the gathered
    output) and scatter-adds it back to the positions selected by index.
    """
    grad_shape = index.shape[:-1] + x.shape[index.shape[-1]:]
    total = 1
    for extent in grad_shape:
        total *= extent
    upstream = np.full(grad_shape, 1. / total)
    grad_x = np.full_like(x, 0)
    # Flatten index to (num_points, index_depth) and turn its columns into a
    # fancy-index tuple so np.add.at accumulates duplicates correctly.
    scatter_idx = tuple(index.reshape(-1, index.shape[-1]).T)
    np.add.at(grad_x, scatter_idx, upstream)
    return grad_x
def test_class1(op_type, typename):
    """Register a gather_nd test case (empty index -> copy whole tensor) for
    the given dtype under a unique class name."""
    class TestGatherNdOpWithEmptyIndex(XPUOpTest):
        # Index has empty element, which means copy entire tensor
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            xnp = np.random.random((5, 20)).astype(typename)
            self.inputs = {
                'X': xnp,
                'Index': np.array([[], []]).astype("int32")
            }
            # An empty index of leading shape (2,) duplicates X twice.
            self.outputs = {
                'Out': np.vstack((xnp[np.newaxis, :], xnp[np.newaxis, :]))
            }
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_1".format(op_type, typename)
    TestGatherNdOpWithEmptyIndex.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithEmptyIndex
def test_class2(op_type, typename):
    """Register a gather_nd test case (1-D single-row index) for the given
    dtype under a unique class name."""
    # NOTE(review): inherits OpTest while test_class1 uses XPUOpTest —
    # confirm whether XPUOpTest was intended here as well.
    class TestGatherNdOpWithIndex1(OpTest):
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            xnp = np.random.random((5, 20)).astype(typename)
            self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")}
            self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_2".format(op_type, typename)
    TestGatherNdOpWithIndex1.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithIndex1
def test_class3(op_type, typename):
    """Register a gather_nd test case (low-rank index, high-rank X) for the
    given dtype under a unique class name."""
    class TestGatherNdOpWithLowIndex(OpTest):
        # Index has low rank, X has high rank
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
            index = np.array([[1], [2]]).astype("int64")
            self.inputs = {'X': xnp, 'Index': index}
            self.outputs = {'Out': xnp[tuple(index.T)]}
            # Computed but unused while test_check_grad is a no-op; kept for
            # when gradient checking is enabled.
            self.x_grad = gather_nd_grad(xnp, index)
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_3".format(op_type, typename)
    TestGatherNdOpWithLowIndex.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithLowIndex
def test_class4(op_type, typename):
    """Register a gather_nd test case (1-D index selecting one element) for
    the given dtype under a unique class name."""
    class TestGatherNdOpIndex1(OpTest):
        # Index has low rank, X has high rank
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
            index = np.array([1, 2]).astype("int64")
            self.inputs = {'X': xnp, 'Index': index}
            self.outputs = {'Out': xnp[tuple(index.T)]}
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_4".format(op_type, typename)
    TestGatherNdOpIndex1.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpIndex1
def test_class5(op_type, typename):
    """Register a gather_nd test case (index rank equals X rank, gathering
    scalars) for the given dtype under a unique class name."""
    class TestGatherNdOpWithSameIndexAsX(OpTest):
        # Index has same rank as X's rank
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
            index = np.array([[1, 1], [2, 1]]).astype("int64")
            self.inputs = {'X': xnp, 'Index': index}
            self.outputs = {'Out': xnp[tuple(index.T)]}  #[25, 22]
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_5".format(op_type, typename)
    TestGatherNdOpWithSameIndexAsX.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithSameIndexAsX
def test_class6(op_type, typename):
    """Register a gather_nd test case (high-rank X with full-depth random
    index) for the given dtype under a unique class name."""
    class TestGatherNdOpWithHighRankSame(OpTest):
        # Both Index and X have high rank, and Rank(Index) = Rank(X)
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            shape = (5, 2, 3, 1, 10)
            xnp = np.random.rand(*shape).astype(typename)
            # One random in-range coordinate per axis, transposed to
            # (num_points, rank).
            index = np.vstack([np.random.randint(
                0, s, size=2) for s in shape]).T
            self.inputs = {'X': xnp, 'Index': index.astype("int32")}
            self.outputs = {'Out': xnp[tuple(index.T)]}
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_6".format(op_type, typename)
    TestGatherNdOpWithHighRankSame.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithHighRankSame
def test_class7(op_type, typename):
    """Register a gather_nd test case (index rank differs from X rank) for
    the given dtype under a unique class name."""
    class TestGatherNdOpWithHighRankDiff(OpTest):
        # Both Index and X have high rank, Rank(Index) < Rank(X)
        def setUp(self):
            self.set_xpu()
            self.place = paddle.XPUPlace(0)
            self.op_type = "gather_nd"
            shape = (2, 3, 4, 1, 10)
            xnp = np.random.rand(*shape).astype(typename)
            # 200 random full-depth coordinates, reshaped so the index tensor
            # itself is high-rank: (20, 5, 2, 5) with index depth 5.
            index = np.vstack(
                [np.random.randint(
                    0, s, size=200) for s in shape]).T
            index_re = index.reshape([20, 5, 2, 5])
            self.inputs = {'X': xnp, 'Index': index_re.astype("int32")}
            self.outputs = {'Out': xnp[tuple(index.T)].reshape([20, 5, 2])}
        def set_xpu(self):
            self.__class__.use_xpu = True
        def test_check_output(self):
            self.check_output_with_place(self.place)
        def test_check_grad(self):
            # Gradient checking is not enabled for the XPU kernel yet.
            pass
    cls_name = "{0}_{1}_7".format(op_type, typename)
    TestGatherNdOpWithHighRankDiff.__name__ = cls_name
    # Expose under a unique module-level name so unittest discovers it.
    globals()[cls_name] = TestGatherNdOpWithHighRankDiff
class TestGatherNdAPI(unittest.TestCase):
    # Dygraph API smoke test.
    # NOTE(review): this calls paddle.fluid.layers.gather, not gather_nd —
    # confirm whether gather_nd was intended (both return [3, 4] here).
    def test_imperative(self):
        paddle.disable_static()
        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
        index_1 = np.array([[1]])
        input = fluid.dygraph.to_variable(input_1)
        index = fluid.dygraph.to_variable(index_1)
        output = paddle.fluid.layers.gather(input, index)
        output_np = output.numpy()
        expected_output = np.array([3, 4])
        self.assertTrue(np.allclose(output_np, expected_output))
        paddle.enable_static()
# Instantiate every gather_nd test case for each supported XPU dtype.
for _typename in {'float32', 'int', 'int64'}:
    for _register_cases in (test_class1, test_class2, test_class3, test_class4,
                            test_class5, test_class6, test_class7):
        _register_cases('gather_nd', _typename)
if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest
from op_test_xpu import XPUOpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle.fluid import core
# Run these op tests in static-graph mode with a fixed seed so random
# inputs are reproducible.
paddle.enable_static()
np.random.seed(10)
# Situation 1: repeat_times is a plain list attribute (no tensor inputs).
class TestTileOpRank1(XPUOpTest):
    # Base class: subclasses override init_data to vary shapes/repeats.
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.init_data()
        self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")}
        self.attrs = {'repeat_times': self.repeat_times}
        # np.tile is the reference implementation for the expected output.
        output = np.tile(self.inputs['X'], self.repeat_times)
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def init_data(self):
        self.ori_shape = [100]
        self.repeat_times = [2]
    def test_check_output(self):
        self.check_output_with_place(self.place)
    def test_check_grad(self):
        # Gradient checking is not enabled for the XPU kernel yet.
        pass
# Dimension expanding: repeat_times has higher rank than x.
class TestTileOpRank2Expanding(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = [120]
        self.repeat_times = [2, 2]
# Rank-2 input with matching-rank repeat_times.
class TestTileOpRank2(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = [12, 14]
        self.repeat_times = [2, 3]
# Corner case: all repeat factors are 1 (tile degenerates to a copy).
class TestTileOpRank3_Corner(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = (2, 10, 5)
        self.repeat_times = (1, 1, 1)
# Corner case: repeat_times has lower rank than x.
class TestTileOpRank3_Corner2(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = (2, 10, 5)
        self.repeat_times = (2, 2)
# Rank-3 input with mixed repeat factors.
class TestTileOpRank3(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = (2, 4, 15)
        self.repeat_times = (2, 1, 4)
# Rank-4 input.
class TestTileOpRank4(TestTileOpRank1):
    def init_data(self):
        self.ori_shape = (2, 4, 5, 7)
        self.repeat_times = (3, 2, 1, 2)
# Situation 2: repeat_times is a list of 1-element int32 tensors
# (repeat_times_tensor input); -1 entries in the attr must be inferred.
class TestTileOpRank1_tensor_attr(XPUOpTest):
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.init_data()
        # One named 1-element tensor per repeat factor.
        repeat_times_tensor = []
        for index, ele in enumerate(self.repeat_times):
            repeat_times_tensor.append(("x" + str(index), np.ones(
                (1)).astype('int32') * ele))
        self.inputs = {
            'X': np.random.random(self.ori_shape).astype("float32"),
            'repeat_times_tensor': repeat_times_tensor,
        }
        # Attr carries -1 placeholders; real values come from the tensors.
        self.attrs = {"repeat_times": self.infer_repeat_times}
        output = np.tile(self.inputs['X'], self.repeat_times)
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def init_data(self):
        self.ori_shape = [100]
        self.repeat_times = [2]
        self.infer_repeat_times = [-1]
    def test_check_output(self):
        self.check_output_with_place(self.place)
    def test_check_grad(self):
        # Gradient checking is not enabled for the XPU kernel yet.
        pass
# All-ones repeat with one inferred (-1) entry.
class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
    def init_data(self):
        self.ori_shape = [12, 14]
        self.repeat_times = [1, 1]
        self.infer_repeat_times = [1, -1]
# Mixed: first factor inferred from tensor, second fixed in the attr.
class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
    def init_data(self):
        self.ori_shape = [12, 14]
        self.repeat_times = [2, 3]
        self.infer_repeat_times = [-1, 3]
# Situation 3: repeat_times is supplied as a single RepeatTimes tensor.
class TestTileOpRank1_tensor(XPUOpTest):
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.init_data()
        self.inputs = {
            'X': np.random.random(self.ori_shape).astype("float32"),
            'RepeatTimes': np.array(self.repeat_times).astype("int32"),
        }
        self.attrs = {}
        output = np.tile(self.inputs['X'], self.repeat_times)
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def init_data(self):
        self.ori_shape = [100]
        self.repeat_times = [2]
    def test_check_output(self):
        self.check_output_with_place(self.place)
    def test_check_grad(self):
        # Gradient checking is not enabled for the XPU kernel yet.
        pass
# Rank-2 variant of the RepeatTimes-tensor case.
class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
    def init_data(self):
        self.ori_shape = [12, 14]
        self.repeat_times = [2, 3]
# Situation 4: input x is int32.
class TestTileOpInteger(XPUOpTest):
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.inputs = {
            'X': np.random.randint(
                10, size=(4, 4, 5)).astype("int32")
        }
        self.attrs = {'repeat_times': [2, 1, 4]}
        output = np.tile(self.inputs['X'], (2, 1, 4))
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def test_check_output(self):
        self.check_output_with_place(self.place)
# Situation 5: input x is int64.
class TestTileOpInt64_t(XPUOpTest):
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.inputs = {
            'X': np.random.randint(
                10, size=(2, 4, 5)).astype("int64")
        }
        self.attrs = {'repeat_times': [2, 1, 4]}
        output = np.tile(self.inputs['X'], (2, 1, 4))
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def test_check_output(self):
        self.check_output_with_place(self.place)
# Situation 6: input x is bool (exercises the kernel's int8 fallback path).
class TestTileOpBool(XPUOpTest):
    def setUp(self):
        self.set_xpu()
        self.place = paddle.XPUPlace(0)
        self.op_type = "tile"
        self.inputs = {
            'X': np.random.randint(
                10, size=(2, 4, 5)).astype("bool")
        }
        self.attrs = {'repeat_times': [2, 1, 4]}
        output = np.tile(self.inputs['X'], (2, 1, 4))
        self.outputs = {'Out': output}
    def set_xpu(self):
        self.__class__.use_xpu = True
    def test_check_output(self):
        self.check_output_with_place(self.place)
# Test python API: paddle.tile in dygraph mode with list, mixed
# list-of-tensor, and tensor repeat_times all agreeing with np.tile.
class TestTileAPI(unittest.TestCase):
    def test_api(self):
        with fluid.dygraph.guard(paddle.XPUPlace(0)):
            np_x = np.random.random([12, 14]).astype("float32")
            x = paddle.to_tensor(np_x)
            positive_2 = np.array([2]).astype("int32")
            positive_2 = paddle.to_tensor(positive_2)
            repeat_times = np.array([2, 3]).astype("int32")
            repeat_times = paddle.to_tensor(repeat_times)
            out_1 = paddle.tile(x, repeat_times=[2, 3])
            out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
            out_3 = paddle.tile(x, repeat_times=repeat_times)
            assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3)))
            assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3)))
            assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3)))
# Allow running this test file directly.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册