From f836e7d2af684d4dda4ce4c7ccfdb7a1eb706799 Mon Sep 17 00:00:00 2001
From: lijin23 <41257772+lj970926@users.noreply.github.com>
Date: Tue, 20 Jun 2023 11:10:19 +0800
Subject: [PATCH] [XPU][PHI Kernels] add unique kernel for xpu (#54758)

* add unique kernel for xpu

* add unique kernel for xpu

* update unittest

* add xpu support for unique with axis
---
 paddle/phi/backends/xpu/xpu2_op_list.cc |   4 +
 paddle/phi/kernels/cpu/unique_kernel.cc |   4 -
 paddle/phi/kernels/xpu/unique_kernel.cc | 417 ++++++++++++++++++++++++
 test/xpu/test_unique_op_xpu.py          | 242 ++++++++++++++
 4 files changed, 663 insertions(+), 4 deletions(-)
 create mode 100644 paddle/phi/kernels/xpu/unique_kernel.cc
 create mode 100644 test/xpu/test_unique_op_xpu.py

diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 8b9a9fba639..d64e67b92a7 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -864,6 +864,10 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
     {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"unique",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::INT32,
+                   phi::DataType::INT64})},
     {"unsqueeze2_grad",
      XPUKernelSet({phi::DataType::FLOAT64,
                    phi::DataType::INT64,
diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc
index 3e2d25936bb..1ea8452e1d1 100644
--- a/paddle/phi/kernels/cpu/unique_kernel.cc
+++ b/paddle/phi/kernels/cpu/unique_kernel.cc
@@ -80,10 +80,6 @@ void UniqueRawKernel(const Context& context,
     return;
   }
 
-  if (x.numel() == 0) {
-    context.template Alloc<T>(out);
-    return;
-  }
   if (axis.empty()) {
     phi::VisitDataTypeTiny(
         dtype,
diff --git a/paddle/phi/kernels/xpu/unique_kernel.cc b/paddle/phi/kernels/xpu/unique_kernel.cc
new file mode 100644
index 00000000000..18ad41b14e8
--- /dev/null
+++ b/paddle/phi/kernels/xpu/unique_kernel.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <climits>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "paddle/phi/kernels/unique_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/core/visit_type.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void XPUFlattenUniqueKernelImpl(const Context& dev_ctx,
+                                const DenseTensor& x,
+                                bool return_index,
+                                bool return_inverse,
+                                bool return_counts,
+                                DenseTensor* out,
+                                DenseTensor* indices,
+                                DenseTensor* index,
+                                DenseTensor* counts) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const auto* x_data = x.data<T>();
+  int64_t x_len = x.numel();
+  int r = XPU_SUCCESS;
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int64_t unique_len_cpu = 0;
+  int64_t* unique_len_xpu = RAII_GUARD.alloc_l3_or_gm<int64_t>(1);
+  if (x_len != 0) {
+    // First pass: count the unique elements so the outputs can be sized
+    // before the main computation runs.
+    r = xpu::unique_count<XPUType, IndexT>(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(x_data),
+        unique_len_xpu,
+        x_len,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "unique_count");
+    memory_utils::Copy(phi::CPUPlace(),
+                       &unique_len_cpu,
+                       dev_ctx.GetPlace(),
+                       unique_len_xpu,
+                       sizeof(int64_t));
+  }
+  out->Resize(phi::make_ddim({unique_len_cpu}));
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  IndexT* indices_data = nullptr;
+  if (return_index) {
+    indices->Resize(phi::make_ddim({unique_len_cpu}));
+    indices_data = dev_ctx.template Alloc<IndexT>(indices);
+  }
+
+  IndexT* inverse_data = nullptr;
+  if (return_inverse) {
+    index->Resize(phi::make_ddim({x_len}));
+    inverse_data = dev_ctx.template Alloc<IndexT>(index);
+  }
+
+  IndexT* counts_data = nullptr;
+  if (return_counts) {
+    counts->Resize(phi::make_ddim({unique_len_cpu}));
+    counts_data = dev_ctx.template Alloc<IndexT>(counts);
+  }
+  if (x_len == 0) {
+    return;
+  }
+  r = xpu::unique_compute<XPUType, IndexT>(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(x_data),
+      reinterpret_cast<XPUType*>(out_data),
+      x_len,
+      unique_len_cpu,
+      indices_data,
+      counts_data,
+      inverse_data,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      false);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "unique_compute");
+}
+
+template <typename Context, typename T, typename IndexT>
+void XPUDimUniqueKernelImpl(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            bool return_index,
+                            bool return_inverse,
+                            bool return_counts,
+                            int axis,
+                            DenseTensor* out,
+                            DenseTensor* indices,
+                            DenseTensor* index,
+                            DenseTensor* counts) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int r = xpu::SUCCESS;
+  const auto* x_data = x.data<T>();
+  // Move the target axis to the front so that the slices along `axis`
+  // become contiguous rows of length `slice_size`.
+  auto* x_trans_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
+  std::vector<int64_t> permute(x.dims().size());
+  std::iota(permute.begin(), permute.end(), 0);
+  permute[axis] = 0;
+  permute[0] = axis;
+  if (axis != 0) {
+    auto x_shape = vectorize(x.dims());
+    r = xpu::transpose(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(x_data),
+                       x_trans_data,
+                       x_shape,
+                       permute);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+  } else {
+    r = xpu::copy(dev_ctx.x_context(),
+                  reinterpret_cast<const XPUType*>(x_data),
+                  x_trans_data,
+                  x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
+
+  DDim x_trans_dims = x.dims();
+  x_trans_dims[0] = x.dims()[axis];
+  x_trans_dims[axis] = x.dims()[0];
+  DDim x_trans_flat_dims = phi::flatten_to_2d(x_trans_dims, 1);
+  int64_t axis_len = x_trans_flat_dims[0];
+  int64_t slice_size = x_trans_flat_dims[1];
+  auto x_trans_flat_dims_vec = vectorize(x_trans_flat_dims);
+
+  auto* sorted_axis_idx = RAII_GUARD.alloc_l3_or_gm<int64_t>(axis_len);
+  auto* sort_in_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(axis_len);
+  auto* sort_out_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(axis_len);
+  auto* x_trans_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
+  auto* ori_idx_xpu = RAII_GUARD.alloc_l3_or_gm<IndexT>(axis_len);
+  auto* ori_idx_xpu_tmp = RAII_GUARD.alloc_l3_or_gm<IndexT>(axis_len);
+  auto* sort_offset = RAII_GUARD.alloc_l3_or_gm<int64_t>(axis_len);
+  r = xpu::range<int64_t>(
+      dev_ctx.x_context(), sort_offset, 0, slice_size, axis_len);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "range");
+
+  r = xpu::range<IndexT>(dev_ctx.x_context(), ori_idx_xpu, 0, 1, axis_len);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "range");
+
+  // Radix sort along the axis: stable-sort the rows by one column at a
+  // time, from the last column to the first, carrying the original row
+  // indices through every pass. This yields lexicographic row order.
+  for (int64_t i = slice_size - 1; i >= 0; --i) {
+    r = xpu::gather(dev_ctx.x_context(),
+                    x_trans_data + i,
+                    sort_offset,
+                    sort_in_tmp,
+                    {x.numel() - i},
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    r = xpu::stable_sort(dev_ctx.x_context(),
+                         sort_in_tmp,
+                         sort_out_tmp,
+                         sorted_axis_idx,
+                         1,
+                         axis_len,
+                         false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "stable_sort");
+    r = xpu::gather(dev_ctx.x_context(),
+                    x_trans_data,
+                    sorted_axis_idx,
+                    x_trans_tmp,
+                    x_trans_flat_dims_vec,
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    std::swap(x_trans_data, x_trans_tmp);
+
+    r = xpu::gather(dev_ctx.x_context(),
+                    ori_idx_xpu,
+                    sorted_axis_idx,
+                    ori_idx_xpu_tmp,
+                    {axis_len},
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    std::swap(ori_idx_xpu, ori_idx_xpu_tmp);
+  }
+
+  // Adjacent difference: compare every row with its predecessor so runs of
+  // identical rows can be detected below.
+  int64_t compare_num = (axis_len - 1) * slice_size;
+  auto* compare_results = RAII_GUARD.alloc_l3_or_gm<bool>(compare_num);
+  if (compare_num > 0) {
+    r = xpu::broadcast_equal(dev_ctx.x_context(),
+                             x_trans_data + slice_size,
+                             x_trans_data,
+                             compare_results,
+                             {compare_num},
+                             {compare_num});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal");
+  }
+
+  std::vector<IndexT> unique_axis;
+  std::vector<IndexT> indices_cpu;
+  std::vector<IndexT> inverse_cpu(axis_len);
+  std::vector<IndexT> counts_cpu;
+  std::vector<IndexT> ori_idx_cpu(axis_len);
+  memory_utils::Copy(phi::CPUPlace(),
+                     ori_idx_cpu.data(),
+                     dev_ctx.GetPlace(),
+                     ori_idx_xpu,
+                     sizeof(IndexT) * axis_len);
+  unique_axis.push_back(0);
+  indices_cpu.push_back(ori_idx_cpu[0]);
+  inverse_cpu[ori_idx_cpu[0]] = 0;
+  IndexT unique_len = 1;
+  IndexT repeat_cnt = 1;
+  for (IndexT i = 1; i < axis_len; ++i) {
+    int cnt_cpu = 0;
+    int* cnt_xpu = RAII_GUARD.alloc_l3_or_gm<int>(1);
+    r = xpu::nonzero_count(dev_ctx.x_context(),
+                           compare_results + (i - 1) * slice_size,
+                           cnt_xpu,
+                           slice_size);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+    memory_utils::Copy(
+        phi::CPUPlace(), &cnt_cpu, dev_ctx.GetPlace(), cnt_xpu, sizeof(int));
+    if (cnt_cpu != slice_size) {
+      // Row i differs from row i - 1 in at least one element, so it starts
+      // a new unique slice.
+      unique_axis.push_back(i);
+      indices_cpu.push_back(ori_idx_cpu[i]);
+      counts_cpu.push_back(repeat_cnt);
+      ++unique_len;
+      repeat_cnt = 1;
+    } else {
+      ++repeat_cnt;
+    }
+    inverse_cpu[ori_idx_cpu[i]] = unique_len - 1;
+  }
+  counts_cpu.push_back(repeat_cnt);
+  DDim out_dims = x.dims();
+  out_dims[axis] = unique_len;
+  out->Resize(out_dims);
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+
+  auto* unique_axis_idx_xpu = RAII_GUARD.alloc_l3_or_gm<IndexT>(unique_len);
+  auto* out_trans_data =
+      RAII_GUARD.alloc_l3_or_gm<XPUType>(unique_len * slice_size);
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     unique_axis_idx_xpu,
+                     phi::CPUPlace(),
+                     unique_axis.data(),
+                     unique_len * sizeof(IndexT));
+  r = xpu::gather(dev_ctx.x_context(),
+                  x_trans_data,
+                  unique_axis_idx_xpu,
+                  out_trans_data,
+                  x_trans_flat_dims_vec,
+                  unique_len,
+                  0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+  DDim out_trans_dims = x_trans_dims;
+  out_trans_dims[0] = unique_len;
+  auto out_trans_dims_vec = vectorize(out_trans_dims);
+  if (axis != 0) {
+    r = xpu::transpose(dev_ctx.x_context(),
+                       out_trans_data,
+                       reinterpret_cast<XPUType*>(out_data),
+                       out_trans_dims_vec,
+                       permute);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+  } else {
+    r = xpu::copy(dev_ctx.x_context(),
+                  out_trans_data,
+                  reinterpret_cast<XPUType*>(out_data),
+                  unique_len * slice_size);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
+  if (return_index) {
+    indices->Resize({unique_len});
+    auto* indices_data = dev_ctx.template Alloc<IndexT>(indices);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       indices_data,
+                       phi::CPUPlace(),
+                       indices_cpu.data(),
+                       sizeof(IndexT) * unique_len);
+  }
+
+  if (return_inverse) {
+    index->Resize({axis_len});
+    auto* reverse_data = dev_ctx.template Alloc<IndexT>(index);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       reverse_data,
+                       phi::CPUPlace(),
+                       inverse_cpu.data(),
+                       sizeof(IndexT) * axis_len);
+  }
+
+  if (return_counts) {
+    counts->Resize({unique_len});
+    auto* counts_data = dev_ctx.template Alloc<IndexT>(counts);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       counts_data,
+                       phi::CPUPlace(),
+                       counts_cpu.data(),
+                       sizeof(IndexT) * unique_len);
+  }
+}
+
+template <typename T, typename Context>
+void UniqueKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  bool return_index,
+                  bool return_inverse,
+                  bool return_counts,
+                  const std::vector<int>& axis,
+                  DataType dtype,
+                  DenseTensor* out,
+                  DenseTensor* indices,
+                  DenseTensor* index,
+                  DenseTensor* counts) {
+  bool is_sorted = true;
+  UniqueRawKernel<T, Context>(dev_ctx,
+                              x,
+                              return_index,
+                              return_inverse,
+                              return_counts,
+                              axis,
+                              dtype,
+                              is_sorted,
+                              out,
+                              indices,
+                              index,
+                              counts);
+}
+
+template <typename T, typename Context>
+void UniqueRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     bool return_index,
+                     bool return_inverse,
+                     bool return_counts,
+                     const std::vector<int>& axis,
+                     DataType dtype,
+                     bool is_sorted,
+                     DenseTensor* out,
+                     DenseTensor* indices,
+                     DenseTensor* index,
+                     DenseTensor* counts) {
+  if (dtype == DataType::INT32) {
+    PADDLE_ENFORCE_LE(
+        x.numel(),
+        INT_MAX,
+        phi::errors::InvalidArgument(
+            "The number of elements in Input(X) should be less than or "
+            "equal to INT_MAX, but received num is %d. Please set `dtype` to "
+            "int64.",
+            x.numel()));
+  }
+
+  if (axis.empty()) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(dtype, "XPUFlattenUniqueKernelImpl", [&] {
+      XPUFlattenUniqueKernelImpl<Context, T, data_t>(dev_ctx,
+                                                     x,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts,
+                                                     out,
+                                                     indices,
+                                                     index,
+                                                     counts);
+    });
+  } else {
+    int axis_value = axis[0];
+    axis_value = (axis_value == -1) ? (x.dims().size() - 1) : axis_value;
+    PD_VISIT_BASE_INTEGRAL_TYPES(dtype, "XPUDimUniqueKernelImpl", [&] {
+      XPUDimUniqueKernelImpl<Context, T, data_t>(dev_ctx,
+                                                 x,
+                                                 return_index,
+                                                 return_inverse,
+                                                 return_counts,
+                                                 axis_value,
+                                                 out,
+                                                 indices,
+                                                 index,
+                                                 counts);
+    });
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    unique, XPU, ALL_LAYOUT, phi::UniqueKernel, float, int, int64_t) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED);
+}
+
+PD_REGISTER_KERNEL(
+    unique_raw, XPU, ALL_LAYOUT, phi::UniqueRawKernel, float, int, int64_t) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED);
+}
diff --git a/test/xpu/test_unique_op_xpu.py b/test/xpu/test_unique_op_xpu.py
new file mode 100644
index 00000000000..691c4bea4b0
--- /dev/null
+++ b/test/xpu/test_unique_op_xpu.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
+)
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle.fluid import core
+
+paddle.enable_static()
+
+
+class XPUTestUniqueOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = "unique"
+        self.use_dynamic_create_class = False
+
+    class TestUniqueOp(XPUOpTest):
+        def setUp(self):
+            self.op_type = "unique"
+            self.init_dtype()
+            self.init_config()
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def init_config(self):
+            self.inputs = {
+                'X': np.array([2, 3, 3, 1, 5, 3], dtype=self.dtype),
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                # is_sorted must be True so that paddle.unique is called
+                # rather than fluid.layers.unique
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.array([1, 2, 3, 5], dtype=self.dtype),
+                'Indices': np.array([3, 0, 1, 4], dtype='int32'),
+                'Index': np.array([1, 2, 2, 0, 3, 2]),
+            }
+
+        def test_check_output(self):
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    class TestOne(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': np.array([2], dtype=self.dtype),
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.array([2], dtype=self.dtype),
+                'Indices': np.array([0], dtype='int32'),
+                'Index': np.array([0], dtype='int32'),
+            }
+
+    class TestRandom(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([150]) * 100.0).astype(self.dtype)
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'is_sorted': True,
+            }
+            np_unique, np_index, reverse_index, np_counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+            )
+
+            self.outputs = {
+                'Out': np_unique,
+                'Indices': np_index,
+                'Index': reverse_index,
+                'Counts': np_counts,
+            }
+
+    class TestRandom2(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([4, 7, 10]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=None,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': None,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestEmpty(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {'X': np.ones([0, 4], dtype=self.dtype)}
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.ones([0], dtype=self.dtype),
+                'Indices': np.ones([0], dtype=self.dtype),
+                'Index': np.ones([0], dtype=self.dtype),
+                'Counts': np.ones([0], dtype=self.dtype),
+            }
+
+    class TestUniqueOpAxis1(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([3, 8, 8]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=1,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [1],
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestUniqueOpAxis2(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([1, 10]) * 100.0).astype(self.dtype)
+            }
+
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=0,
+            )
+
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [0],
+                'is_sorted': True,
+            }
+
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestUniqueOpAxisNeg(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([6, 1, 8]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=-1,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [-1],
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+
+support_types = get_xpu_op_support_types("unique")
+for stype in support_types:
+    create_test_class(globals(), XPUTestUniqueOp, stype)
+
+
+if __name__ == "__main__":
+    unittest.main()
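
For reference, a minimal usage sketch of how the new kernel is reached from the Python API. It assumes a PaddlePaddle build with XPU support and at least one visible XPU device (the device string "xpu:0" is illustrative; swap in "cpu" elsewhere), and it runs in dynamic mode, unlike the static-mode tests above. The expected values follow np.unique semantics and match the first test case.

    import numpy as np
    import paddle

    # Assumption: an XPU build with one visible device.
    paddle.device.set_device("xpu:0")

    x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype="int64"))

    # Flattened path (no axis): dispatches to XPUFlattenUniqueKernelImpl.
    out, index, inverse, counts = paddle.unique(
        x, return_index=True, return_inverse=True, return_counts=True
    )
    # out=[1 2 3 5], index=[3 0 1 4], inverse=[1 2 2 0 3 2], counts=[1 1 3 1]

    # Axis path: XPUDimUniqueKernelImpl deduplicates slices along axis 0.
    m = paddle.to_tensor([[1.0, 2.0], [1.0, 2.0], [3.0, 4.0]])
    rows = paddle.unique(m, axis=0)
    # rows=[[1. 2.] [3. 4.]]

On the design: the axis path sorts rows with a column-by-column stable sort (an LSD radix sort) because XDNN exposes a 1-D stable_sort rather than a row-wise lexicographic sort; stable-sorting from the last column to the first reproduces the lexicographic row order that np.unique(axis=...) returns.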