From f836e7d2af684d4dda4ce4c7ccfdb7a1eb706799 Mon Sep 17 00:00:00 2001
From: lijin23 <41257772+lj970926@users.noreply.github.com>
Date: Tue, 20 Jun 2023 11:10:19 +0800
Subject: [PATCH] [XPU][PHI Kernels] add unique kernel for xpu (#54758)

* add unique kernel for xpu

* add unique kernel for xpu

* update unittest

* add xpu support for unique with axis
---
 paddle/phi/backends/xpu/xpu2_op_list.cc |   4 +
 paddle/phi/kernels/cpu/unique_kernel.cc |   4 -
 paddle/phi/kernels/xpu/unique_kernel.cc | 417 ++++++++++++++++++++++++
 test/xpu/test_unique_op_xpu.py          | 242 ++++++++++++++
 4 files changed, 663 insertions(+), 4 deletions(-)
 create mode 100644 paddle/phi/kernels/xpu/unique_kernel.cc
 create mode 100644 test/xpu/test_unique_op_xpu.py

diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 8b9a9fba639..d64e67b92a7 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -864,6 +864,10 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
     {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"unique",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::INT32,
+                   phi::DataType::INT64})},
     {"unsqueeze2_grad",
      XPUKernelSet({phi::DataType::FLOAT64,
                    phi::DataType::INT64,
diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc
index 3e2d25936bb..1ea8452e1d1 100644
--- a/paddle/phi/kernels/cpu/unique_kernel.cc
+++ b/paddle/phi/kernels/cpu/unique_kernel.cc
@@ -80,10 +80,6 @@ void UniqueRawKernel(const Context& context,
     return;
   }
 
-  if (x.numel() == 0) {
-    context.template Alloc<T>(out);
-    return;
-  }
   if (axis.empty()) {
     phi::VisitDataTypeTiny(
         dtype,
diff --git a/paddle/phi/kernels/xpu/unique_kernel.cc b/paddle/phi/kernels/xpu/unique_kernel.cc
new file mode 100644
index 00000000000..18ad41b14e8
--- /dev/null
+++ b/paddle/phi/kernels/xpu/unique_kernel.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <climits>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "paddle/phi/kernels/unique_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/core/visit_type.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void XPUFlattenUniqueKernelImpl(const Context& dev_ctx,
+                                const DenseTensor& x,
+                                bool return_index,
+                                bool return_inverse,
+                                bool return_counts,
+                                DenseTensor* out,
+                                DenseTensor* indices,
+                                DenseTensor* index,
+                                DenseTensor* counts) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const auto* x_data = x.data<T>();
+  int64_t x_len = x.numel();
+  int r = XPU_SUCCESS;
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int64_t unique_len_cpu = 0;
+  int64_t* unique_len_xpu = RAII_GUARD.alloc_l3_or_gm<int64_t>(1);
+  if (x_len != 0) {
+    // First pass: count the unique elements so the outputs can be sized
+    // before the main computation runs.
+    r = xpu::unique_count<XPUType, IndexT>(
+        dev_ctx.x_context(),
+        reinterpret_cast<const XPUType*>(x_data),
+        unique_len_xpu,
+        x_len,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "unique_count");
+    memory_utils::Copy(phi::CPUPlace(),
+                       &unique_len_cpu,
+                       dev_ctx.GetPlace(),
+                       unique_len_xpu,
+                       sizeof(int64_t));
+  }
+  out->Resize(phi::make_ddim({unique_len_cpu}));
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  IndexT* indices_data = nullptr;
+  if (return_index) {
+    indices->Resize(phi::make_ddim({unique_len_cpu}));
+    indices_data = dev_ctx.template Alloc<IndexT>(indices);
+  }
+
+  IndexT* inverse_data = nullptr;
+  if (return_inverse) {
+    index->Resize(phi::make_ddim({x_len}));
+    inverse_data = dev_ctx.template Alloc<IndexT>(index);
+  }
+
+  IndexT* counts_data = nullptr;
+  if (return_counts) {
+    counts->Resize(phi::make_ddim({unique_len_cpu}));
+    counts_data = dev_ctx.template Alloc<IndexT>(counts);
+  }
+  if (x_len == 0) {
+    return;
+  }
+  r = xpu::unique_compute<XPUType, IndexT>(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUType*>(x_data),
+      reinterpret_cast<XPUType*>(out_data),
+      x_len,
+      unique_len_cpu,
+      indices_data,
+      counts_data,
+      inverse_data,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      false);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "unique_compute");
+}
+
+template <typename Context, typename T, typename IndexT>
+void XPUDimUniqueKernelImpl(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            bool return_index,
+                            bool return_inverse,
+                            bool return_counts,
+                            int axis,
+                            DenseTensor* out,
+                            DenseTensor* indices,
+                            DenseTensor* index,
+                            DenseTensor* counts) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+  int r = xpu::SUCCESS;
+  const auto* x_data = x.data<T>();
+  // Move the target axis to the front so that the slices along `axis`
+  // become contiguous rows of length `slice_size`.
+  auto* x_trans_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
+  std::vector<int64_t> permute(x.dims().size());
+  std::iota(permute.begin(), permute.end(), 0);
+  permute[axis] = 0;
+  permute[0] = axis;
+  if (axis != 0) {
+    auto x_shape = vectorize(x.dims());
+    r = xpu::transpose(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(x_data),
+                       x_trans_data,
+                       x_shape,
+                       permute);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+  } else {
+    r = xpu::copy(dev_ctx.x_context(),
+                  reinterpret_cast<const XPUType*>(x_data),
+                  x_trans_data,
+                  x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
+
+  DDim x_trans_dims = x.dims();
+  x_trans_dims[0] = x.dims()[axis];
+  x_trans_dims[axis] = x.dims()[0];
+  DDim x_trans_flat_dims = phi::flatten_to_2d(x_trans_dims, 1);
+  int64_t axis_len = x_trans_flat_dims[0];
+  int64_t slice_size = x_trans_flat_dims[1];
+  auto x_trans_flat_dims_vec = vectorize(x_trans_flat_dims);
+
+  auto* sorted_axis_idx = RAII_GUARD.alloc_l3_or_gm<int64_t>(axis_len);
+  auto* sort_in_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(axis_len);
+  auto* sort_out_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(axis_len);
+  auto* x_trans_tmp = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
+  auto* ori_idx_xpu = RAII_GUARD.alloc_l3_or_gm<IndexT>(axis_len);
+  auto* ori_idx_xpu_tmp = RAII_GUARD.alloc_l3_or_gm<IndexT>(axis_len);
+  auto* sort_offset = RAII_GUARD.alloc_l3_or_gm<int64_t>(axis_len);
+  r = xpu::range<int64_t>(
+      dev_ctx.x_context(), sort_offset, 0, slice_size, axis_len);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "range");
+
+  r = xpu::range<IndexT>(dev_ctx.x_context(), ori_idx_xpu, 0, 1, axis_len);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "range");
+
+  // Radix sort along the axis: stable-sort the rows by one column at a
+  // time, from the last column to the first, carrying the original row
+  // indices through every pass. This yields lexicographic row order.
+  for (int64_t i = slice_size - 1; i >= 0; --i) {
+    r = xpu::gather(dev_ctx.x_context(),
+                    x_trans_data + i,
+                    sort_offset,
+                    sort_in_tmp,
+                    {x.numel() - i},
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    r = xpu::stable_sort(dev_ctx.x_context(),
+                         sort_in_tmp,
+                         sort_out_tmp,
+                         sorted_axis_idx,
+                         1,
+                         axis_len,
+                         false);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "stable_sort");
+    r = xpu::gather(dev_ctx.x_context(),
+                    x_trans_data,
+                    sorted_axis_idx,
+                    x_trans_tmp,
+                    x_trans_flat_dims_vec,
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    std::swap(x_trans_data, x_trans_tmp);
+
+    r = xpu::gather(dev_ctx.x_context(),
+                    ori_idx_xpu,
+                    sorted_axis_idx,
+                    ori_idx_xpu_tmp,
+                    {axis_len},
+                    axis_len,
+                    0);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+    std::swap(ori_idx_xpu, ori_idx_xpu_tmp);
+  }
+
+  // Adjacent difference: compare every row with its predecessor so runs of
+  // identical rows can be detected below.
+  int64_t compare_num = (axis_len - 1) * slice_size;
+  auto* compare_results = RAII_GUARD.alloc_l3_or_gm<bool>(compare_num);
+  if (compare_num > 0) {
+    r = xpu::broadcast_equal(dev_ctx.x_context(),
+                             x_trans_data + slice_size,
+                             x_trans_data,
+                             compare_results,
+                             {compare_num},
+                             {compare_num});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal");
+  }
+
+  std::vector<IndexT> unique_axis;
+  std::vector<IndexT> indices_cpu;
+  std::vector<IndexT> inverse_cpu(axis_len);
+  std::vector<IndexT> counts_cpu;
+  std::vector<IndexT> ori_idx_cpu(axis_len);
+  memory_utils::Copy(phi::CPUPlace(),
+                     ori_idx_cpu.data(),
+                     dev_ctx.GetPlace(),
+                     ori_idx_xpu,
+                     sizeof(IndexT) * axis_len);
+  unique_axis.push_back(0);
+  indices_cpu.push_back(ori_idx_cpu[0]);
+  inverse_cpu[ori_idx_cpu[0]] = 0;
+  IndexT unique_len = 1;
+  IndexT repeat_cnt = 1;
+  for (IndexT i = 1; i < axis_len; ++i) {
+    int cnt_cpu = 0;
+    int* cnt_xpu = RAII_GUARD.alloc_l3_or_gm<int>(1);
+    r = xpu::nonzero_count(dev_ctx.x_context(),
+                           compare_results + (i - 1) * slice_size,
+                           cnt_xpu,
+                           slice_size);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
+    memory_utils::Copy(
+        phi::CPUPlace(), &cnt_cpu, dev_ctx.GetPlace(), cnt_xpu, sizeof(int));
+    if (cnt_cpu != slice_size) {
+      // Row i differs from row i - 1 in at least one element, so it starts
+      // a new unique slice.
+      unique_axis.push_back(i);
+      indices_cpu.push_back(ori_idx_cpu[i]);
+      counts_cpu.push_back(repeat_cnt);
+      ++unique_len;
+      repeat_cnt = 1;
+    } else {
+      ++repeat_cnt;
+    }
+    inverse_cpu[ori_idx_cpu[i]] = unique_len - 1;
+  }
+  counts_cpu.push_back(repeat_cnt);
+  DDim out_dims = x.dims();
+  out_dims[axis] = unique_len;
+  out->Resize(out_dims);
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+
+  auto* unique_axis_idx_xpu = RAII_GUARD.alloc_l3_or_gm<IndexT>(unique_len);
+  auto* out_trans_data =
+      RAII_GUARD.alloc_l3_or_gm<XPUType>(unique_len * slice_size);
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     unique_axis_idx_xpu,
+                     phi::CPUPlace(),
+                     unique_axis.data(),
+                     unique_len * sizeof(IndexT));
+  r = xpu::gather(dev_ctx.x_context(),
+                  x_trans_data,
+                  unique_axis_idx_xpu,
+                  out_trans_data,
+                  x_trans_flat_dims_vec,
+                  unique_len,
+                  0);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather");
+  DDim out_trans_dims = x_trans_dims;
+  out_trans_dims[0] = unique_len;
+  auto out_trans_dims_vec = vectorize(out_trans_dims);
+  if (axis != 0) {
+    r = xpu::transpose(dev_ctx.x_context(),
+                       out_trans_data,
+                       reinterpret_cast<XPUType*>(out_data),
+                       out_trans_dims_vec,
+                       permute);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose");
+  } else {
+    r = xpu::copy(dev_ctx.x_context(),
+                  out_trans_data,
+                  reinterpret_cast<XPUType*>(out_data),
+                  unique_len * slice_size);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
+  if (return_index) {
+    indices->Resize({unique_len});
+    auto* indices_data = dev_ctx.template Alloc<IndexT>(indices);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       indices_data,
+                       phi::CPUPlace(),
+                       indices_cpu.data(),
+                       sizeof(IndexT) * unique_len);
+  }
+
+  if (return_inverse) {
+    index->Resize({axis_len});
+    auto* reverse_data = dev_ctx.template Alloc<IndexT>(index);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       reverse_data,
+                       phi::CPUPlace(),
+                       inverse_cpu.data(),
+                       sizeof(IndexT) * axis_len);
+  }
+
+  if (return_counts) {
+    counts->Resize({unique_len});
+    auto* counts_data = dev_ctx.template Alloc<IndexT>(counts);
+    memory_utils::Copy(dev_ctx.GetPlace(),
+                       counts_data,
+                       phi::CPUPlace(),
+                       counts_cpu.data(),
+                       sizeof(IndexT) * unique_len);
+  }
+}
+
+template <typename T, typename Context>
+void UniqueKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  bool return_index,
+                  bool return_inverse,
+                  bool return_counts,
+                  const std::vector<int>& axis,
+                  DataType dtype,
+                  DenseTensor* out,
+                  DenseTensor* indices,
+                  DenseTensor* index,
+                  DenseTensor* counts) {
+  bool is_sorted = true;
+  UniqueRawKernel<T, Context>(dev_ctx,
+                              x,
+                              return_index,
+                              return_inverse,
+                              return_counts,
+                              axis,
+                              dtype,
+                              is_sorted,
+                              out,
+                              indices,
+                              index,
+                              counts);
+}
+
+template <typename T, typename Context>
+void UniqueRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     bool return_index,
+                     bool return_inverse,
+                     bool return_counts,
+                     const std::vector<int>& axis,
+                     DataType dtype,
+                     bool is_sorted,
+                     DenseTensor* out,
+                     DenseTensor* indices,
+                     DenseTensor* index,
+                     DenseTensor* counts) {
+  if (dtype == DataType::INT32) {
+    PADDLE_ENFORCE_LE(
+        x.numel(),
+        INT_MAX,
+        phi::errors::InvalidArgument(
+            "The number of elements in Input(X) should be less than or "
+            "equal to INT_MAX, but received num is %d. Please set `dtype` to "
+            "int64.",
+            x.numel()));
+  }
+
+  if (axis.empty()) {
+    PD_VISIT_BASE_INTEGRAL_TYPES(dtype, "XPUFlattenUniqueKernelImpl", [&] {
+      XPUFlattenUniqueKernelImpl<Context, T, data_t>(dev_ctx,
+                                                     x,
+                                                     return_index,
+                                                     return_inverse,
+                                                     return_counts,
+                                                     out,
+                                                     indices,
+                                                     index,
+                                                     counts);
+    });
+  } else {
+    int axis_value = axis[0];
+    axis_value = (axis_value == -1) ? (x.dims().size() - 1) : axis_value;
+    PD_VISIT_BASE_INTEGRAL_TYPES(dtype, "XPUDimUniqueKernelImpl", [&] {
+      XPUDimUniqueKernelImpl<Context, T, data_t>(dev_ctx,
+                                                 x,
+                                                 return_index,
+                                                 return_inverse,
+                                                 return_counts,
+                                                 axis_value,
+                                                 out,
+                                                 indices,
+                                                 index,
+                                                 counts);
+    });
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    unique, XPU, ALL_LAYOUT, phi::UniqueKernel, float, int, int64_t) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED);
+}
+
+PD_REGISTER_KERNEL(
+    unique_raw, XPU, ALL_LAYOUT, phi::UniqueRawKernel, float, int, int64_t) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED);
+}
diff --git a/test/xpu/test_unique_op_xpu.py b/test/xpu/test_unique_op_xpu.py
new file mode 100644
index 00000000000..691c4bea4b0
--- /dev/null
+++ b/test/xpu/test_unique_op_xpu.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
+)
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle.fluid import core
+
+paddle.enable_static()
+
+
+class XPUTestUniqueOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = "unique"
+        self.use_dynamic_create_class = False
+
+    class TestUniqueOp(XPUOpTest):
+        def setUp(self):
+            self.op_type = "unique"
+            self.init_dtype()
+            self.init_config()
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def init_config(self):
+            self.inputs = {
+                'X': np.array([2, 3, 3, 1, 5, 3], dtype=self.dtype),
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                # is_sorted must be True so that paddle.unique is called
+                # rather than fluid.layers.unique
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.array([1, 2, 3, 5], dtype=self.dtype),
+                'Indices': np.array([3, 0, 1, 4], dtype='int32'),
+                'Index': np.array([1, 2, 2, 0, 3, 2]),
+            }
+
+        def test_check_output(self):
+            self.check_output_with_place(paddle.XPUPlace(0))
+
+    class TestOne(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': np.array([2], dtype=self.dtype),
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.array([2], dtype=self.dtype),
+                'Indices': np.array([0], dtype='int32'),
+                'Index': np.array([0], dtype='int32'),
+            }
+
+    class TestRandom(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([150]) * 100.0).astype(self.dtype)
+            }
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'is_sorted': True,
+            }
+            np_unique, np_index, reverse_index, np_counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+            )
+
+            self.outputs = {
+                'Out': np_unique,
+                'Indices': np_index,
+                'Index': reverse_index,
+                'Counts': np_counts,
+            }
+
+    class TestRandom2(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([4, 7, 10]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=None,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': None,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestEmpty(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {'X': np.ones([0, 4], dtype=self.dtype)}
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT64),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': np.ones([0], dtype=self.dtype),
+                'Indices': np.ones([0], dtype=self.dtype),
+                'Index': np.ones([0], dtype=self.dtype),
+                'Counts': np.ones([0], dtype=self.dtype),
+            }
+
+    class TestUniqueOpAxis1(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([3, 8, 8]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=1,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [1],
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestUniqueOpAxis2(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([1, 10]) * 100.0).astype(self.dtype)
+            }
+
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=0,
+            )
+
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [0],
+                'is_sorted': True,
+            }
+
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+    class TestUniqueOpAxisNeg(TestUniqueOp):
+        def init_config(self):
+            self.inputs = {
+                'X': (np.random.random([6, 1, 8]) * 100.0).astype(self.dtype)
+            }
+            unique, indices, inverse, counts = np.unique(
+                self.inputs['X'],
+                return_index=True,
+                return_inverse=True,
+                return_counts=True,
+                axis=-1,
+            )
+            self.attrs = {
+                'dtype': int(core.VarDesc.VarType.INT32),
+                'return_index': True,
+                'return_inverse': True,
+                'return_counts': True,
+                'axis': [-1],
+                'is_sorted': True,
+            }
+            self.outputs = {
+                'Out': unique,
+                'Indices': indices,
+                'Index': inverse,
+                'Counts': counts,
+            }
+
+
+support_types = get_xpu_op_support_types("unique")
+for stype in support_types:
+    create_test_class(globals(), XPUTestUniqueOp, stype)
+
+
+if __name__ == "__main__":
+    unittest.main()
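
For reference, a minimal usage sketch of how the new kernel is reached from the Python API. It assumes a PaddlePaddle build with XPU support and at least one visible XPU device (the device string "xpu:0" is illustrative; swap in "cpu" elsewhere), and it runs in dynamic mode, unlike the static-mode tests above. The expected values follow np.unique semantics and match the first test case.

    import numpy as np
    import paddle

    # Assumption: an XPU build with one visible device.
    paddle.device.set_device("xpu:0")

    x = paddle.to_tensor(np.array([2, 3, 3, 1, 5, 3], dtype="int64"))

    # Flattened path (no axis): dispatches to XPUFlattenUniqueKernelImpl.
    out, index, inverse, counts = paddle.unique(
        x, return_index=True, return_inverse=True, return_counts=True
    )
    # out=[1 2 3 5], index=[3 0 1 4], inverse=[1 2 2 0 3 2], counts=[1 1 3 1]

    # Axis path: XPUDimUniqueKernelImpl deduplicates slices along axis 0.
    m = paddle.to_tensor([[1.0, 2.0], [1.0, 2.0], [3.0, 4.0]])
    rows = paddle.unique(m, axis=0)
    # rows=[[1. 2.] [3. 4.]]

On the design: the axis path sorts rows with a column-by-column stable sort (an LSD radix sort) because XDNN exposes a 1-D stable_sort rather than a row-wise lexicographic sort; stable-sorting from the last column to the first reproduces the lexicographic row order that np.unique(axis=...) returns.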