// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/multiclass_nms3_kernel.h"

#include <vector>

#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T, typename Context>
void MultiClassNMSKernel(const Context& ctx,
                         const DenseTensor& bboxes,
                         const DenseTensor& scores,
                         const paddle::optional<DenseTensor>& rois_num,
                         float score_threshold,
                         int nums_top_k,
                         int keep_top_k,
                         float nms_threshold,
                         bool normalized,
                         float nms_eta,
                         int background_label,
                         DenseTensor* out,
                         DenseTensor* index,
                         DenseTensor* nms_rois_num) {
  using XPUT = typename XPUTypeTrait<T>::Type;
  const XPUT* bboxes_data = reinterpret_cast<const XPUT*>(bboxes.data<T>());
  const XPUT* scores_data = reinterpret_cast<const XPUT*>(scores.data<T>());

  bool return_index = index != nullptr;
  bool has_rois_num = rois_num.get_ptr() != nullptr;
  bool return_rois_num = nms_rois_num != nullptr;
  auto score_dims = phi::vectorize(scores.dims());
  auto score_size = score_dims.size();
  // Scores with rank 2 indicate LoD inputs (variable box count per image).
  bool is_lod = score_size == 2 ? true : false;

  int n = 0;
  int b = 0;
  int class_num = scores.dims()[1];
  int out_dim = bboxes.dims()[2] + 2;
  int boxes_count = 0;
  std::vector<int> rois_num_vec;
  rois_num_vec.clear();
  if (is_lod) {
    if (has_rois_num) {
      phi::DenseTensor rois_num_host;
      rois_num_host.Resize(rois_num.get_ptr()->dims());
      ctx.template HostAlloc<int>(&rois_num_host);
      phi::Copy(ctx,
                *rois_num.get_ptr(),
                rois_num_host.place(),
                false,
                &rois_num_host);
      n = rois_num.get_ptr()->numel();
      for (int i = 0; i < n; i++) {
        rois_num_vec.push_back(rois_num_host.data<int>()[i]);
        boxes_count += rois_num_host.data<int>()[i];
      }
    } else {
      auto lod = bboxes.lod().back();
      boxes_count = lod[lod.size() - 1];
      n = lod.size() - 1;
      for (int i = 0; i < n; i++) {
        rois_num_vec.push_back(lod[i + 1] - lod[i]);
      }
    }
    PADDLE_ENFORCE_EQ(boxes_count == bboxes.dims()[0],
                      true,
                      phi::errors::InvalidArgument(
                          "boxes_count should equal boxes->dims()[0]. "
                          "But received: (%d) and (%d)",
                          boxes_count,
                          bboxes.dims()[0]));
    PADDLE_ENFORCE_EQ(
        boxes_count == score_dims[0],
        true,
        phi::errors::InvalidArgument("boxes_count should equal score_dims[0]. "
                                     "But received: (%d) and (%d)",
                                     boxes_count,
                                     score_dims[0]));
  } else {
    n = bboxes.dims()[0];
    b = bboxes.dims()[1];
    boxes_count = n * b;
  }

  std::vector<T> outs_vec_;
  std::vector<int> out_index_vec_;
  outs_vec_.resize(boxes_count * out_dim);
  out_index_vec_.resize(boxes_count);

  // batch_starts holds the cumulative offsets of kept boxes per image.
  std::vector<size_t> batch_starts;

  int r = 0;
  r = xpu::multiclass_nms(ctx.x_context(),
                          bboxes_data,
                          scores_data,
                          rois_num_vec,
                          outs_vec_,
                          out_index_vec_,
                          batch_starts,
                          n,
                          b,
                          class_num,
                          out_dim,
                          nums_top_k,
                          score_threshold,
                          keep_top_k,
                          nms_threshold,
                          background_label,
                          normalized,
                          nms_eta,
                          return_index,
                          is_lod);
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "multiclass_nms");

  uint64_t num_kept = batch_starts.back();
  if (num_kept == 0) {
    if (return_index) {
      // out_dim may be zero when there is no object in the picture, so add
      // some zeros to it.
      // caution: results may differ between cpu and xpu due to this operation
      out->Resize({1, out_dim});
      ctx.template Alloc<T>(out);
      T* out_ptr = out->template data<T>();
      std::vector<T> temp_value(out_dim, 0.0f);
      memory_utils::Copy(ctx.GetPlace(),
                         out_ptr,
                         phi::CPUPlace(),
                         temp_value.data(),
                         1 * out_dim * sizeof(T));
      index->Resize({1, 1});
      ctx.template Alloc<int>(index);
      int* out_index_ptr = index->template data<int>();
      std::vector<int> temp_idx(1, 0);
      memory_utils::Copy(ctx.GetPlace(),
                         out_index_ptr,
                         phi::CPUPlace(),
                         temp_idx.data(),
                         1 * sizeof(int));
    } else {
      out->Resize({1, 1});
      T* od = ctx.template Alloc<T>(out);
      od[0] = -1;
      batch_starts = {0, 1};
    }
  } else {
    out->Resize({static_cast<int64_t>(num_kept), out_dim});
    ctx.template Alloc<T>(out);
    T* out_ptr = out->template data<T>();
    memory_utils::Copy(ctx.GetPlace(),
                       out_ptr,
                       phi::CPUPlace(),
                       outs_vec_.data(),
                       num_kept * out_dim * sizeof(T));
    if (return_index) {
      index->Resize({static_cast<int64_t>(num_kept), 1});
      ctx.template Alloc<int>(index);
      int* out_index_ptr = index->template data<int>();
      memory_utils::Copy(ctx.GetPlace(),
                         out_index_ptr,
                         phi::CPUPlace(),
                         out_index_vec_.data(),
                         num_kept * sizeof(int));
    }
  }

  if (return_rois_num) {
    // Number of kept boxes per image is the difference of adjacent offsets.
    nms_rois_num->Resize({n});
    ctx.template Alloc<int>(nms_rois_num);
    DenseTensor nms_rois_num_cpu;
    nms_rois_num_cpu.Resize({nms_rois_num->numel()});
    ctx.template HostAlloc<int>(&nms_rois_num_cpu);
    int* nms_rois_num_cpu_data = nms_rois_num_cpu.data<int>();
    for (int i = 1; i <= n; i++) {
      nms_rois_num_cpu_data[i - 1] = batch_starts[i] - batch_starts[i - 1];
    }
    phi::Copy(ctx,
              nms_rois_num_cpu,
              nms_rois_num->place(),
              true,
              nms_rois_num);
  }

  LoD lod;
  if (num_kept == 0) {
    batch_starts[batch_starts.size() - 1] = 1;
  }
  lod.emplace_back(batch_starts);
  if (return_index) {
    index->set_lod(lod);
  }
  out->set_lod(lod);
}

}  // namespace phi

PD_REGISTER_KERNEL(
    multiclass_nms3, XPU, ALL_LAYOUT, phi::MultiClassNMSKernel, float) {
  kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
  kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
}