From c86e771e9498674a3c8686f1a6d455ee6e294607 Mon Sep 17 00:00:00 2001
From: zlsh80826
Date: Fri, 19 Mar 2021 10:52:06 +0800
Subject: [PATCH] NMS Performance Optimization (#31634)

* replace mask vector to raw ptr
* launch nms on context stream
* remove redundant mask declaration
---
 paddle/fluid/operators/detection/bbox_util.cu.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 27852d43948..6d271766b0e 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
 
   const T *boxes = proposals.data<T>();
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-  framework::Vector<uint64_t> mask(boxes_num * col_blocks);
-  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes,
-                                 mask.CUDAMutableData(BOOST_GET_CONST(
-                                     platform::CUDAPlace, ctx.GetPlace())),
-                                 pixel_offset);
+  auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t));
+  uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
+
+  NMSKernel<<<blocks, threads, 0, ctx.stream()>>>(
+      boxes_num, nms_threshold, boxes, mask_dev, pixel_offset);
 
   std::vector<uint64_t> remv(col_blocks);
   memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
 
+  std::vector<uint64_t> mask_host(boxes_num * col_blocks);
+  memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev,
+               boxes_num * col_blocks * sizeof(uint64_t), ctx.stream());
+
   std::vector<int> keep_vec;
   int num_to_keep = 0;
   for (int i = 0; i < boxes_num; i++) {
@@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
     if (!(remv[nblock] & (1ULL << inblock))) {
       ++num_to_keep;
       keep_vec.push_back(i);
-      uint64_t *p = &mask[0] + i * col_blocks;
+      uint64_t *p = mask_host.data() + i * col_blocks;
       for (int j = nblock; j < col_blocks; j++) {
         remv[j] |= p[j];
       }
-- 
GitLab