diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 27852d43948327c498e73a51e4151a25c31f64c3..6d271766b0ed2de1853179e813034f23c4e46a1f 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace())), - pixel_offset); + auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); + + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, mask_dev, pixel_offset); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + std::vector mask_host(boxes_num * col_blocks); + memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev, + boxes_num * col_blocks * sizeof(uint64_t), ctx.stream()); + std::vector keep_vec; int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { @@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, if (!(remv[nblock] & (1ULL << inblock))) { ++num_to_keep; keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; + uint64_t *p = mask_host.data() + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; }