未验证 提交 c86e771e 编写于 作者: Z zlsh80826 提交者: GitHub

NMS Performance Optimization (#31634)

* replace mask vector with raw ptr

* launch the NMS kernel on the context stream

* remove redundant mask declaration
上级 50cafa0b
...@@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, ...@@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const T *boxes = proposals.data<T>(); const T *boxes = proposals.data<T>();
auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
framework::Vector<uint64_t> mask(boxes_num * col_blocks); auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t));
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
mask.CUDAMutableData(BOOST_GET_CONST(
platform::CUDAPlace, ctx.GetPlace())), NMSKernel<<<blocks, threads, 0, ctx.stream()>>>(
pixel_offset); boxes_num, nms_threshold, boxes, mask_dev, pixel_offset);
std::vector<uint64_t> remv(col_blocks); std::vector<uint64_t> remv(col_blocks);
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev,
boxes_num * col_blocks * sizeof(uint64_t), ctx.stream());
std::vector<int> keep_vec; std::vector<int> keep_vec;
int num_to_keep = 0; int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) { for (int i = 0; i < boxes_num; i++) {
...@@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, ...@@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
if (!(remv[nblock] & (1ULL << inblock))) { if (!(remv[nblock] & (1ULL << inblock))) {
++num_to_keep; ++num_to_keep;
keep_vec.push_back(i); keep_vec.push_back(i);
uint64_t *p = &mask[0] + i * col_blocks; uint64_t *p = mask_host.data() + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) { for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j]; remv[j] |= p[j];
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册