From 1c67cf0c987b0b47f846554c148690a4ef08b9d4 Mon Sep 17 00:00:00 2001
From: zlsh80826
Date: Fri, 19 Mar 2021 15:27:23 +0800
Subject: [PATCH] run radix sort of proposals layer on context stream (#31631)

---
 paddle/fluid/operators/detection/bbox_util.cu.h       |  5 +++--
 .../operators/detection/collect_fpn_proposals_op.cu   | 11 +++++++----
 .../detection/distribute_fpn_proposals_op.cu          | 10 +++++-----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 6d271766b0e..725983f8153 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
   cub::DeviceRadixSort::SortPairsDescending(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0,
+      sizeof(T) * 8, ctx.stream());
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
@@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
+      idx_out, num, 0, sizeof(T) * 8, ctx.stream());
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 1796a79b71b..ffd9ac6b2af 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     size_t temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairsDescending(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
+        idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort score to get corresponding index
     cub::DeviceRadixSort::SortPairsDescending(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
+        keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8,
+        dev_ctx.stream());
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairs(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
+        batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort batch_id to get corresponding index
     cub::DeviceRadixSort::SortPairs(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
 
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
 
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 1bec37e7112..7ccb354e177 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -149,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes,
-                                    target_lvls_data, keys_out,
-                                    idx_in, idx_out, roi_num);
+    cub::DeviceRadixSort::SortPairs(
+        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
+        idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -159,14 +159,14 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort target level to get corresponding index
     cub::DeviceRadixSort::SortPairs(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
+        idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
     cub::DeviceRadixSort::SortPairs(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
+        restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
-- 
GitLab
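
For reference, every hunk above adds the same three trailing arguments to the cub::DeviceRadixSort calls: begin_bit = 0, end_bit = sizeof(KeyT) * 8, and the device context's CUDA stream, so the sort is enqueued on the context stream rather than the legacy default stream. Below is a minimal, self-contained sketch of that call pattern; it is not Paddle code, and the 8-element test arrays and the locally created stream are illustrative stand-ins for Paddle's tensors and ctx.stream()/dev_ctx.stream().

// sort_demo.cu -- compile with: nvcc -o sort_demo sort_demo.cu
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int num = 8;
  float h_keys[num] = {0.3f, 0.9f, 0.1f, 0.7f, 0.5f, 0.2f, 0.8f, 0.4f};
  int h_idx[num] = {0, 1, 2, 3, 4, 5, 6, 7};

  float *d_keys_in = nullptr, *d_keys_out = nullptr;
  int *d_idx_in = nullptr, *d_idx_out = nullptr;
  cudaMalloc((void **)&d_keys_in, num * sizeof(float));
  cudaMalloc((void **)&d_keys_out, num * sizeof(float));
  cudaMalloc((void **)&d_idx_in, num * sizeof(int));
  cudaMalloc((void **)&d_idx_out, num * sizeof(int));
  cudaMemcpy(d_keys_in, h_keys, num * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_idx_in, h_idx, num * sizeof(int), cudaMemcpyHostToDevice);

  // Stand-in for the device context's stream (ctx.stream() in the patch).
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Pass 1: with d_temp_storage == nullptr CUB only fills temp_storage_bytes.
  // The trailing arguments are begin_bit, end_bit, and the stream --
  // the same arguments the patch adds in the proposal kernels.
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortPairsDescending(
      nullptr, temp_storage_bytes, d_keys_in, d_keys_out, d_idx_in, d_idx_out,
      num, 0, sizeof(float) * 8, stream);

  void *d_temp_storage = nullptr;
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Pass 2: the actual sort, enqueued on `stream` instead of stream 0.
  cub::DeviceRadixSort::SortPairsDescending(
      d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_idx_in,
      d_idx_out, num, 0, sizeof(float) * 8, stream);
  cudaStreamSynchronize(stream);

  int h_sorted_idx[num];
  cudaMemcpy(h_sorted_idx, d_idx_out, num * sizeof(int),
             cudaMemcpyDeviceToHost);
  for (int i = 0; i < num; ++i) printf("%d ", h_sorted_idx[i]);
  printf("\n");  // indices of keys in descending order: 1 6 3 4 7 0 5 2

  cudaFree(d_temp_storage);
  cudaFree(d_keys_in);
  cudaFree(d_keys_out);
  cudaFree(d_idx_in);
  cudaFree(d_idx_out);
  cudaStreamDestroy(stream);
  return 0;
}

Without the explicit arguments, CUB falls back to its defaults and issues both the size query and the sort on the legacy default stream, while the surrounding proposal kernels run on the context stream; passing the stream explicitly keeps the radix sort ordered with the rest of the work on that stream, which is what the commit title refers to.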