diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 6d271766b0ed2de1853179e813034f23c4e46a1f..725983f8153e4f14f41552ae26800d363a863ec1 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, + sizeof(T) * 8, ctx.stream()); // Allocate temporary storage auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); + idx_out, num, 0, sizeof(T) * 8, ctx.stream()); } template diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 1796a79b71b0683cb6159aa4cefcd2af7d6ba076..ffd9ac6b2af806ed92bea6484a077cc83de7af3c 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( nullptr, temp_storage_bytes, concat_scores.data(), keys_out, idx_in, - idx_out, total_roi_num); + idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort score to get corresponding index cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data(), - keys_out, idx_in, idx_out, total_roi_num); + keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8, + dev_ctx.stream()); index_out_t.Resize({real_post_num}); Tensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); @@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, sorted_batch_id.data(), out_id_data, - batch_idx_in, index_out_t.data(), real_post_num); + batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort batch_id to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data(), - out_id_data, batch_idx_in, index_out_t.data(), real_post_num); + out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1bec37e7112cc8bd112a9402f83ad9965bcef16d..7ccb354e1773a35957f9df97d26d79d50cbd4fd6 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -149,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -159,14 +159,14 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // sort target level to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int* restore_idx_data = restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); // sort current index to get restore index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, - restore_idx_data, roi_num); + restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum");