diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 6d271766b0ed2de1853179e813034f23c4e46a1f..725983f8153e4f14f41552ae26800d363a863ec1 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0,
+      sizeof(T) * 8, ctx.stream());
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
@@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
+      idx_out, num, 0, sizeof(T) * 8, ctx.stream());
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 1796a79b71b0683cb6159aa4cefcd2af7d6ba076..ffd9ac6b2af806ed92bea6484a077cc83de7af3c 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     size_t temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
+        idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort score to get corresponding index
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
+        keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8,
+        dev_ctx.stream());
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
+        batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort batch_id to get corresponding index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
+        sizeof(int) * 8, dev_ctx.stream());
 
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
 
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 1bec37e7112cc8bd112a9402f83ad9965bcef16d..7ccb354e1773a35957f9df97d26d79d50cbd4fd6 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -149,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
-                                              target_lvls_data, keys_out,
-                                              idx_in, idx_out, roi_num);
+    cub::DeviceRadixSort::SortPairs<int, int>(
+        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
+        idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
 
@@ -159,14 +159,14 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // sort target level to get corresponding index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
+        idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
+        restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream());
 
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");