remove redundant sync, set collect/dist kernel to context stream, sub_lod memcpy opt (#31641)

50cafa0b · zlsh80826 · GitHub · 1d197f6c · 50cafa0b · 50cafa0b
3 changed file
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
    int threads = kNumCUDAThreads;

    // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
+    GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+        real_post_num, out_id_data, length_lod_data);
    std::vector<int> length_lod_cpu(lod_size);
    memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
                 length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());

--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
    int dist_blocks = NumBlocks(roi_num);
    int threads = kNumCUDAThreads;
    // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
+    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
        sub_lod_list_data, target_lvls_data, pixel_offset);
-    dev_ctx.Wait();
    auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());

    Tensor index_in_t;
@@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
    int start = 0;
    auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");

+    std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+    memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+                 sub_lod_list_data, sizeof(int) * lod_size * num_level,
+                 dev_ctx.stream());
+    dev_ctx.Wait();
+
    for (int i = 0; i < num_level; ++i) {
      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
      // transfer length-based lod to offset-based lod
      std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
-                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
      for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+        offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
      }

      int sub_rois_num = offset.back();

--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
      memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
                   scores.data<T>(), sizeof(T) * scores.numel(),
                   dev_ctx.stream());
-      dev_ctx.Wait();
      num_proposals += proposals.dims()[0];
      offset.emplace_back(num_proposals);
      tmp_num.push_back(proposals.dims()[0]);