From 50cafa0b0c03116903016552630a818230cce003 Mon Sep 17 00:00:00 2001
From: zlsh80826 <rewang@nvidia.com>
Date: Fri, 19 Mar 2021 10:45:55 +0800
Subject: [PATCH] remove redundant sync, set collect/dist kernel to context
 stream, sub_lod memcpy opt (#31641)

---
 .../detection/collect_fpn_proposals_op.cu        |  4 ++--
 .../detection/distribute_fpn_proposals_op.cu     | 16 ++++++++--------
 .../operators/detection/generate_proposals_op.cu |  1 -
 3 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index bc74c80e03..1796a79b71 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int threads = kNumCUDAThreads;
 
     // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
+    GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+        real_post_num, out_id_data, length_lod_data);
     std::vector<int> length_lod_cpu(lod_size);
     memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
                  length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index cc61035309..1bec37e711 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int dist_blocks = NumBlocks(roi_num);
     int threads = kNumCUDAThreads;
     // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
+    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
         roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
         max_level, min_level, roi_batch_id_list_gpu.data<int>(),
         sub_lod_list_data, target_lvls_data, pixel_offset);
-    dev_ctx.Wait();
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
 
     Tensor index_in_t;
@@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
 
+    std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+    memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+                 sub_lod_list_data, sizeof(int) * lod_size * num_level,
+                 dev_ctx.stream());
+    dev_ctx.Wait();
+
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
       // transfer length-based lod to offset-based lod
       std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
-                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
       for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+        offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
       }
 
       int sub_rois_num = offset.back();
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 8359fbab51..e8ab628db1 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
                    scores.data<T>(), sizeof(T) * scores.numel(),
                    dev_ctx.stream());
-      dev_ctx.Wait();
       num_proposals += proposals.dims()[0];
       offset.emplace_back(num_proposals);
       tmp_num.push_back(proposals.dims()[0]);
-- 
GitLab