From 50cafa0b0c03116903016552630a818230cce003 Mon Sep 17 00:00:00 2001
From: zlsh80826
Date: Fri, 19 Mar 2021 10:45:55 +0800
Subject: [PATCH] remove redundant sync, set collect/dist kernel to context
 stream, sub_lod memcpy opt (#31641)

---
 .../detection/collect_fpn_proposals_op.cu        |  4 ++--
 .../detection/distribute_fpn_proposals_op.cu     | 16 ++++++++--------
 .../operators/detection/generate_proposals_op.cu |  1 -
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index bc74c80e031..1796a79b71b 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int threads = kNumCUDAThreads;
 
     // get length-based lod by batch ids
-    GetLengthLoD<<<blocks, threads>>>(real_post_num, out_id_data,
-                                      length_lod_data);
+    GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
+        real_post_num, out_id_data, length_lod_data);
     std::vector<int> length_lod_cpu(lod_size);
     memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place,
                  length_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index cc61035309e..1bec37e7112 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int dist_blocks = NumBlocks(roi_num);
     int threads = kNumCUDAThreads;
     // get target levels and sub_lod list
-    GPUDistFpnProposalsHelper<<<dist_blocks, threads>>>(
+    GPUDistFpnProposalsHelper<<<dist_blocks, threads, 0, dev_ctx.stream()>>>(
         roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
         max_level, min_level, roi_batch_id_list_gpu.data<int>(),
         sub_lod_list_data, target_lvls_data, pixel_offset);
-    dev_ctx.Wait();
 
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
     Tensor index_in_t;
@@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
+    std::vector<int> sub_lod_list_cpu(lod_size * num_level);
+    memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place,
+                 sub_lod_list_data, sizeof(int) * lod_size * num_level,
+                 dev_ctx.stream());
+    dev_ctx.Wait();
+
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
-      int* sub_lod_data = sub_lod.data<int>();
       // transfer length-based lod to offset-based lod
       std::vector<size_t> offset(1, 0);
-      std::vector<int> sub_lod_cpu(lod_size);
-      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
-                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
-      dev_ctx.Wait();
       for (int j = 0; j < lod_size; ++j) {
-        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
+        offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]);
       }
 
       int sub_rois_num = offset.back();
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 8359fbab519..e8ab628db16 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       memory::Copy(place, rpn_roi_probs_data + num_proposals, place,
                    scores.data<T>(), sizeof(T) * scores.numel(),
                    dev_ctx.stream());
-      dev_ctx.Wait();
      num_proposals += proposals.dims()[0];
      offset.emplace_back(num_proposals);
      tmp_num.push_back(proposals.dims()[0]);
--
GitLab
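
Note: the pattern behind all three changes, shown as a minimal standalone CUDA sketch. This is not PaddlePaddle code; the kernel name FillLengths and the buffer sizes are illustrative only. The idea is to launch work on an explicit stream, batch the per-level device-to-host copies into one asynchronous copy on that same stream, and synchronize the stream once, instead of issuing a blocking device-wide wait after every kernel launch or per-level copy.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Illustrative kernel: writes one length value per slot (stands in for the
// per-level length computations done by GetLengthLoD / GPUDistFpnProposalsHelper).
__global__ void FillLengths(int n, int* lengths) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) lengths[i] = i + 1;
}

int main() {
  const int num_level = 4, lod_size = 8, n = num_level * lod_size;
  int* d_lengths = nullptr;
  cudaMalloc(&d_lengths, n * sizeof(int));

  cudaStream_t stream;  // stands in for dev_ctx.stream()
  cudaStreamCreate(&stream);

  // Launch on the explicit stream; no device-wide wait afterwards.
  FillLengths<<<(n + 255) / 256, 256, 0, stream>>>(n, d_lengths);

  // One batched device-to-host copy of all levels, then a single stream sync,
  // instead of num_level copies each followed by a blocking wait.
  // (Pinned host memory would make this copy truly asynchronous.)
  std::vector<int> h_lengths(n);
  cudaMemcpyAsync(h_lengths.data(), d_lengths, n * sizeof(int),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  // Host-side accumulation per level, analogous to turning the length-based
  // lod into an offset-based lod in the patch.
  for (int i = 0; i < num_level; ++i) {
    int offset = 0;
    for (int j = 0; j < lod_size; ++j) offset += h_lengths[i * lod_size + j];
    std::printf("level %d total: %d\n", i, offset);
  }

  cudaStreamDestroy(stream);
  cudaFree(d_lengths);
  return 0;
}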