diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bc74c80e0315fac6de3ca575d53b23965adf4179..1796a79b71b0683cb6159aa4cefcd2af7d6ba076 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; // get length-based lod by batch ids - GetLengthLoD<<>>(real_post_num, out_id_data, - length_lod_data); + GetLengthLoD<<>>( + real_post_num, out_id_data, length_lod_data); std::vector length_lod_cpu(lod_size); memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place, length_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index cc61035309eaab31534119ab088bf537bf71c242..1bec37e7112cc8bd112a9402f83ad9965bcef16d 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int dist_blocks = NumBlocks(roi_num); int threads = kNumCUDAThreads; // get target levels and sub_lod list - GPUDistFpnProposalsHelper<<>>( + GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - dev_ctx.Wait(); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); Tensor index_in_t; @@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + std::vector sub_lod_list_cpu(lod_size * num_level); + memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place, + sub_lod_list_data, sizeof(int) * lod_size * num_level, + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); - int* sub_lod_data = sub_lod.data(); // transfer length-based lod to offset-based lod std::vector offset(1, 0); - std::vector sub_lod_cpu(lod_size); - memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, - sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); - dev_ctx.Wait(); for (int j = 0; j < lod_size; ++j) { - offset.emplace_back(offset.back() + sub_lod_cpu[j]); + offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]); } int sub_rois_num = offset.back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 8359fbab519b36f58fbeaf02082f02a1372993fc..e8ab628db16bdd591adb670bafc5e05aeac8efed 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { memory::Copy(place, rpn_roi_probs_data + num_proposals, place, scores.data(), sizeof(T) * scores.numel(), dev_ctx.stream()); - dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); tmp_num.push_back(proposals.dims()[0]);