diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index a0b99377109aef4776fadd68101d011a9191b1cc..2dfd9befdb7e536f388e439dc1449a709185509c 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -286,7 +286,8 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, } int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), - sizeof(int) * num_to_keep, 0); + sizeof(int) * num_to_keep, ctx.stream()); + ctx.Wait(); } template @@ -329,7 +330,8 @@ static std::pair ProposalForOneImage( int keep_num; const auto gpu_place = boost::get(ctx.GetPlace()); memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, - keep_num_t.data(), sizeof(int), 0); + keep_num_t.data(), sizeof(int), ctx.stream()); + ctx.Wait(); keep_index.Resize({keep_num}); Tensor scores_filter, proposals_filter; @@ -438,9 +440,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { Tensor &scores = box_score_pair.second; memory::Copy(place, rpn_rois_data + num_proposals * 4, place, - proposals.data(), sizeof(T) * proposals.numel(), 0); + proposals.data(), sizeof(T) * proposals.numel(), + dev_ctx.stream()); memory::Copy(place, rpn_roi_probs_data + num_proposals, place, - scores.data(), sizeof(T) * scores.numel(), 0); + scores.data(), sizeof(T) * scores.numel(), + dev_ctx.stream()); + dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); }