diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index c751e8515829d06970c55f097f50de8bf33ee2a4..3937884ce4a5a16a1093ac8977033eaa98b2678e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); auto dst_gpu_place = boost::get(place_); memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3d121e046004dfe6fc6953e0b23852b9ecda5c1b..102147a493ed1454db1a78124200f163f68e555b 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); auto dst_gpu_place = boost::get(place_); memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 6a92762896b89a06a91cd11fb38587f7df69e6c3..acd5993154ed03f206f20082231feb5059ef32e1 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { vars->mutable_data(ctx.GetPlace()); framework::Tensor d_temp; - framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp); // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. int blockx = std::min( - static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); + static_cast(((feature_width * num_priors + 31) >> 5) << 5), + 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height);