From 51a9fca3239ac7578eb739e0d44136ebeaec969d Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 24 Dec 2018 18:52:58 +0800 Subject: [PATCH] Async memory copy (#15013) --- paddle/fluid/inference/api/analysis_predictor.cc | 7 +++++-- paddle/fluid/inference/api/api_impl.cc | 7 +++++-- paddle/fluid/operators/detection/density_prior_box_op.cu | 5 +++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index c751e85158..3937884ce4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); auto dst_gpu_place = boost::get(place_); memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3d121e0460..102147a493 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); auto dst_gpu_place = boost::get(place_); memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), - inputs[i].data.length(), - 0); // stream 0 for sync copy + inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 6a92762896..acd5993154 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { vars->mutable_data(ctx.GetPlace()); framework::Tensor d_temp; - framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp); // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. int blockx = std::min( - static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); + static_cast(((feature_width * num_priors + 31) >> 5) << 5), + 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); -- GitLab