diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 423449abff97dbf70d81314f852d9135e25f243f..b57c7dab3a086212deb9714b81a0f861dfeac5e2 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -67,7 +67,6 @@ void FetchOpHandle::RunImpl() {
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
-      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 159d1d5f4e70033fabf93514bd63b38f83675bff..dcd711a33ff3a35fdd51d11f54a3343a0bb491c9 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -63,15 +63,9 @@ struct CastToPyBufferImpl {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Get(tensor.place()));
-
-        paddle::platform::GpuMemcpyAsync(
-            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
-            cudaMemcpyDeviceToHost, dev_ctx->stream());
-        dev_ctx->Wait();
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
 #else
         PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<T>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <>
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<platform::float16>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(),
-                                   sizeof(uint16_t) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(),
+                                  sizeof(uint16_t) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template
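
The NOTE removed above describes the hazard this patch guards against: an asynchronous host-to-device copy may still be reading the Python-side buffer (`array.data()`) after the binding returns and that buffer is freed. The patch keeps the blocking behavior but folds it into a single `GpuMemcpySync` call instead of an async copy followed by a full device-context `Wait()`. Below is a minimal sketch of the hazard in plain CUDA runtime calls; the function `CopyFromTemporaryHostBuffer`, its buffer, and the stream parameter are hypothetical illustration, not code from this patch.

// Sketch (hypothetical, not from this patch): why the copy must block when
// the host source buffer is a temporary.
#include <cuda_runtime.h>

#include <vector>

void CopyFromTemporaryHostBuffer(float *dst_dev, size_t n,
                                 cudaStream_t stream) {
  std::vector<float> host(n, 1.0f);  // temporary, like the Python array's data

  // UNSAFE pattern (what the removed code compensated for with
  // dev_ctx->Wait()): the async copy may still be reading `host` after this
  // function returns and the vector's storage is freed.
  //
  //   cudaMemcpyAsync(dst_dev, host.data(), n * sizeof(float),
  //                   cudaMemcpyHostToDevice, stream);
  //
  // SAFE pattern (the behavior GpuMemcpySync provides): cudaMemcpy returns
  // only after the copy has completed, so `host` may be destroyed
  // immediately afterwards.
  cudaMemcpy(dst_dev, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
}  // `host` is destroyed here; safe only because the copy already finished.

A blocking memcpy also synchronizes only the copy itself, whereas `dev_ctx->Wait()` stalls until everything queued on that device context's stream has finished, which is more synchronization than the buffer-lifetime concern requires.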