Commit 0c24b3f9 authored by Yu Yang

Clean memcpy async

Parent bfbbe19f
@@ -67,7 +67,6 @@ void FetchOpHandle::RunImpl() {
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
-      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
...
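This first hunk deletes an explicit Wait() after TensorCopy; the commit's theme is that a copy which synchronizes itself needs no separate wait. As a minimal sketch of the two patterns in raw CUDA (illustrative buffer and stream names, not Paddle's API):

```cpp
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
  std::vector<float> host(1024, 1.0f);
  float *dev = nullptr;
  cudaMalloc(&dev, host.size() * sizeof(float));
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Pattern removed by this commit: enqueue an async copy, then block
  // on the stream so the result is safe to use.
  cudaMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);  // the explicit Wait()

  // Pattern the commit moves toward: a blocking copy that returns only
  // after the transfer has finished, so no separate wait is needed.
  cudaMemcpy(dev, host.data(), host.size() * sizeof(float),
             cudaMemcpyHostToDevice);

  cudaStreamDestroy(stream);
  cudaFree(dev);
  printf("both copies complete\n");
  return 0;
}
```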
@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Get(tensor.place()));
-        paddle::platform::GpuMemcpyAsync(
-            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
-            cudaMemcpyDeviceToHost, dev_ctx->stream());
-        dev_ctx->Wait();
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
 #else
         PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
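The hunk above swaps a DeviceContextPool lookup, an async copy, and a wait for a single GpuMemcpySync call. A plausible minimal sketch of such a helper, assuming it simply wraps the blocking cudaMemcpy (an assumption for illustration, not Paddle's verified implementation):

```cpp
#include <cassert>
#include <cuda_runtime.h>

// Hypothetical sketch of a GpuMemcpySync-style helper; the real
// paddle::platform::GpuMemcpySync may differ, e.g. in error handling.
inline void GpuMemcpySyncSketch(void *dst, const void *src, size_t count,
                                cudaMemcpyKind kind) {
  // cudaMemcpy blocks until the transfer completes, so callers need
  // neither a stream handle nor an explicit wait afterwards.
  cudaError_t err = cudaMemcpy(dst, src, count, kind);
  assert(err == cudaSuccess);
}
```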
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<T>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <>
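The NOTE being deleted documents the hazard that motivated the old Wait(): array.data() lives on the CPU and may be freed as soon as the method returns, so an async copy could read a dead buffer. A minimal sketch of that unsafe pattern (illustrative names, not Paddle code):

```cpp
#include <cuda_runtime.h>
#include <vector>

// UNSAFE: the async copy may still be in flight when `host` is
// destroyed at the end of this function, so the device could read
// freed memory. The old dev_ctx->Wait() (and the new blocking
// GpuMemcpySync) both close exactly this window.
void UnsafeUpload(float *dev, size_t n, cudaStream_t stream) {
  std::vector<float> host(n, 1.0f);  // temporary host-side buffer
  cudaMemcpyAsync(dev, host.data(), n * sizeof(float),
                  cudaMemcpyHostToDevice, stream);
  // Missing: cudaStreamSynchronize(stream);
}  // `host` is freed here, possibly before the DMA engine reads it
```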
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<platform::float16>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(),
-                                   sizeof(uint16_t) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(),
+                                  sizeof(uint16_t) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <typename T>
...
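The float16 specialization sizes the transfer with sizeof(uint16_t) rather than a float16 type. That is only correct while a half-precision element occupies exactly 16 bits, which a guard like the following would make explicit (a hypothetical stand-in type for illustration; platform::float16 itself is not shown in this diff):

```cpp
#include <cstdint>

// Hypothetical stand-in for platform::float16: a half-precision value
// stored as a single 16-bit word.
struct Float16Sketch { uint16_t payload; };

static_assert(sizeof(Float16Sketch) == sizeof(uint16_t),
              "the hunk above copies sizeof(uint16_t) bytes per element, "
              "which assumes float16 occupies exactly 16 bits");
```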