未验证 提交 01da2584 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #10202 from reyoung/feature/clean_memcpy_async

Clean memcpy async
...@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
platform::DeviceContextPool &pool = paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
platform::DeviceContextPool::Instance(); sizeof(CUR_TYPE) * tensor.numel(),
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>( cudaMemcpyDeviceToHost);
pool.Get(tensor.place()));
paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream());
dev_ctx->Wait();
#else #else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
...@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray( ...@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
self->Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place); auto *dst = self->mutable_data<T>(place);
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); cudaMemcpyHostToDevice);
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx->Wait();
} }
template <> template <>
...@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray( ...@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
self->Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place); auto *dst = self->mutable_data<platform::float16>(place);
paddle::platform::GpuMemcpySync(dst, array.data(),
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(),
sizeof(uint16_t) * array.size(), sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice);
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx->Wait();
} }
template <typename T> template <typename T>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册