未验证 提交 01da2584 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #10202 from reyoung/feature/clean_memcpy_async

Clean memcpy async
......@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace()));
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
pool.Get(tensor.place()));
paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream());
dev_ctx->Wait();
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
......@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx->Wait();
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice);
}
template <>
......@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(),
paddle::platform::GpuMemcpySync(dst, array.data(),
sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
// NOTE: For safety, here wait the copy complete.
// It because the CPU array.data() could be destroyed after this method.
// If we make this method async, it could be copied data from a memory buffer
// that has been freed.
dev_ctx->Wait();
cudaMemcpyHostToDevice);
}
template <typename T>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册