提交 8a49a887 作者: yi.wu

Use pinned memory when sending

上级 dac0679a
...@@ -58,12 +58,13 @@ void GetTensorPayload(framework::Variable* var, ...@@ -58,12 +58,13 @@ void GetTensorPayload(framework::Variable* var,
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CPUPlace cpu; platform::CUDAPinnedPlace cuda_pinned;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx); auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
*payload = memory::Alloc(cpu, copy_size); *payload = memory::Alloc(cuda_pinned, copy_size);
memory::Copy(cpu, *payload, boost::get<platform::CUDAPlace>(tensor.place()), memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()), copy_size, reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
gpu_dev_ctx.stream()); gpu_dev_ctx.stream());
ctx.Wait(); ctx.Wait();
...@@ -90,11 +91,11 @@ void GetSelectedRowsPayload(framework::Variable* var, ...@@ -90,11 +91,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu; platform::CUDAPinnedPlace cuda_pinned;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx); auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
*payload = memory::Alloc(cpu, copy_size); *payload = memory::Alloc(cuda_pinned, copy_size);
memory::Copy(cpu, *payload, memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor->place()), boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()), copy_size, reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
gpu_dev_ctx.stream()); gpu_dev_ctx.stream());
...@@ -145,8 +146,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -145,8 +146,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
// GPU data is copied to CPU buffer when sending, // GPU data is copied to CPU buffer when sending,
// free the buffer when possible. // free the buffer when possible.
destroy_callback = [](void* backing) { destroy_callback = [](void* backing) {
platform::CPUPlace cpu; platform::CUDAPinnedPlace cuda_pinned;
memory::Free(cpu, backing); memory::Free(cuda_pinned, backing);
}; };
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册