diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 07c43554bc6a0d71d688a5a5772d0ab3d2de319a..e6ee598db04dd9e0075b39a50d1d4e878d73086d 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -58,12 +58,13 @@ void GetTensorPayload(framework::Variable* var,
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    platform::CPUPlace cpu;
+    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = memory::Alloc(cpu, copy_size);
+    *payload = memory::Alloc(cuda_pinned, copy_size);
-    memory::Copy(cpu, *payload, boost::get<platform::CUDAPlace>(tensor.place()),
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor.place()),
                  reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
                  gpu_dev_ctx.stream());
     ctx.Wait();
@@ -90,11 +91,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
   auto* tensor = slr->mutable_value();
   if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-    platform::CPUPlace cpu;
+    platform::CUDAPinnedPlace cuda_pinned;
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
     auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = memory::Alloc(cpu, copy_size);
-    memory::Copy(cpu, *payload,
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+    memory::Copy(cuda_pinned, *payload,
                  boost::get<platform::CUDAPlace>(tensor->place()),
                  reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
                  gpu_dev_ctx.stream());
@@ -145,8 +146,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     // GPU data is copied to CPU buffer when sending,
     // free the buffer when possible.
     destroy_callback = [](void* backing) {
-      platform::CPUPlace cpu;
-      memory::Free(cpu, backing);
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
     };
   }
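
Not part of the patch: a minimal standalone CUDA sketch of why the payload buffer is moved from pageable CPU memory (CPUPlace) to pinned memory (CUDAPinnedPlace). A device-to-host cudaMemcpyAsync on a stream only runs truly asynchronously when the host buffer is page-locked; with pageable memory the copy falls back to a staged, synchronous path. The buffer size, variable names, and main() harness below are illustrative assumptions, not Paddle APIs.

// Sketch only (plain CUDA runtime API, not Paddle code).
#include <cuda_runtime.h>

int main() {
  const size_t copy_size = 1 << 20;         // 1 MiB payload, as an example
  void* dev_buf = nullptr;
  void* pinned_buf = nullptr;               // plays the role of *payload

  cudaMalloc(&dev_buf, copy_size);          // device-side tensor data
  cudaMallocHost(&pinned_buf, copy_size);   // pinned host buffer, cf. CUDAPinnedPlace

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Device-to-host copy issued on the compute stream; with a pinned
  // destination this is genuinely asynchronous, matching the intent of
  // memory::Copy(cuda_pinned, ..., gpu_dev_ctx.stream()) in the diff.
  cudaMemcpyAsync(pinned_buf, dev_buf, copy_size, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);            // corresponds to ctx.Wait()

  // Release the pinned buffer once the serialized payload is no longer
  // needed, cf. the destroy_callback that now frees via CUDAPinnedPlace.
  cudaFreeHost(pinned_buf);
  cudaFree(dev_buf);
  cudaStreamDestroy(stream);
  return 0;
}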