diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 5a546ddd38bec6b731507af1b4be333bf875e033..347d48f6eb116207c422ac7d439121814df0228b 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(platform::CUDAPlace(0))
             .get());
+    ctx_->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     ctx_->PartialInitWithAllocator();
 
     engine_ = new TensorRTEngine(10, 1 << 10);
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
index 9837d3e4fab6e0c2ca80c8d511d953f3cb9e4193..2a98727e4b662f46fa3c3e1d64aa57d28c12fcbd 100644
--- a/paddle/fluid/memory/malloc_test.cu
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -120,6 +120,10 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(place)
             .get());
+    ctx->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     ctx->PartialInitWithAllocator();
     dev_ctx.emplace_back(std::move(ctx));
     MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
@@ -172,6 +176,10 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(place)
             .get());
+    ctx->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     ctx->PartialInitWithAllocator();
     dev_ctx.emplace_back(std::move(ctx));
     threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu
index 27a235765227f15dd412dcd6ad55f2a24471c6da..e5ebdad1e443476baee3f511dc446c40bb59dfd9 100644
--- a/paddle/fluid/operators/feed_forward_test.cu
+++ b/paddle/fluid/operators/feed_forward_test.cu
@@ -292,6 +292,10 @@ class TestFeedForward {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(place_)
             .get());
+    ctx_->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     ctx_->PartialInitWithAllocator();
 
     size_src_ = bsz_seq_ * dim_embed_;  // src: [bs, seq_len, em_dim]
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index ae1df10c45f77e557829249fb5cf56d4978f016e..d05de900e5e77fdeabd22aa1b6ec1571c84480b0 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -199,6 +199,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(ncclComm_t comm, int nranks, int rank,
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetZeroAllocator(CUDAPlace(dev_id))
           .get());
+  dev_ctx->SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx->PartialInitWithAllocator();
 
   std::shared_ptr compute_event(
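Note: every CUDA device-context call site touched by this patch now wires the same four allocators before calling `PartialInitWithAllocator()`. The sketch below restates that sequence in one place; it is an illustration only, `InitCudaDeviceContext`, `ctx`, and `place` are placeholder names, and the host-allocator line is assumed to mirror the `GetAllocator(CPUPlace())` pattern used in the phi test files further down.

```cpp
// Illustrative sketch (not part of the patch): the allocator wiring performed
// at each call site above, assuming a phi::GPUContext* ctx and a CUDAPlace.
void InitCudaDeviceContext(phi::GPUContext* ctx,
                           const paddle::platform::CUDAPlace& place) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Device-memory allocator for regular allocations on `place`.
  ctx->SetAllocator(facade.GetAllocator(place).get());
  // Host allocator (assumed; follows the same facade pattern).
  ctx->SetHostAllocator(
      facade.GetAllocator(paddle::platform::CPUPlace()).get());
  // Allocator used for zero-size requests.
  ctx->SetZeroAllocator(facade.GetZeroAllocator(place).get());
  // New in this patch: allocator backed by CUDA pinned host memory.
  ctx->SetPinnedAllocator(
      facade.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
  ctx->PartialInitWithAllocator();
}
```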
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 1919f59f8c07f2a0a15393fe14f2055f8d0c19bf..4301ef4bcf126db60784da93a326fb08c108e68f 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -113,6 +113,10 @@ struct NCCLContext {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(dev_id))
             .get());
+    ctx_->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     ctx_->PartialInitWithAllocator();
   }
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 5ee54b1c865287a23bd3399ebac488a5db150145..f3934c7d8713b289e5a78d9aa8bc3ce2df46ef13 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -162,6 +162,11 @@ inline void EmplaceDeviceContext(
         dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
                                   .GetAllocator(p)
                                   .get());
+        dev_ctx->SetPinnedAllocator(
+            memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(paddle::platform::CUDAPinnedPlace())
+                .get());
+        cuda_ctx->PartialInitWithAllocator();
         dev_ctx->SetGenerator(
             framework::GetDefaultCUDAGenerator(p.GetDeviceId()).get());
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 851c756b665b8c5f78818069004b9074dc170aab..08a04a9565af7fe9f058a2a05a65ab678f4bf7eb 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -39,6 +39,10 @@ TEST(Device, Init) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     device_context->PartialInitWithAllocator();
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
 
@@ -66,6 +70,10 @@ TEST(Device, CUDADeviceContext) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetPinnedAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(paddle::platform::CUDAPinnedPlace())
+            .get());
     device_context->PartialInitWithAllocator();
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 985d0ef0beb7699547527c3fda1e2e0c896e0554..7a6705e63b420b71787d2ae0b35791e47afa3cda 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -28,8 +28,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_meta_info_helper.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/exception.h"
@@ -536,7 +538,239 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self, PyObject* args,
   return ToPyObject(tensor);
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
+#if defined(PADDLE_WITH_CUDA)
+static PyObject* eager_api_async_read(PyObject* self, PyObject* args,
+                                      PyObject* kwargs) {
+  EAGER_TRY
+  auto& src = GetTensorFromArgs("async_read", "src", args, 0, false);
+  auto& dst = GetTensorFromArgs("async_read", "dst", args, 1, false);
+  auto& index = GetTensorFromArgs("async_read", "index", args, 2, false);
+  auto& buffer = GetTensorFromArgs("async_read", "buffer", args, 3, false);
+  auto& offset = GetTensorFromArgs("async_read", "offset", args, 4, false);
+  auto& count = GetTensorFromArgs("async_read", "count", args, 5, false);
+  PADDLE_ENFORCE_EQ(
+      src.is_gpu_pinned(), true,
+      platform::errors::InvalidArgument("Required `src` device should be "
+                                        "CUDAPinnedPlace, but received %d.",
+                                        src.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      dst.is_gpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `dst` device should be CUDAPlace, but received %d.",
+          dst.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      index.is_cpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `index` device should be CPUPlace, but received %d.",
+          index.inner_place()));
+  PADDLE_ENFORCE_EQ(buffer.is_gpu_pinned(), true,
+                    platform::errors::InvalidArgument(
+                        "Required `buffer` device should be CUDAPinnedPlace, "
+                        "but received %d.",
+                        buffer.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      offset.is_cpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `offset` device should be CPUPlace, but received %d.",
+          offset.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      count.is_cpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `count` device should be CPUPlace, but received %d.",
+          count.inner_place()));
+
+  auto& src_tensor = src;
+  auto* dst_tensor = &dst;
+  auto& index_tensor = index;
+  auto* buffer_tensor = &buffer;
+  auto& offset_tensor = offset;
+  auto& count_tensor = count;
+  auto* dst_data = dst_tensor->mutable_data<float>(dst.place());
+  const auto& deviceId = paddle::platform::GetCurrentDeviceId();
+
+  PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(),
+                    platform::errors::InvalidArgument(
+                        "`src` and `dst` should have the same tensor shape, "
+                        "except for the first dimension."));
+  PADDLE_ENFORCE_EQ(src_tensor.dims().size(), buffer_tensor->dims().size(),
+                    platform::errors::InvalidArgument(
+                        "`src` and `buffer` should have the same tensor shape, "
+                        "except for the first dimension."));
+  for (int i = 1; i < src_tensor.dims().size(); i++) {
+    PADDLE_ENFORCE_EQ(src_tensor.dims()[i], dst_tensor->dims()[i],
+                      platform::errors::InvalidArgument(
+                          "`src` and `dst` should have the same tensor shape, "
+                          "except for the first dimension."));
+    PADDLE_ENFORCE_EQ(
+        src_tensor.dims()[i], buffer_tensor->dims()[i],
+        platform::errors::InvalidArgument(
+            "`src` and `buffer` should have the same tensor shape, "
+            "except for the first dimension."));
+  }
+  PADDLE_ENFORCE_EQ(index_tensor.dims().size(), 1,
+                    platform::errors::InvalidArgument(
+                        "`index` tensor should be one-dimensional."));
+
+  auto stream =
+      paddle::platform::stream::get_current_stream(deviceId)->raw_stream();
+
+  int64_t numel = 0;  // total copy length
+  int64_t copy_flag = offset_tensor.dims()[0];
+  int64_t size = src_tensor.numel() / src_tensor.dims()[0];
+
+  if (copy_flag != 0) {
+    PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1,
+                      platform::errors::InvalidArgument(
+                          "`offset` tensor should be one-dimensional."));
+    PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1,
+                      platform::errors::InvalidArgument(
+                          "`count` tensor should be one-dimensional."));
+    PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(),
+                      platform::errors::InvalidArgument(
+                          "`offset` and `count` tensor size mismatch."));
+    auto* offset_data = offset_tensor.data<int64_t>();
+    auto* count_data = count_tensor.data<int64_t>();
+    for (int64_t i = 0; i < count_tensor.numel(); i++) {
+      numel += count_data[i];
+    }
+    PADDLE_ENFORCE_LE(
+        numel + index_tensor.numel(), buffer_tensor->dims()[0],
+        platform::errors::InvalidArgument("Buffer tensor size is too small."));
+    PADDLE_ENFORCE_LE(
+        numel + index_tensor.numel(), dst_tensor->dims()[0],
+        platform::errors::InvalidArgument("Target tensor size is too small."));
+
+    int64_t src_offset, dst_offset = 0, c;
+    auto* src_data = src_tensor.data<float>();
+    for (int64_t i = 0; i < offset_tensor.numel(); i++) {
+      src_offset = offset_data[i], c = count_data[i];
+      PADDLE_ENFORCE_LE(
+          src_offset + c, src_tensor.dims()[0],
+          platform::errors::InvalidArgument("Invalid offset or count index."));
+      PADDLE_ENFORCE_LE(
+          dst_offset + c, dst_tensor->dims()[0],
+          platform::errors::InvalidArgument("Invalid offset or count index."));
+      cudaMemcpyAsync(dst_data + (dst_offset * size),
+                      src_data + (src_offset * size), c * size * sizeof(float),
+                      cudaMemcpyHostToDevice, stream);
+      dst_offset += c;
+    }
+  } else {
+    PADDLE_ENFORCE_LE(
+        index_tensor.numel(), buffer_tensor->dims()[0],
+        platform::errors::InvalidArgument("Buffer tensor size is too small."));
+  }
+
+  // Select the index data to the buffer
+  auto index_select = [](const paddle::experimental::Tensor& src_tensor,
+                         const paddle::experimental::Tensor& index_tensor,
+                         paddle::experimental::Tensor* buffer_tensor) {
+    auto* src_data = src_tensor.data<float>();
+    auto* index_data = index_tensor.data<int64_t>();
+    auto* buffer_data = buffer_tensor->data<float>();
+    const int& slice_size = src_tensor.numel() / src_tensor.dims()[0];
+    const int& copy_bytes = slice_size * sizeof(float);
+    int64_t c = 0;
+    for (int64_t i = 0; i < index_tensor.numel(); i++) {
+      std::memcpy(buffer_data + c * slice_size,
+                  src_data + index_data[i] * slice_size, copy_bytes);
+      c += 1;
+    }
+  };
+  index_select(src_tensor, index_tensor, buffer_tensor);
+
+  // Copy the data to device memory
+  cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data<float>(),
+                  index_tensor.numel() * size * sizeof(float),
+                  cudaMemcpyHostToDevice, stream);
+  Py_INCREF(Py_None);
+  return Py_None;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* eager_api_async_write(PyObject* self, PyObject* args,
+                                       PyObject* kwargs) {
+  EAGER_TRY
+  auto& src = GetTensorFromArgs("async_write", "src", args, 0, false);
+  auto& dst = GetTensorFromArgs("async_write", "dst", args, 1, false);
+  auto& offset = GetTensorFromArgs("async_write", "offset", args, 2, false);
+  auto& count = GetTensorFromArgs("async_write", "count", args, 3, false);
+  PADDLE_ENFORCE_EQ(
+      src.is_gpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `src` device should be CUDAPlace, but received %d. ",
+          src.inner_place()));
+  PADDLE_ENFORCE_EQ(dst.is_gpu_pinned(), true,
+                    platform::errors::InvalidArgument(
+                        "Required `dst` device should be CUDAPinnedPlace, "
+                        "but received %d. ",
+                        dst.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      offset.is_cpu(), true,
+      platform::errors::InvalidArgument("Required `offset` device should "
+                                        "be CPUPlace, but received %d. ",
+                                        offset.inner_place()));
+  PADDLE_ENFORCE_EQ(
+      count.is_cpu(), true,
+      platform::errors::InvalidArgument(
+          "Required `count` device should be CPUPlace, but received %d. ",
+          count.inner_place()));
+
+  // TODO(daisiming): In future, add index as arguments following
+  // async_read.
+  auto& src_tensor = src;
+  auto* dst_tensor = &dst;
+  auto& offset_tensor = offset;
+  auto& count_tensor = count;
+  const auto& deviceId = paddle::platform::GetCurrentDeviceId();
+
+  PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1,
+                    platform::errors::InvalidArgument(
+                        "`offset` tensor should be one-dimensional."));
+  PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1,
+                    platform::errors::InvalidArgument(
+                        "`count` tensor should be one-dimensional."));
+  PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(),
+                    platform::errors::InvalidArgument(
+                        "`offset` and `count` tensor size mismatch."));
+  PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(),
+                    platform::errors::InvalidArgument(
+                        "`src` and `dst` should have the same tensor shape, "
+                        "except for the first dimension."));
+  for (int i = 1; i < src_tensor.dims().size(); i++) {
+    PADDLE_ENFORCE_EQ(src_tensor.dims()[i], dst_tensor->dims()[i],
+                      platform::errors::InvalidArgument(
+                          "`src` and `dst` should have the same tensor shape, "
+                          "except for the first dimension."));
+  }
+
+  auto stream =
+      paddle::platform::stream::get_current_stream(deviceId)->raw_stream();
+
+  int64_t size = src_tensor.numel() / src_tensor.dims()[0];
+  auto* src_data = src_tensor.data<float>();
+  auto* dst_data = dst_tensor->data<float>();
+  const int64_t* offset_data = offset_tensor.data<int64_t>();
+  const int64_t* count_data = count_tensor.data<int64_t>();
+  int64_t src_offset = 0, dst_offset, c;
+  for (int64_t i = 0; i < offset_tensor.numel(); i++) {
+    dst_offset = offset_data[i], c = count_data[i];
+    PADDLE_ENFORCE_LE(
+        src_offset + c, src_tensor.dims()[0],
+        platform::errors::InvalidArgument("Invalid offset or count index"));
+    PADDLE_ENFORCE_LE(
+        dst_offset + c, dst_tensor->dims()[0],
+        platform::errors::InvalidArgument("Invalid offset or count index"));
+    cudaMemcpyAsync(dst_data + (dst_offset * size),
+                    src_data + (src_offset * size), c * size * sizeof(float),
+                    cudaMemcpyDeviceToHost, stream);
+    src_offset += c;
+  }
+  Py_INCREF(Py_None);
+  return Py_None;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+#endif
 
 PyMethodDef variable_functions[] = {
     // TODO(jiabin): Remove scale when we have final state tests
     {"scale", (PyCFunction)(void (*)(void))eager_api_scale,
@@ -560,6 +794,12 @@ PyMethodDef variable_functions[] = {
     {"sparse_csr_tensor",
      (PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor,
     METH_VARARGS | METH_KEYWORDS, NULL},
+#if defined(PADDLE_WITH_CUDA)
+    {"async_read", (PyCFunction)(void (*)(void))eager_api_async_read,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"async_write", (PyCFunction)(void (*)(void))eager_api_async_write,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+#endif
     /**sparse functions**/
     {NULL, NULL, 0, NULL}};
 
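To make the `offset`/`count` contract of the new bindings concrete: `async_write` walks the two index tensors in lockstep and, for each pair, copies `count[i]` consecutive rows of the row-major source into the pinned destination starting at row `offset[i]`, consuming source rows sequentially. A standalone CUDA sketch of that loop follows; the names are illustrative, and the real binding operates on `paddle::experimental::Tensor` and the current Paddle stream.

```cpp
// Standalone illustration of the row-chunked device-to-pinned-host copy
// pattern used by eager_api_async_write. Not Paddle code.
#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

void AsyncWriteLike(const float* src_dev,   // [n_src_rows, row_size], device
                    float* dst_pinned,      // [n_dst_rows, row_size], pinned host
                    int64_t row_size,
                    const std::vector<int64_t>& offset,
                    const std::vector<int64_t>& count,
                    cudaStream_t stream) {
  int64_t src_row = 0;
  for (size_t i = 0; i < offset.size(); ++i) {
    const int64_t dst_row = offset[i];
    const int64_t rows = count[i];
    // Copy `rows` consecutive source rows into dst starting at dst_row.
    cudaMemcpyAsync(dst_pinned + dst_row * row_size,
                    src_dev + src_row * row_size,
                    rows * row_size * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);
    src_row += rows;
  }
}
```

`async_read` is the mirror image: the `offset`/`count` chunks go pinned-host-to-device first, and the `index`-gathered rows are staged into the pinned `buffer` on the CPU and then appended after them with a single `cudaMemcpyAsync`.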
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 96d569d47c45a1bf13a193461fbc9ae91e041192..982aa52913d630fa97400294db38a2423792c48a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2007,6 +2007,10 @@ All parameter, weight, gradient are variables in Paddle.
                 paddle::memory::allocation::AllocatorFacade::Instance()
                     .GetZeroAllocator(place)
                     .get());
+            context->SetPinnedAllocator(
+                paddle::memory::allocation::AllocatorFacade::Instance()
+                    .GetAllocator(paddle::platform::CUDAPinnedPlace())
+                    .get());
             context->PartialInitWithAllocator();
             return context;
 #endif
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index 6b486196a4b8a0bd9785e13fa05b7010e4494243..0f5f22b5bd1f47f4a27bd769bd24e968aea5581b 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -49,6 +49,14 @@ struct DeviceContext::Impl {
     zero_allocator_ = allocator;
   }
 
+  void SetPinnedAllocator(const Allocator* allocator) {
+    PADDLE_ENFORCE_NOT_NULL(
+        allocator,
+        phi::errors::InvalidArgument(
+            "Required allocator shall not be nullptr, but received nullptr."));
+    pinned_allocator_ = allocator;
+  }
+
   const Allocator& GetAllocator() const {
     PADDLE_ENFORCE_NOT_NULL(
         device_allocator_,
@@ -68,15 +76,24 @@ struct DeviceContext::Impl {
   const Allocator& GetZeroAllocator() const {
     PADDLE_ENFORCE_NOT_NULL(
         zero_allocator_,
-        phi::errors::InvalidArgument("Required host_allocator_ shall not be "
+        phi::errors::InvalidArgument("Required zero_allocator_ shall not be "
                                      "nullptr, but received nullptr."));
     return *zero_allocator_;
   }
 
+  const Allocator& GetPinnedAllocator() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        pinned_allocator_,
+        phi::errors::InvalidArgument("Required pinned_allocator_ shall not be "
+                                     "nullptr, but received nullptr."));
+    return *pinned_allocator_;
+  }
+
   void* Alloc(TensorBase* tensor,
               const Place& place,
               DataType dtype = DataType::UNDEFINED,
-              size_t requested_size = 0) const {
+              size_t requested_size = 0,
+              bool pinned = false) const {
     PADDLE_ENFORCE_NOT_NULL(
         tensor,
         phi::errors::InvalidArgument(
@@ -90,8 +107,9 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != place) {
       ClearHolder(tensor);
    }
-    auto* allocator =
-        tensor->numel() == 0 ? zero_allocator_ : device_allocator_;
+    auto* allocator = tensor->numel() == 0
+                          ? zero_allocator_
+                          : (pinned ? pinned_allocator_ : device_allocator_);
     return tensor->AllocateFrom(
         const_cast<Allocator*>(allocator), dtype, requested_size);
   }
@@ -99,9 +117,10 @@ struct DeviceContext::Impl {
   template <typename T>
   T* Alloc(TensorBase* tensor,
            const Place& place,
-           size_t requested_size = 0) const {
+           size_t requested_size = 0,
+           bool pinned = false) const {
     DataType dtype = paddle::experimental::CppTypeToDataType<T>::Type();
-    return static_cast<T*>(Alloc(tensor, place, dtype, requested_size));
+    return static_cast<T*>(Alloc(tensor, place, dtype, requested_size, pinned));
   }
 
   void* HostAlloc(TensorBase* tensor,
@@ -179,6 +198,7 @@ struct DeviceContext::Impl {
   const Allocator* device_allocator_{nullptr};
   const Allocator* host_allocator_{nullptr};
   const Allocator* zero_allocator_{nullptr};
+  const Allocator* pinned_allocator_{nullptr};
   Generator* device_generator_{nullptr};
   Generator* host_generator_{nullptr};
 };
@@ -189,6 +209,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) {
   impl_->SetHostAllocator(&other.GetHostAllocator());
   impl_->SetAllocator(&other.GetAllocator());
   impl_->SetZeroAllocator(&other.GetZeroAllocator());
+  impl_->SetPinnedAllocator(&other.GetPinnedAllocator());
   impl_->SetHostGenerator(other.GetHostGenerator());
   impl_->SetGenerator(other.GetGenerator());
 }
@@ -225,15 +246,25 @@ const Allocator& DeviceContext::GetZeroAllocator() const {
   return impl_->GetZeroAllocator();
 }
 
+void DeviceContext::SetPinnedAllocator(const Allocator* allocator) {
+  impl_->SetPinnedAllocator(allocator);
+}
+const Allocator& DeviceContext::GetPinnedAllocator() const {
+  return impl_->GetPinnedAllocator();
+}
+
 void* DeviceContext::Alloc(TensorBase* tensor,
                            DataType dtype,
-                           size_t requested_size) const {
-  return impl_->Alloc(tensor, GetPlace(), dtype, requested_size);
+                           size_t requested_size,
+                           bool pinned) const {
+  return impl_->Alloc(tensor, GetPlace(), dtype, requested_size, pinned);
 }
 
 template <typename T>
-T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const {
-  return impl_->Alloc<T>(tensor, GetPlace(), requested_size);
+T* DeviceContext::Alloc(TensorBase* tensor,
+                        size_t requested_size,
+                        bool pinned) const {
+  return impl_->Alloc<T>(tensor, GetPlace(), requested_size, pinned);
 }
 
 void* DeviceContext::HostAlloc(TensorBase* tensor,
@@ -248,8 +279,8 @@ T* DeviceContext::HostAlloc(TensorBase* tensor, size_t requested_size) const {
 }
 
 #define DEVICE_CONTEXT_MEMBER_FUNC_INSTANTIATION(dtype)                  \
-  template dtype* DeviceContext::Alloc<dtype>(TensorBase* tensor,       \
-                                              size_t requested_size) const; \
+  template dtype* DeviceContext::Alloc<dtype>(                          \
+      TensorBase* tensor, size_t requested_size, bool pinned) const;    \
   template dtype* DeviceContext::HostAlloc<dtype>(TensorBase* tensor,   \
                                                   size_t requested_size) const;
 
diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h
index 689f4e4e66d15f60aec873a9e9b9c07797833487..106d5ff7ddf9855f1787428f7e71e8fcb09ee49e 100644
--- a/paddle/phi/core/device_context.h
+++ b/paddle/phi/core/device_context.h
@@ -80,6 +80,13 @@ class DeviceContext {
    */
   void SetZeroAllocator(const Allocator*);
 
+  /**
+   * @brief Set the CUDA pinned memory Allocator object.
+   *
+   * @param allocator
+   */
+  void SetPinnedAllocator(const Allocator*);
+
   /**
    * @brief Get the const Allocator object.
    *
@@ -96,13 +103,20 @@ class DeviceContext {
 
   const Allocator& GetZeroAllocator() const;
 
+  const Allocator& GetPinnedAllocator() const;
+
   /**
    * @brief Allocate device memory for tensor.
    */
-  void* Alloc(TensorBase*, DataType dtype, size_t requested_size = 0) const;
+  void* Alloc(TensorBase*,
+              DataType dtype,
+              size_t requested_size = 0,
+              bool pinned = false) const;
 
   template <typename T>
-  T* Alloc(TensorBase* tensor, size_t requested_size = 0) const;
+  T* Alloc(TensorBase* tensor,
+           size_t requested_size = 0,
+           bool pinned = false) const;
 
   /**
    * @brief Allocate host memory for tensor.
    */
diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu
index 28dc6f196d1681a812d7247e74678322d8a13cad..16eff5b26e38a06950166079dc46d77c5a61a57c 100644
--- a/paddle/phi/kernels/gpu/copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/copy_kernel.cu
@@ -48,7 +48,8 @@ void Copy(const Context& dev_ctx,
     // dev_ctx can not alloc pinned memory now
     dst_ptr = dst->mutable_data(dst_place, src.dtype());
   } else {
-    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+    dst_ptr = dev_ctx.Alloc(
+        dst, src.dtype(), 0, paddle::platform::is_cuda_pinned_place(dst_place));
   }
 
   if (src_ptr == dst_ptr && src_place == dst_place) {
@@ -151,6 +152,30 @@ void Copy(const Context& dev_ctx,
             "Context place dose not match the source and destination place."));
       }
     }
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cuda_pinned_place(dst_place)) {
+    auto src_gpu_place = src_place;
+    auto dst_cuda_pinned_place = dst_place;
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        phi::errors::PreconditionNotMet(
+            "Context place error, expected GPUPlace, but actually %s.",
+            ctx_place));
+    auto ctx_gpu_place = ctx_place;
+    PADDLE_ENFORCE_EQ(src_gpu_place,
+                      ctx_gpu_place,
+                      phi::errors::Unavailable(
+                          "Source place and context place do not match, source "
+                          "place is %s, context place is %s.",
+                          src_gpu_place,
+                          ctx_gpu_place));
+    auto stream =
+        blocking ? nullptr
+                 : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
+    paddle::memory::Copy(
+        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   } else {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "Place type error. Please check the place of src and dst Tensor."));
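The copy kernel above is the first caller of the new `pinned` flag on `DeviceContext::Alloc`. A minimal usage fragment follows (not a complete kernel; `dst`, `src`, `dst_place`, and `dev_ctx` stand for the variables of a kernel like the one above):

```cpp
// Pick pinned host memory for the destination when its place is
// CUDAPinnedPlace; otherwise allocate device memory as before. The flag is
// forwarded to DeviceContext::Alloc, which selects pinned_allocator_ instead
// of device_allocator_ for non-empty tensors; zero-size tensors still go
// through zero_allocator_ regardless of the flag.
const bool to_pinned = paddle::platform::is_cuda_pinned_place(dst_place);
void* dst_ptr = dev_ctx.Alloc(dst, src.dtype(), /*requested_size=*/0, to_pinned);
```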
diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
index 5e6b097ad367bffad064c71428860047f99d9b9a..33f84db76e78eec6710abea3a93a06f1eaa55408 100644
--- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
@@ -160,6 +160,10 @@ void TestConv3dBase(const std::vector& indices,
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(phi::CPUPlace())
           .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
 
   DenseTensor d_indices_tensor = phi::Empty(
diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
index 80b3392a611b03df841abc3418f65b2337895ae6..632beadf3de0edd3bc95b23a211fbc0addefa585 100644
--- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
@@ -134,6 +134,10 @@ void TestMaxPoolBase(const std::vector& indices,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
 
   DenseTensor d_indices_tensor = phi::Empty(
diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
index b8f214b79e290c2e102fc2c08dab2ddc6a61dd71..93728ad31b0d62dd400d4e81e8230ab168a3dc4d 100644
--- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc
@@ -117,6 +117,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
 
   const auto cuda_alloc =
@@ -328,6 +332,10 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
 
   const auto cuda_alloc =
@@ -511,6 +519,10 @@ void TestCooToCsr(const DDim& dense_dims,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
   phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta);
   phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
@@ -611,6 +623,10 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
   phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
   auto sparse_out = sparse::DenseToSparseCsr(dev_ctx_gpu, d_dense_x);
@@ -741,6 +757,10 @@ void TestSparseCooToDense(const DDim& dense_dims,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
   DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta());
   DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta());
@@ -886,6 +906,10 @@ void TestSparseCsrToDense(const DDim& dense_dims,
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetAllocator(phi::CPUPlace())
          .get());
+  dev_ctx_gpu.SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
   dev_ctx_gpu.PartialInitWithAllocator();
   phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
   phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index d67edf3eb1fdfdab7409430384400333ddcb55d9..f4871ba64e571af8e971810f9c634df6010883b8 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -836,6 +836,16 @@ def monkey_patch_varbase():
             res.persistable = self.persistable
             return res
 
+    @framework.dygraph_only
+    def pin_memory(self):
+        if self.place.is_cuda_pinned_place():
+            return self
+        else:
+            res = self._copy_to(core.CUDAPinnedPlace(), True)
+            res.stop_gradient = self.stop_gradient
+            res.persistable = self.persistable
+            return res
+
     if framework._in_eager_mode_ and not hasattr(core, "eager"):
         return
 
@@ -861,6 +871,7 @@ def monkey_patch_varbase():
     setattr(core.eager.Tensor, "value", value)
     setattr(core.eager.Tensor, "cpu", cpu)
     setattr(core.eager.Tensor, "cuda", cuda)
+    setattr(core.eager.Tensor, "pin_memory", pin_memory)
     setattr(core.eager.Tensor, "_slice", _slice)
     setattr(core.eager.Tensor, "_numel", _numel)
 else:
diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py
index 91875b446aba4d6c67f84539da81f96bfac0a40a..babdf43199dd6225deb06497423029e8bb3a2ec3 100644
--- a/python/paddle/tests/test_async_read_write.py
+++ b/python/paddle/tests/test_async_read_write.py
@@ -18,10 +18,11 @@ import numpy as np
 import paddle
 from paddle.fluid import core
 from paddle.device import cuda
+from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph
 
 
 class TestAsyncRead(unittest.TestCase):
-    def setUp(self):
+    def func_setUp(self):
         self.empty = paddle.to_tensor(
             np.array(
                 [], dtype="int64"), place=paddle.CPUPlace())
@@ -35,16 +36,20 @@ class TestAsyncRead(unittest.TestCase):
             shape=[50, 50, 50], dtype="float32").pin_memory()
         self.stream = cuda.Stream()
 
-    def test_async_read_empty_offset_and_count(self):
+    def func_test_async_read_empty_offset_and_count(self):
         with cuda.stream_guard(self.stream):
-            core.async_read(self.src, self.dst, self.index, self.buffer,
-                            self.empty, self.empty)
+            if _in_legacy_dygraph():
+                core.async_read(self.src, self.dst, self.index, self.buffer,
+                                self.empty, self.empty)
+            else:
+                core.eager.async_read(self.src, self.dst, self.index,
+                                      self.buffer, self.empty, self.empty)
         array1 = paddle.gather(self.src, self.index)
         array2 = self.dst[:len(self.index)]
         self.assertTrue(np.allclose(array1.numpy(), array2.numpy()))
 
-    def test_async_read_success(self):
+    def func_test_async_read_success(self):
         offset = paddle.to_tensor(
             np.array(
                 [10, 20], dtype="int64"), place=paddle.CPUPlace())
@@ -52,9 +57,12 @@ class TestAsyncRead(unittest.TestCase):
             np.array(
                 [5, 10], dtype="int64"), place=paddle.CPUPlace())
         with cuda.stream_guard(self.stream):
-            core.async_read(self.src, self.dst, self.index, self.buffer, offset,
-                            count)
-
+            if _in_legacy_dygraph():
+                core.async_read(self.src, self.dst, self.index, self.buffer,
+                                offset, count)
+            else:
+                core.eager.async_read(self.src, self.dst, self.index,
+                                      self.buffer, offset, count)
         # index data
         index_array1 = paddle.gather(self.src, self.index)
         count_numel = paddle.sum(count).numpy()[0]
@@ -69,26 +77,43 @@ class TestAsyncRead(unittest.TestCase):
         self.assertTrue(
             np.allclose(offset_array1.numpy(), offset_array2.numpy()))
 
-    def test_async_read_only_1dim(self):
+    def func_test_async_read_only_1dim(self):
         src = paddle.rand([40], dtype="float32").pin_memory()
         dst = paddle.empty([40], dtype="float32")
         buffer_ = paddle.empty([20]).pin_memory()
         with cuda.stream_guard(self.stream):
-            core.async_read(src, dst, self.index, buffer_, self.empty,
-                            self.empty)
+            if _in_legacy_dygraph():
+                core.async_read(src, dst, self.index, buffer_, self.empty,
+                                self.empty)
+            else:
+                core.eager.async_read(src, dst, self.index, buffer_, self.empty,
+                                      self.empty)
         array1 = paddle.gather(src, self.index)
         array2 = dst[:len(self.index)]
         self.assertTrue(np.allclose(array1.numpy(), array2.numpy()))
 
+    def test_main(self):
+        with _test_eager_guard():
+            self.func_setUp()
+            self.func_test_async_read_empty_offset_and_count()
+            self.func_test_async_read_success()
+            self.func_test_async_read_only_1dim()
+        self.func_setUp()
+        self.func_test_async_read_empty_offset_and_count()
+        self.func_setUp()
+        self.func_test_async_read_success()
+        self.func_setUp()
+        self.func_test_async_read_only_1dim()
+
 
 class TestAsyncWrite(unittest.TestCase):
-    def setUp(self):
+    def func_setUp(self):
         self.src = paddle.rand(shape=[100, 50, 50, 5], dtype="float32")
         self.dst = paddle.empty(
             shape=[200, 50, 50, 5], dtype="float32").pin_memory()
         self.stream = cuda.Stream()
 
-    def test_async_write_success(self):
+    def func_test_async_write_success(self):
         offset = paddle.to_tensor(
             np.array(
                 [0, 60], dtype="int64"), place=paddle.CPUPlace())
@@ -96,13 +121,23 @@ class TestAsyncWrite(unittest.TestCase):
             np.array(
                 [40, 60], dtype="int64"), place=paddle.CPUPlace())
         with cuda.stream_guard(self.stream):
-            core.async_write(self.src, self.dst, offset, count)
+            if _in_legacy_dygraph():
+                core.async_write(self.src, self.dst, offset, count)
+            else:
+                core.eager.async_write(self.src, self.dst, offset, count)
 
         offset_a = paddle.gather(self.dst, paddle.to_tensor(np.arange(0, 40)))
         offset_b = paddle.gather(self.dst, paddle.to_tensor(np.arange(60, 120)))
         offset_array = paddle.concat([offset_a, offset_b], axis=0)
         self.assertTrue(np.allclose(self.src.numpy(), offset_array.numpy()))
 
+    def test_async_write_success(self):
+        with _test_eager_guard():
+            self.func_setUp()
+            self.func_test_async_write_success()
+        self.func_setUp()
+        self.func_test_async_write_success()
+
 
 if __name__ == "__main__":
     if core.is_compiled_with_cuda():