diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2f2e0ff9f725937a7ddfd7d5a4c1827cb03626b5..c1ca6d8e9608cafcca01be3421d6086b87432046 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() {
               memory::allocation::AllocatorFacade::Instance()
                   .GetZeroAllocator(place_)
                   .get());
+          gpu_context->SetHostZeroAllocator(
+              memory::allocation::AllocatorFacade::Instance()
+                  .GetZeroAllocator(platform::CPUPlace())
+                  .get());
           gpu_context->SetGenerator(
               framework::DefaultCUDAGenerator(place_.GetDeviceId()).get());
           gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 027c593d73c6fc3ea7ab545d89f4db44c3e71974..9a06b2e65ef100a4903d732d3e02058197f2a972 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(platform::CUDAPlace(0))
             .get());
+    ctx_->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx_->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
index 0bf5e99b773b2b53364d4f498a4bee4bf4f14915..9a8ab9324f1c29bff2774811c7b212694e659402 100644
--- a/paddle/fluid/memory/malloc_test.cu
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(place)
             .get());
+    ctx->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 67ccac77e4e03032712402222043d601d89c99a2..41cb9ed1b700d2f0ac13e90b29878ba819736227 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetZeroAllocator(CUDAPlace(dev_id))
           .get());
+  dev_ctx->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
   dev_ctx->SetPinnedAllocator(
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(XPUPlace(dev_id))
           .get());
+  dev_ctx->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
 
   BKCLCommImpl* c = new BKCLCommImpl;
   c->set_ring_id(ring_id);
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 5d89da86efa6cf5e3d6d7bb27bc5698400abc31a..f17ad3749fac5b344edaa2dc0aa62b747f1415ba 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -136,6 +136,10 @@ struct NCCLContext {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(dev_id))
             .get());
+    ctx_->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx_->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f0b1efc769430a033e05066a989893759f372ae3..cafb7e1da0f82b58a2cf2ff5941b32bcb48f62ef 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -183,6 +183,9 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
   dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                 .GetZeroAllocator(p)
                                 .get());
+  dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
+                                    .GetZeroAllocator(platform::CPUPlace())
+                                    .get());
   return PtrType(dev_ctx);
 }
 
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index abffa1e8846df40d2b9ef0fc0a5d6d2a781c87e6..c4b998f660f3537efbc107b0c258132e60fedd23 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 TEST(Device, Init) {
   using paddle::platform::CUDAPlace;
@@ -38,6 +39,10 @@ TEST(Device, Init) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     device_context->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -69,6 +74,10 @@ TEST(Device, GPUContext) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     device_context->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -92,6 +101,45 @@
   }
 }
 
+TEST(Device, HostZeroAllocator) {
+  using paddle::platform::CUDAPlace;
+
+  auto device_context = std::make_unique<phi::GPUContext>(CUDAPlace(0));
+  device_context->SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(CUDAPlace(0), device_context->stream())
+          .get());
+  device_context->SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+  device_context->SetZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(CUDAPlace(0))
+          .get());
+  device_context->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
+  device_context->SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
+  device_context->PartialInitWithAllocator();
+
+  phi::DenseTensor tensor;
+  tensor.Resize({0});
+  device_context->HostAlloc<float>(&tensor);
+  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
+  ASSERT_EQ(tensor.numel(), 0);
+  ASSERT_EQ(tensor.dtype(), phi::DataType::FLOAT32);
+
+  phi::GPUContext gpu_context(CUDAPlace(0));
+  gpu_context.SetHostZeroAllocator(&device_context->GetHostZeroAllocator());
+  gpu_context.HostAlloc<float>(&tensor);
+  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
+}
+
 TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
   using paddle::platform::CUDAPlace;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b4d175efd2b5699f71342a1d1e333b5340b1cff9..32bfeb8b1c3432ba979081b35c1541bb4d7d15df 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              return context;
            })
       .def_static(
@@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              return context;
 #endif
            })
@@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              context->SetPinnedAllocator(
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index dd3a30ed2992e9ba7d3e5cfcda25e21bd64ac5de..d46f9250eeb4c90a0dbc2352e34face68ad06513 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -54,6 +54,14 @@ struct DeviceContext::Impl {
     zero_allocator_ = allocator;
   }
 
+  void SetHostZeroAllocator(const Allocator* allocator) {
+    PADDLE_ENFORCE_NOT_NULL(
+        allocator,
+        phi::errors::InvalidArgument(
+            "Required allocator shall not be nullptr, but received nullptr."));
+    host_zero_allocator_ = allocator;
+  }
+
   void SetPinnedAllocator(const Allocator* allocator) {
     PADDLE_ENFORCE_NOT_NULL(
         allocator,
@@ -106,6 +114,14 @@ struct DeviceContext::Impl {
     return *zero_allocator_;
   }
 
+  const Allocator& GetHostZeroAllocator() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        host_zero_allocator_,
+        phi::errors::InvalidArgument("Required zero_allocator_ shall not be "
+                                     "nullptr, but received nullptr."));
+    return *host_zero_allocator_;
+  }
+
   const Allocator& GetPinnedAllocator() const {
     PADDLE_ENFORCE_NOT_NULL(
         pinned_allocator_,
@@ -172,7 +188,8 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != CPUPlace()) {
       ClearHolder(tensor);
     }
-    auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_;
+    auto* allocator =
+        tensor->numel() == 0 ? host_zero_allocator_ : host_allocator_;
     return tensor->AllocateFrom(
         const_cast<Allocator*>(allocator), dtype, requested_size);
   }
@@ -234,6 +251,7 @@ struct DeviceContext::Impl {
   const Allocator* device_allocator_{nullptr};
   const Allocator* host_allocator_{nullptr};
   const Allocator* zero_allocator_{nullptr};
+  const Allocator* host_zero_allocator_{nullptr};
   const Allocator* pinned_allocator_{nullptr};
 #ifdef PADDLE_WITH_CUDA
   const Allocator* cuda_graph_allocator_{nullptr};
@@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) {
   impl_->SetHostAllocator(&other.GetHostAllocator());
   impl_->SetAllocator(&other.GetAllocator());
   impl_->SetZeroAllocator(&other.GetZeroAllocator());
+  impl_->SetHostZeroAllocator(&other.GetHostZeroAllocator());
   impl_->SetPinnedAllocator(&other.GetPinnedAllocator());
   impl_->SetHostGenerator(other.GetHostGenerator());
   impl_->SetGenerator(other.GetGenerator());
@@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) {
   impl_->SetZeroAllocator(allocator);
 }
 
+void DeviceContext::SetHostZeroAllocator(const Allocator* allocator) {
+  impl_->SetHostZeroAllocator(allocator);
+}
+
 const Allocator& DeviceContext::GetZeroAllocator() const {
   return impl_->GetZeroAllocator();
 }
 
+const Allocator& DeviceContext::GetHostZeroAllocator() const {
+  return impl_->GetHostZeroAllocator();
+}
+
 void DeviceContext::SetPinnedAllocator(const Allocator* allocator) {
   impl_->SetPinnedAllocator(allocator);
 }
diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h
index 5dad261f43b34341f0daabb20c85141220a1d13c..9114490d1a70efbaba741d116eda06da49aa8315 100644
--- a/paddle/phi/core/device_context.h
+++ b/paddle/phi/core/device_context.h
@@ -82,6 +82,13 @@ class PADDLE_API DeviceContext {
    */
   void SetZeroAllocator(const Allocator*);
 
+  /**
+   * @brief Set the zero-size host Allocator object.
+   *
+   * @param allocator
+   */
+  void SetHostZeroAllocator(const Allocator*);
+
   /**
    * @brief Set the zero-size Allocator object.
    *
@@ -105,6 +112,8 @@
 
   const Allocator& GetZeroAllocator() const;
 
+  const Allocator& GetHostZeroAllocator() const;
+
   const Allocator& GetPinnedAllocator() const;
 
 #ifdef PADDLE_WITH_CUDA
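
For reference, below is a minimal usage sketch (not part of the diff) of the host zero-size allocation path introduced above, modeled on the new TEST(Device, HostZeroAllocator). The standalone function and its name are illustrative assumptions; the SetHostZeroAllocator/HostAlloc calls and the AllocatorFacade lookups follow the diff.

// Illustrative sketch only; mirrors the HostZeroAllocator test in this diff.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"

void HostZeroAllocExample() {
  phi::GPUContext ctx(paddle::platform::CUDAPlace(0));
  // Zero-size host allocations are now served by the CPU zero-size allocator
  // instead of the device zero-size allocator.
  ctx.SetHostZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(paddle::platform::CPUPlace())
          .get());

  phi::DenseTensor tensor;
  tensor.Resize({0});
  // numel() == 0, so DeviceContext::Impl::HostAlloc picks host_zero_allocator_
  // and the resulting tensor reports a CPU place with dtype float32.
  ctx.HostAlloc<float>(&tensor);
}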