未验证 提交 7f92e27e 编写于 作者: Z zyfncg 提交者: GitHub

Fix bug of zero_allocator in HostAlloc (#48108)

* fix bug of zero_allocator in host

* fix test compile bug

* add unittest

* update test
上级 058aa381
......@@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() {
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place_)
.get());
gpu_context->SetHostZeroAllocator(
memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
gpu_context->SetGenerator(
framework::DefaultCUDAGenerator(place_.GetDeviceId()).get());
gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());
......
......@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CUDAPlace(0))
.get());
ctx_->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......
......@@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......
......@@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......@@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(XPUPlace(dev_id))
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
BKCLCommImpl* c = new BKCLCommImpl;
c->set_ring_id(ring_id);
......
......@@ -136,6 +136,10 @@ struct NCCLContext {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(dev_id))
.get());
ctx_->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
ctx_->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......
......@@ -183,6 +183,9 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
return PtrType(dev_ctx);
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
TEST(Device, Init) {
using paddle::platform::CUDAPlace;
......@@ -38,6 +39,10 @@ TEST(Device, Init) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......@@ -69,6 +74,10 @@ TEST(Device, GPUContext) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(CUDAPlace(i))
.get());
device_context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
device_context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......@@ -92,6 +101,45 @@ TEST(Device, GPUContext) {
}
}
// Verifies that a zero-element HostAlloc goes through the host-side
// zero-size allocator and produces a CPU-placed tensor, both on a fully
// initialized context and on a context that only borrows the allocator.
TEST(Device, HostZeroAllocator) {
  using paddle::platform::CUDAPlace;

  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();

  // Fully wire up a GPU context from the global allocator facade.
  auto ctx = std::make_unique<phi::GPUContext>(CUDAPlace(0));
  ctx->SetAllocator(facade.GetAllocator(CUDAPlace(0), ctx->stream()).get());
  ctx->SetHostAllocator(
      facade.GetAllocator(paddle::platform::CPUPlace()).get());
  ctx->SetZeroAllocator(facade.GetZeroAllocator(CUDAPlace(0)).get());
  ctx->SetHostZeroAllocator(
      facade.GetZeroAllocator(paddle::platform::CPUPlace()).get());
  ctx->SetPinnedAllocator(
      facade.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
  ctx->PartialInitWithAllocator();

  // A tensor resized to zero elements must be host-allocated on CPU with
  // the requested dtype, even though no bytes are actually allocated.
  phi::DenseTensor tensor;
  tensor.Resize({0});
  ctx->HostAlloc<float>(&tensor);
  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
  ASSERT_EQ(tensor.numel(), 0);
  ASSERT_EQ(tensor.dtype(), phi::DataType::FLOAT32);

  // A second context that only borrows the host zero allocator can still
  // serve the zero-size host allocation and keep CPU placement.
  phi::GPUContext gpu_context(CUDAPlace(0));
  gpu_context.SetHostZeroAllocator(&ctx->GetHostZeroAllocator());
  gpu_context.HostAlloc<float>(&tensor);
  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
}
TEST(Device, DeviceContextPool) {
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
......
......@@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
return context;
})
.def_static(
......@@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
return context;
#endif
})
......@@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle.
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(place)
.get());
context->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(paddle::platform::CPUPlace())
.get());
context->SetPinnedAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CUDAPinnedPlace())
......
......@@ -54,6 +54,14 @@ struct DeviceContext::Impl {
zero_allocator_ = allocator;
}
  // Installs the allocator used for zero-size *host* allocations
  // (see HostAlloc: numel == 0 routes to host_zero_allocator_).
  // The pointer is borrowed, not owned; it must outlive this context.
  void SetHostZeroAllocator(const Allocator* allocator) {
    PADDLE_ENFORCE_NOT_NULL(
        allocator,
        phi::errors::InvalidArgument(
            "Required allocator shall not be nullptr, but received nullptr."));
    host_zero_allocator_ = allocator;
  }
void SetPinnedAllocator(const Allocator* allocator) {
PADDLE_ENFORCE_NOT_NULL(
allocator,
......@@ -106,6 +114,14 @@ struct DeviceContext::Impl {
return *zero_allocator_;
}
const Allocator& GetHostZeroAllocator() const {
PADDLE_ENFORCE_NOT_NULL(
host_zero_allocator_,
phi::errors::InvalidArgument("Required zero_allocator_ shall not be "
"nullptr, but received nullptr."));
return *host_zero_allocator_;
}
const Allocator& GetPinnedAllocator() const {
PADDLE_ENFORCE_NOT_NULL(
pinned_allocator_,
......@@ -172,7 +188,8 @@ struct DeviceContext::Impl {
if (tensor->initialized() && tensor->place() != CPUPlace()) {
ClearHolder(tensor);
}
auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_;
auto* allocator =
tensor->numel() == 0 ? host_zero_allocator_ : host_allocator_;
return tensor->AllocateFrom(
const_cast<Allocator*>(allocator), dtype, requested_size);
}
......@@ -234,6 +251,7 @@ struct DeviceContext::Impl {
const Allocator* device_allocator_{nullptr};
const Allocator* host_allocator_{nullptr};
const Allocator* zero_allocator_{nullptr};
const Allocator* host_zero_allocator_{nullptr};
const Allocator* pinned_allocator_{nullptr};
#ifdef PADDLE_WITH_CUDA
const Allocator* cuda_graph_allocator_{nullptr};
......@@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) {
impl_->SetHostAllocator(&other.GetHostAllocator());
impl_->SetAllocator(&other.GetAllocator());
impl_->SetZeroAllocator(&other.GetZeroAllocator());
impl_->SetHostZeroAllocator(&other.GetHostZeroAllocator());
impl_->SetPinnedAllocator(&other.GetPinnedAllocator());
impl_->SetHostGenerator(other.GetHostGenerator());
impl_->SetGenerator(other.GetGenerator());
......@@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) {
impl_->SetZeroAllocator(allocator);
}
// Forwards to Impl, which rejects a null allocator with InvalidArgument.
void DeviceContext::SetHostZeroAllocator(const Allocator* allocator) {
  impl_->SetHostZeroAllocator(allocator);
}
// Device-side allocator for zero-size allocations; Impl throws if unset.
const Allocator& DeviceContext::GetZeroAllocator() const {
  return impl_->GetZeroAllocator();
}
// Host-side allocator for zero-size allocations; Impl throws if unset.
const Allocator& DeviceContext::GetHostZeroAllocator() const {
  return impl_->GetHostZeroAllocator();
}
// Forwards to Impl, which validates the pointer is non-null.
void DeviceContext::SetPinnedAllocator(const Allocator* allocator) {
  impl_->SetPinnedAllocator(allocator);
}
......
......@@ -82,6 +82,13 @@ class PADDLE_API DeviceContext {
*/
void SetZeroAllocator(const Allocator*);
/**
* @brief Set the zero-size host Allocator object.
*
* @param allocator
*/
void SetHostZeroAllocator(const Allocator*);
/**
* @brief Set the zero-size Allocator object.
*
......@@ -105,6 +112,8 @@ class PADDLE_API DeviceContext {
const Allocator& GetZeroAllocator() const;
const Allocator& GetHostZeroAllocator() const;
const Allocator& GetPinnedAllocator() const;
#ifdef PADDLE_WITH_CUDA
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册