diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2f2e0ff9f725937a7ddfd7d5a4c1827cb03626b5..c1ca6d8e9608cafcca01be3421d6086b87432046 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() {
               memory::allocation::AllocatorFacade::Instance()
                   .GetZeroAllocator(place_)
                   .get());
+          gpu_context->SetHostZeroAllocator(
+              memory::allocation::AllocatorFacade::Instance()
+                  .GetZeroAllocator(platform::CPUPlace())
+                  .get());
           gpu_context->SetGenerator(
               framework::DefaultCUDAGenerator(place_.GetDeviceId()).get());
           gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get());
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 027c593d73c6fc3ea7ab545d89f4db44c3e71974..9a06b2e65ef100a4903d732d3e02058197f2a972 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(platform::CUDAPlace(0))
             .get());
+    ctx_->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx_->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
index 0bf5e99b773b2b53364d4f498a4bee4bf4f14915..9a8ab9324f1c29bff2774811c7b212694e659402 100644
--- a/paddle/fluid/memory/malloc_test.cu
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(place)
             .get());
+    ctx->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 67ccac77e4e03032712402222043d601d89c99a2..41cb9ed1b700d2f0ac13e90b29878ba819736227 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm(
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetZeroAllocator(CUDAPlace(dev_id))
           .get());
+  dev_ctx->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
   dev_ctx->SetPinnedAllocator(
       paddle::memory::allocation::AllocatorFacade::Instance()
           .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm(
       paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(XPUPlace(dev_id))
           .get());
+  dev_ctx->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
 
   BKCLCommImpl* c = new BKCLCommImpl;
   c->set_ring_id(ring_id);
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 5d89da86efa6cf5e3d6d7bb27bc5698400abc31a..f17ad3749fac5b344edaa2dc0aa62b747f1415ba 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -136,6 +136,10 @@ struct NCCLContext {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(dev_id))
             .get());
+    ctx_->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     ctx_->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f0b1efc769430a033e05066a989893759f372ae3..cafb7e1da0f82b58a2cf2ff5941b32bcb48f62ef 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -183,6 +183,9 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
   dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
                                 .GetZeroAllocator(p)
                                 .get());
+  dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
+                                    .GetZeroAllocator(platform::CPUPlace())
+                                    .get());
   return PtrType(dev_ctx);
 }
 
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index abffa1e8846df40d2b9ef0fc0a5d6d2a781c87e6..c4b998f660f3537efbc107b0c258132e60fedd23 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 TEST(Device, Init) {
   using paddle::platform::CUDAPlace;
@@ -38,6 +39,10 @@ TEST(Device, Init) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     device_context->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -69,6 +74,10 @@ TEST(Device, GPUContext) {
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetZeroAllocator(CUDAPlace(i))
             .get());
+    device_context->SetHostZeroAllocator(
+        paddle::memory::allocation::AllocatorFacade::Instance()
+            .GetZeroAllocator(paddle::platform::CPUPlace())
+            .get());
     device_context->SetPinnedAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(paddle::platform::CUDAPinnedPlace())
@@ -92,6 +101,45 @@
   }
 }
 
+TEST(Device, HostZeroAllocator) {
+  using paddle::platform::CUDAPlace;
+
+  auto device_context = std::make_unique<phi::GPUContext>(CUDAPlace(0));
+  device_context->SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(CUDAPlace(0), device_context->stream())
+          .get());
+  device_context->SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+  device_context->SetZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(CUDAPlace(0))
+          .get());
+  device_context->SetHostZeroAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetZeroAllocator(paddle::platform::CPUPlace())
+          .get());
+  device_context->SetPinnedAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CUDAPinnedPlace())
+          .get());
+  device_context->PartialInitWithAllocator();
+
+  phi::DenseTensor tensor;
+  tensor.Resize({0});
+  device_context->HostAlloc<float>(&tensor);
+  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
+  ASSERT_EQ(tensor.numel(), 0);
+  ASSERT_EQ(tensor.dtype(), phi::DataType::FLOAT32);
+
+  phi::GPUContext gpu_context(CUDAPlace(0));
+  gpu_context.SetHostZeroAllocator(&device_context->GetHostZeroAllocator());
+  gpu_context.HostAlloc<float>(&tensor);
+  ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU);
+}
+
 TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
   using paddle::platform::CUDAPlace;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b4d175efd2b5699f71342a1d1e333b5340b1cff9..32bfeb8b1c3432ba979081b35c1541bb4d7d15df 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              return context;
            })
       .def_static(
@@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              return context;
 #endif
            })
@@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle.
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetZeroAllocator(place)
                      .get());
+             context->SetHostZeroAllocator(
+                 paddle::memory::allocation::AllocatorFacade::Instance()
+                     .GetZeroAllocator(paddle::platform::CPUPlace())
+                     .get());
              context->SetPinnedAllocator(
                  paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetAllocator(paddle::platform::CUDAPinnedPlace())
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index dd3a30ed2992e9ba7d3e5cfcda25e21bd64ac5de..d46f9250eeb4c90a0dbc2352e34face68ad06513 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -54,6 +54,14 @@ struct DeviceContext::Impl {
     zero_allocator_ = allocator;
   }
 
+  void SetHostZeroAllocator(const Allocator* allocator) {
+    PADDLE_ENFORCE_NOT_NULL(
+        allocator,
+        phi::errors::InvalidArgument(
+            "Required allocator shall not be nullptr, but received nullptr."));
+    host_zero_allocator_ = allocator;
+  }
+
   void SetPinnedAllocator(const Allocator* allocator) {
     PADDLE_ENFORCE_NOT_NULL(
         allocator,
@@ -106,6 +114,14 @@ struct DeviceContext::Impl {
     return *zero_allocator_;
   }
 
+  const Allocator& GetHostZeroAllocator() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        host_zero_allocator_,
+        phi::errors::InvalidArgument("Required zero_allocator_ shall not be "
+                                     "nullptr, but received nullptr."));
+    return *host_zero_allocator_;
+  }
+
   const Allocator& GetPinnedAllocator() const {
     PADDLE_ENFORCE_NOT_NULL(
         pinned_allocator_,
@@ -172,7 +188,8 @@ struct DeviceContext::Impl {
     if (tensor->initialized() && tensor->place() != CPUPlace()) {
       ClearHolder(tensor);
     }
-    auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_;
+    auto* allocator =
+        tensor->numel() == 0 ? host_zero_allocator_ : host_allocator_;
     return tensor->AllocateFrom(
         const_cast<Allocator*>(allocator), dtype, requested_size);
   }
@@ -234,6 +251,7 @@ struct DeviceContext::Impl {
   const Allocator* device_allocator_{nullptr};
   const Allocator* host_allocator_{nullptr};
   const Allocator* zero_allocator_{nullptr};
+  const Allocator* host_zero_allocator_{nullptr};
   const Allocator* pinned_allocator_{nullptr};
 #ifdef PADDLE_WITH_CUDA
   const Allocator* cuda_graph_allocator_{nullptr};
@@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) {
   impl_->SetHostAllocator(&other.GetHostAllocator());
   impl_->SetAllocator(&other.GetAllocator());
   impl_->SetZeroAllocator(&other.GetZeroAllocator());
+  impl_->SetHostZeroAllocator(&other.GetHostZeroAllocator());
   impl_->SetPinnedAllocator(&other.GetPinnedAllocator());
   impl_->SetHostGenerator(other.GetHostGenerator());
   impl_->SetGenerator(other.GetGenerator());
@@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) {
   impl_->SetZeroAllocator(allocator);
 }
 
+void DeviceContext::SetHostZeroAllocator(const Allocator* allocator) {
+  impl_->SetHostZeroAllocator(allocator);
+}
+
 const Allocator& DeviceContext::GetZeroAllocator() const {
   return impl_->GetZeroAllocator();
 }
 
+const Allocator& DeviceContext::GetHostZeroAllocator() const {
+  return impl_->GetHostZeroAllocator();
+}
+
 void DeviceContext::SetPinnedAllocator(const Allocator* allocator) {
   impl_->SetPinnedAllocator(allocator);
 }
diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h
index 5dad261f43b34341f0daabb20c85141220a1d13c..9114490d1a70efbaba741d116eda06da49aa8315 100644
--- a/paddle/phi/core/device_context.h
+++ b/paddle/phi/core/device_context.h
@@ -82,6 +82,13 @@ class PADDLE_API DeviceContext {
    */
   void SetZeroAllocator(const Allocator*);
 
+  /**
+   * @brief Set the zero-size host Allocator object.
+   *
+   * @param allocator
+   */
+  void SetHostZeroAllocator(const Allocator*);
+
   /**
    * @brief Set the zero-size Allocator object.
    *
@@ -105,6 +112,8 @@
 
   const Allocator& GetZeroAllocator() const;
 
+  const Allocator& GetHostZeroAllocator() const;
+
   const Allocator& GetPinnedAllocator() const;
 
 #ifdef PADDLE_WITH_CUDA
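
For reference, below is a minimal usage sketch (not part of the diff) of the host zero-size allocation path introduced above, modeled on the new TEST(Device, HostZeroAllocator). The standalone function and its name are illustrative assumptions; the SetHostZeroAllocator/HostAlloc calls and the AllocatorFacade lookups follow the diff.

// Illustrative sketch only; mirrors the HostZeroAllocator test in this diff.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"

void HostZeroAllocExample() {
  phi::GPUContext ctx(paddle::platform::CUDAPlace(0));
  // Zero-size host allocations are now served by the CPU zero-size allocator
  // instead of the device zero-size allocator.
  ctx.SetHostZeroAllocator(
      paddle::memory::allocation::AllocatorFacade::Instance()
          .GetZeroAllocator(paddle::platform::CPUPlace())
          .get());

  phi::DenseTensor tensor;
  tensor.Resize({0});
  // numel() == 0, so DeviceContext::Impl::HostAlloc picks host_zero_allocator_
  // and the resulting tensor reports a CPU place with dtype float32.
  ctx.HostAlloc<float>(&tensor);
}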