diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index e2ec4a6f14cef59e95e023292418e240d460b16b..dcd25180e299760ce748e239bf8268341c87238c 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -52,7 +52,6 @@ void Copy(const Context& dev_ctx, << dst_place; dst->Resize(src.dims()); - dst->mutable_data(dst_place); void* dst_ptr = nullptr; if (paddle::platform::is_cpu_place(dst_place)) { diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index a9de4a4dd6eb211a96f69c215e41ac979d85c3cf..4567e2793757c8149aded736e0a64e40de2b9802 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -16,13 +16,41 @@ #include <vector> +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/stream.h" namespace phi { static constexpr size_t WAIT_THRESHOLD = 64 * 1024; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template <> +void MemcpyH2DKernel(const GPUContext& dev_ctx, + const DenseTensor& x, + int dst_place_type, + DenseTensor* out) { + PADDLE_ENFORCE_GE( + dst_place_type, + 0, + errors::OutOfRange("dst_place_type only support 0-3, but got: %d", + dst_place_type)); + PADDLE_ENFORCE_LE( + dst_place_type, + 3, + errors::OutOfRange("dst_place_type only support 0-3, but got: %d", + dst_place_type)); + + auto stream = dev_ctx.stream(); + out->mutable_data(dev_ctx.GetPlace(), + x.dtype(), + phi::Stream(reinterpret_cast<phi::StreamId>(stream))); + + Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); +} +#endif + template <typename T, typename Context> void MemcpyH2DKernel(const Context& dev_ctx, const DenseTensor& x, @@ -39,7 +67,6 @@ void MemcpyH2DKernel(const Context& dev_ctx, errors::OutOfRange("dst_place_type only support 0-3, but got: %d", dst_place_type)); - // Copy will set the stream of the tensor while setting blocking to false Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); } @@ -48,9 
+75,12 @@ void MemcpyD2HKernel(const Context& dev_ctx, const DenseTensor& x, int dst_place_type, DenseTensor* out) { - // Copy will set the stream of the tensor while setting blocking to false switch (dst_place_type) { case 0: + // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to + // alloc and assign DeviceContext.place to out, which causes place check + // fails. So we specify out's place here. + out->mutable_data(CPUPlace()); Copy(dev_ctx, x, CPUPlace(), false, out); // NOTE(copy from Aurelius84): host <-> device memory copies of a memory // block of 64 KB or less are asynchronous. See @@ -61,6 +91,10 @@ void MemcpyD2HKernel(const Context& dev_ctx, break; case 1: + // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to + // alloc and assign DeviceContext.place to out, which causes place check + // fails. So we specify out's place here. + out->mutable_data(GPUPinnedPlace()); Copy(dev_ctx, x, GPUPinnedPlace(), false, out); // paddle::memory::Copy use async copy for GPUPinnedPlace dev_ctx.Wait(); @@ -89,9 +123,9 @@ void MemcpyD2HMultiIOKernel(const Context& dev_ctx, PADDLE_ENFORCE_NOT_NULL( array[i], errors::PreconditionNotMet("input tesnor %d should not be nullptr", i)); - PADDLE_ENFORCE_NOT_NULL( - out_array[i], - errors::PreconditionNotMet("input tesnor %d should not be nullptr", i)); + PADDLE_ENFORCE_NOT_NULL(out_array[i], + errors::PreconditionNotMet( + "output tensor %d should not be nullptr", i)); const auto& x = *(array[i]); MemcpyD2HKernel(dev_ctx, x, dst_place_type, out_array[i]);