diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 260da509a17cbc3cf5fb939ad320545e65f4a3bf..b06ed3578df7224119b859695a7c45d76f0dee65 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -215,7 +215,11 @@ struct ConcatTensorsForAllReduce {
     for (const auto &tensor : dense_tensors_) {
       const uint8_t *in_data = reinterpret_cast<const uint8_t *>(tensor.data());
       auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyH2D(out_data + offset, in_data, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      }
       offset += sz;
     }
   }
@@ -237,7 +241,11 @@ struct SplitTensorsForAllReduce {
     for (auto &tensor : *p_dense_tensors) {
       uint8_t *out_data = reinterpret_cast<uint8_t *>(tensor.data());
       auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyD2H(out_data, in_data + offset, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      }
       offset += sz;
     }
   }
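
For context on the change: both hunks replace an unconditional device-to-device copy with a dispatch on each tensor's placement, so gradients that live in host memory are gathered into the fused buffer with an H2D copy and scattered back with a D2H copy, while device-resident tensors keep using D2D. The sketch below is a minimal, self-contained illustration of that dispatch for the concat side only; `MockDevice`, `MockTensor`, `AllocationType`, and `ConcatForAllReduce` are hypothetical stand-ins rather than Paddle's real types, and only the `MemoryCopyH2D`/`MemoryCopyD2D` method names come from the patch.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-in for phi::AllocationType.
enum class AllocationType { CPU, CUSTOM };

// Hypothetical stand-in for the custom-device copy interface; in the real
// runtime these would issue (possibly async) copies on `stream`, here we
// just memcpy and log which direction was chosen.
struct MockDevice {
  void MemoryCopyH2D(void *dst, const void *src, size_t n, void * /*stream*/) {
    std::memcpy(dst, src, n);
    std::printf("H2D copy of %zu bytes\n", n);
  }
  void MemoryCopyD2D(void *dst, const void *src, size_t n, void * /*stream*/) {
    std::memcpy(dst, src, n);
    std::printf("D2D copy of %zu bytes\n", n);
  }
};

// Hypothetical stand-in for a dense tensor: a placement plus raw bytes.
struct MockTensor {
  AllocationType place;        // where the tensor's memory lives
  std::vector<uint8_t> bytes;  // payload, stands in for tensor.data()
};

// Concat step: pack every tensor into one contiguous buffer, choosing the
// copy direction from the source tensor's placement, as the patch does.
void ConcatForAllReduce(MockDevice *device,
                        const std::vector<MockTensor> &tensors,
                        std::vector<uint8_t> *contents,
                        void *stream) {
  size_t offset = 0;
  for (const auto &tensor : tensors) {
    uint8_t *out = contents->data() + offset;
    size_t sz = tensor.bytes.size();
    if (tensor.place == AllocationType::CPU) {
      // Host-resident source: host-to-device copy.
      device->MemoryCopyH2D(out, tensor.bytes.data(), sz, stream);
    } else {
      // Device-resident source: device-to-device copy.
      device->MemoryCopyD2D(out, tensor.bytes.data(), sz, stream);
    }
    offset += sz;
  }
}

int main() {
  MockDevice device;
  std::vector<MockTensor> tensors = {
      {AllocationType::CPU, std::vector<uint8_t>(8, 1)},      // host gradient
      {AllocationType::CUSTOM, std::vector<uint8_t>(16, 2)},  // device gradient
  };
  std::vector<uint8_t> contents(24);  // fused buffer, sized to the sum
  ConcatForAllReduce(&device, tensors, &contents, nullptr);
  return 0;
}
```

The split side in the second hunk is the mirror image: the fused buffer is the source, so a CPU-placed destination tensor takes a `MemoryCopyD2H` instead of `MemoryCopyH2D`, and the offset bookkeeping is otherwise identical.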