[CustomDevice] fix reducer when input on cpu (#53662)

e077678c · ronnywang · GitHub · aafaad90 · e077678c
隐藏空白更改
内联并排

Showing 1 changed file with 10 additions and 2 deletions

paddle/fluid/distributed/collective/reducer.cc +10 -2

未找到文件。
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -215,7 +215,11 @@ struct ConcatTensorsForAllReduce<platform::CustomDeviceContext, T> {
      const uint8_t *in_data =
          reinterpret_cast<const uint8_t *>(tensor.data<T>());
      auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyH2D(out_data + offset, in_data, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      }
      offset += sz;
    }
  }
@@ -237,7 +241,11 @@ struct SplitTensorsForAllReduce<platform::CustomDeviceContext, T> {
    for (auto &tensor : *p_dense_tensors) {
      uint8_t *out_data = reinterpret_cast<uint8_t *>(tensor.data<T>());
      auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyD2H(out_data, in_data + offset, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      }
      offset += sz;
    }
  }