diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 260da509a17cbc3cf5fb939ad320545e65f4a3bf..b06ed3578df7224119b859695a7c45d76f0dee65 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -215,7 +215,11 @@ struct ConcatTensorsForAllReduce {
     for (const auto &tensor : dense_tensors_) {
       const uint8_t *in_data = reinterpret_cast<const uint8_t *>(tensor.data());
       auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyH2D(out_data + offset, in_data, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data + offset, in_data, sz, &stream);
+      }
       offset += sz;
     }
   }
@@ -237,7 +241,11 @@ struct SplitTensorsForAllReduce {
     for (auto &tensor : *p_dense_tensors) {
       uint8_t *out_data = reinterpret_cast<uint8_t *>(tensor.data());
       auto sz = tensor.numel() * sizeof(T);
-      device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      if (tensor.place().GetType() == phi::AllocationType::CPU) {
+        device->MemoryCopyD2H(out_data, in_data + offset, sz, &stream);
+      } else {
+        device->MemoryCopyD2D(out_data, in_data + offset, sz, &stream);
+      }
       offset += sz;
     }
   }
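
For context on the change: both hunks replace an unconditional device-to-device copy with a dispatch on each tensor's placement, so gradients that live in host memory are gathered into the fused buffer with an H2D copy and scattered back with a D2H copy, while device-resident tensors keep using D2D. The sketch below is a minimal, self-contained illustration of that dispatch for the concat side only; `MockDevice`, `MockTensor`, `AllocationType`, and `ConcatForAllReduce` are hypothetical stand-ins rather than Paddle's real types, and only the `MemoryCopyH2D`/`MemoryCopyD2D` method names come from the patch.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-in for phi::AllocationType.
enum class AllocationType { CPU, CUSTOM };

// Hypothetical stand-in for the custom-device copy interface; in the real
// runtime these would issue (possibly async) copies on `stream`, here we
// just memcpy and log which direction was chosen.
struct MockDevice {
  void MemoryCopyH2D(void *dst, const void *src, size_t n, void * /*stream*/) {
    std::memcpy(dst, src, n);
    std::printf("H2D copy of %zu bytes\n", n);
  }
  void MemoryCopyD2D(void *dst, const void *src, size_t n, void * /*stream*/) {
    std::memcpy(dst, src, n);
    std::printf("D2D copy of %zu bytes\n", n);
  }
};

// Hypothetical stand-in for a dense tensor: a placement plus raw bytes.
struct MockTensor {
  AllocationType place;        // where the tensor's memory lives
  std::vector<uint8_t> bytes;  // payload, stands in for tensor.data()
};

// Concat step: pack every tensor into one contiguous buffer, choosing the
// copy direction from the source tensor's placement, as the patch does.
void ConcatForAllReduce(MockDevice *device,
                        const std::vector<MockTensor> &tensors,
                        std::vector<uint8_t> *contents,
                        void *stream) {
  size_t offset = 0;
  for (const auto &tensor : tensors) {
    uint8_t *out = contents->data() + offset;
    size_t sz = tensor.bytes.size();
    if (tensor.place == AllocationType::CPU) {
      // Host-resident source: host-to-device copy.
      device->MemoryCopyH2D(out, tensor.bytes.data(), sz, stream);
    } else {
      // Device-resident source: device-to-device copy.
      device->MemoryCopyD2D(out, tensor.bytes.data(), sz, stream);
    }
    offset += sz;
  }
}

int main() {
  MockDevice device;
  std::vector<MockTensor> tensors = {
      {AllocationType::CPU, std::vector<uint8_t>(8, 1)},      // host gradient
      {AllocationType::CUSTOM, std::vector<uint8_t>(16, 2)},  // device gradient
  };
  std::vector<uint8_t> contents(24);  // fused buffer, sized to the sum
  ConcatForAllReduce(&device, tensors, &contents, nullptr);
  return 0;
}
```

The split side in the second hunk is the mirror image: the fused buffer is the source, so a CPU-placed destination tensor takes a `MemoryCopyD2H` instead of `MemoryCopyH2D`, and the offset bookkeeping is otherwise identical.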