[CustomDevice] fix recompute (#53718)

2f56b6da · ronnywang · GitHub · 793f3b93 · 2f56b6da · 2f56b6da
2 changed files
--- a/paddle/fluid/distributed/collective/process_group_custom.cc
+++ b/paddle/fluid/distributed/collective/process_group_custom.cc
@@ -125,12 +125,14 @@ void ProcessGroupCustom::BroadcastUniqueCustomID(
    std::vector<phi::ccl::CCLRootId>& ccl_ids) {  // NOLINT
  if (rank_ == 0) {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
      store_->set(key, ccl_ids[i]);
    }
  } else {
    for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
      ccl_ids[i] = store_->get(key);
    }
  }

--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -110,7 +110,10 @@ class _HPRecomputeFunction(PyLayer):
        cur_device = paddle.get_device()
        assert (
-            'gpu:' in paddle.get_device() or 'xpu:' in paddle.get_device()
+            'gpu:' in paddle.get_device()
+            or 'xpu:' in paddle.get_device()
+            or cur_device.split(':')[0]
+            in paddle.device.get_all_custom_device_type()
        ), "Recompute with RNG is not support current device: {}.".format(
            cur_device
        )