Unverified commit 0ba4a234 authored by Kai Song, committed by GitHub

[Custom Device]add run_check support for custom device (#56318)

* [Custom Device] add run_check support for custom device

* fix error msg

* fix typo

* update for all custom device

* fix

* add warning msg
Parent 2abf4326
@@ -110,6 +110,8 @@ def _get_default_nprocs():
        return core.get_xpu_device_count()
    elif 'cpu' in device:
        return multiprocessing.cpu_count()
    elif device in core.get_available_custom_device():
        return core.get_custom_device_count(device.split(":")[0])
    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +128,8 @@ def _get_default_backend():
        return 'bkcl'
    elif 'cpu' in device:
        return 'gloo'
    elif device in core.get_available_custom_device():
        return 'xccl'
    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +279,29 @@ def _get_subprocess_env_list(nprocs, options):
        assert (
            _get_trainers_num() == 1
        ), "CPUONLY spawn doesn't support multi-trainer"
    elif options['backend'] == 'xccl':
        args.selected_devices = None
        custom_device_name = core.get_all_custom_device_type()[0]
        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x)
                for x in range(core.get_custom_device_count(custom_device_name))
            ]
        else:
            env_devices_list = env_devices.split(',')
        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`FLAGS_selected_%ss` is correctly configured."
                % (len(env_devices_list), nprocs, custom_device_name)
            )
        args.selected_devices = ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)]
        )
    # set other inner args
    args.node_ip = options.get('node_ip', None)
......
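For reference, a standalone sketch of the device-selection logic added above, with hypothetical values in place of the real core queries (the plugin name "npu" and the hard-coded device count are assumptions, not part of this diff):

import os

custom_device_name = "npu"  # hypothetical custom device plugin name
nprocs = 2
visible_device_count = 4  # the real code asks core.get_custom_device_count(custom_device_name)

env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
if env_devices is None or env_devices == "":
    env_devices_list = [str(x) for x in range(visible_device_count)]
else:
    env_devices_list = env_devices.split(',')

if len(env_devices_list) < nprocs:
    raise RuntimeError("fewer visible devices than spawned processes")

selected_devices = ",".join(env_devices_list[:nprocs])
print(selected_devices)  # e.g. "0,1"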
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
            "PADDLE_DISTRI_BACKEND": backend, # only add here, other will be auto
        }
    elif backend == 'xccl':
        from paddle.framework import core
        custom_device_name = core.get_all_custom_device_type()[0]
        proc_env = {
            f"FLAGS_selected_{custom_device_name}s": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    else:
        raise ValueError("backend must be one of 'gloo', 'nccl', 'bkcl', 'xccl'")
......
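How a user might reach this code path once a custom device plugin is installed; a hedged sketch only, which assumes at least two visible custom devices and relies on the 'xccl' backend being resolved automatically by _get_default_backend above:

import paddle.distributed as dist

def train():
    # with a custom device plugin registered, spawn selects the 'xccl' backend
    dist.init_parallel_env()
    print("worker", dist.get_rank(), "is up")

if __name__ == '__main__':
    dist.spawn(train, nprocs=2)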
@@ -81,7 +81,7 @@ def _is_xpu_available():
        return False
def _run_dygraph_single(use_cuda, use_xpu):
def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
    """
    Testing the simple network in dygraph mode using one CPU/GPU/XPU/CustomDevice.
@@ -94,6 +94,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
        paddle.set_device('gpu')
    elif use_xpu:
        paddle.set_device('xpu')
    elif use_custom:
        paddle.set_device(custom_device_name)
    else:
        paddle.set_device('cpu')
    weight_attr = paddle.ParamAttr(
@@ -116,7 +118,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
    opt.step()
def _run_static_single(use_cuda, use_xpu):
def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
    """
    Testing the simple network with executor running directly, using one CPU/GPU/XPU/CustomDevice.
@@ -139,6 +141,8 @@ def _run_static_single(use_cuda, use_xpu):
            place = paddle.CUDAPlace(0)
        elif use_xpu:
            place = paddle.XPUPlace(0)
        elif use_custom:
            place = paddle.CustomPlace(custom_device_name, 0)
        else:
            place = paddle.CPUPlace()
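Outside of run_check, the same place can be constructed directly; a minimal sketch that assumes a plugin registered under the hypothetical name "npu":

import paddle

paddle.enable_static()
place = paddle.CustomPlace("npu", 0)  # counterpart of CUDAPlace(0) / XPUPlace(0)
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())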
......@@ -229,11 +233,21 @@ def run_check():
use_cuda = False
use_xpu = False
use_custom = False
custom_device_name = None
if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available()
elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
use_custom = True
if len(paddle.framework.core.get_all_custom_device_type()) > 1:
logging.warning(
"More than one kind of custom devices detected, but run check would only be executed on {}.".format(
paddle.framework.core.get_all_custom_device_type()[0]
)
)
if use_cuda:
device_str = "GPU"
@@ -241,17 +255,31 @@ def run_check():
    elif use_xpu:
        device_str = "XPU"
        device_list = paddle.static.xpu_places()
    elif use_custom:
        device_str = paddle.framework.core.get_all_custom_device_type()[0]
        custom_device_name = device_str
        device_list = list(
            range(
                paddle.framework.core.get_custom_device_count(
                    custom_device_name
                )
            )
        )
    else:
        device_str = "CPU"
        device_list = paddle.static.cpu_places(device_count=1)
    device_count = len(device_list)
    _run_static_single(use_cuda, use_xpu)
    _run_dygraph_single(use_cuda, use_xpu)
    _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
    _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
    print(f"PaddlePaddle works well on 1 {device_str}.")
    try:
        if len(device_list) > 1:
            if use_custom:
                import os
                os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
            _run_parallel(device_list)
            print(
                "PaddlePaddle works well on {} {}s.".format(
......
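With this change, the health check itself stays a one-liner; the device string in the report now comes from the registered custom device type instead of "GPU"/"XPU" (the output shown below is approximate):

import paddle

paddle.utils.run_check()
# Running verify PaddlePaddle program ...
# PaddlePaddle works well on 1 <custom device type>.
# PaddlePaddle works well on N <custom device type>s.  (when more than one device is visible)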