From 0ba4a234ef056a6a6b3ee3f2579589d0fa956516 Mon Sep 17 00:00:00 2001
From: Kai Song <50285351+USTCKAY@users.noreply.github.com>
Date: Thu, 17 Aug 2023 10:51:42 +0800
Subject: [PATCH] [Custom Device]add run_check support for custom device (#56318)

* [Custom Device]add run_check support for custom device

* fix error msg

* fix typo

* update for all custom device

* fix

* add warning msg
---
 python/paddle/distributed/spawn.py           | 27 ++++++++++++++
 .../paddle/distributed/utils/launch_utils.py | 12 +++++++
 python/paddle/utils/install_check.py         | 36 ++++++++++++++++---
 3 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index b378e27e4d3..0bc68c34c20 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -110,6 +110,8 @@ def _get_default_nprocs():
         return core.get_xpu_device_count()
     elif 'cpu' in device:
         return multiprocessing.cpu_count()
+    elif device in core.get_available_custom_device():
+        return core.get_custom_device_count(device.split(":")[0])
     else:
         raise RuntimeError(
             "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +128,8 @@ def _get_default_backend():
         return 'bkcl'
     elif 'cpu' in device:
         return 'gloo'
+    elif device in core.get_available_custom_device():
+        return 'xccl'
     else:
         raise RuntimeError(
             "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +279,29 @@ def _get_subprocess_env_list(nprocs, options):
         assert (
             _get_trainers_num() == 1
         ), "CPUONLY spawn doesn't support multi-trainer"
+    elif options['backend'] == 'xccl':
+        args.selected_devices = None
+        custom_device_name = core.get_all_custom_device_type()[0]
+        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
+        if env_devices is None or env_devices == "":
+            env_devices_list = [
+                str(x)
+                for x in range(core.get_custom_device_count(custom_device_name))
+            ]
+        else:
+            env_devices_list = env_devices.split(',')
+
+        if len(env_devices_list) < nprocs:
+            raise RuntimeError(
+                "the number of visible devices(%d) is less than the number "
+                "of spawn processes(%d), please ensure that the correct "
+                "`nprocs` argument is passed or the environment variable "
+                "`FLAGS_selected_%ss` is correctly configured."
+                % (len(env_devices_list), nprocs, custom_device_name)
+            )
+        args.selected_devices = ",".join(
+            [str(env_devices_list[x]) for x in range(0, nprocs)]
+        )

     # set other inner args
     args.node_ip = options.get('node_ip', None)
diff --git a/python/paddle/distributed/utils/launch_utils.py b/python/paddle/distributed/utils/launch_utils.py
index b23beae8a63..54e265b068d 100644
--- a/python/paddle/distributed/utils/launch_utils.py
+++ b/python/paddle/distributed/utils/launch_utils.py
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
             "PADDLE_DISTRI_BACKEND": backend,  # only add here, other will be auto
         }
+    elif backend == 'xccl':
+        from paddle.framework import core
+
+        custom_device_name = core.get_all_custom_device_type()[0]
+        proc_env = {
+            f"FLAGS_selected_{custom_device_name}s": "%s"
+            % ",".join([str(g) for g in trainer.gpus]),
+            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
+            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
+            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
+            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
+        }
     else:
         raise ValueError("backend must be one of 'gloo, nccl, bkcl'")

diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py
index 9fd42947e0a..a803962f3e8 100644
--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -81,7 +81,7 @@ def _is_xpu_available():
         return False


-def _run_dygraph_single(use_cuda, use_xpu):
+def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
     """
     Testing the simple network in dygraph mode using one CPU/GPU/XPU.

@@ -94,6 +94,8 @@
         paddle.set_device('gpu')
     elif use_xpu:
         paddle.set_device('xpu')
+    elif use_custom:
+        paddle.set_device(custom_device_name)
     else:
         paddle.set_device('cpu')
     weight_attr = paddle.ParamAttr(
@@ -116,7 +118,7 @@
     opt.step()


-def _run_static_single(use_cuda, use_xpu):
+def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
     """
     Testing the simple network with executor running directly, using one CPU/GPU/XPU.
@@ -139,6 +141,8 @@ def _run_static_single(use_cuda, use_xpu):
         place = paddle.CUDAPlace(0)
     elif use_xpu:
         place = paddle.XPUPlace(0)
+    elif use_custom:
+        place = paddle.CustomPlace(custom_device_name, 0)
     else:
         place = paddle.CPUPlace()

@@ -229,11 +233,21 @@ def run_check():

     use_cuda = False
     use_xpu = False
+    use_custom = False
+    custom_device_name = None

     if paddle.is_compiled_with_cuda():
         use_cuda = _is_cuda_available()
     elif paddle.is_compiled_with_xpu():
         use_xpu = _is_xpu_available()
+    elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
+        use_custom = True
+        if len(paddle.framework.core.get_all_custom_device_type()) > 1:
+            logging.warning(
+                "More than one kind of custom devices detected, but run check would only be executed on {}.".format(
+                    paddle.framework.core.get_all_custom_device_type()[0]
+                )
+            )

     if use_cuda:
         device_str = "GPU"
@@ -241,17 +255,31 @@ def run_check():
     elif use_xpu:
         device_str = "XPU"
         device_list = paddle.static.xpu_places()
+    elif use_custom:
+        device_str = paddle.framework.core.get_all_custom_device_type()[0]
+        custom_device_name = device_str
+        device_list = list(
+            range(
+                paddle.framework.core.get_custom_device_count(
+                    custom_device_name
+                )
+            )
+        )
     else:
         device_str = "CPU"
         device_list = paddle.static.cpu_places(device_count=1)
     device_count = len(device_list)

-    _run_static_single(use_cuda, use_xpu)
-    _run_dygraph_single(use_cuda, use_xpu)
+    _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
+    _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
     print(f"PaddlePaddle works well on 1 {device_str}.")

     try:
         if len(device_list) > 1:
+            if use_custom:
+                import os
+
+                os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
             _run_parallel(device_list)
             print(
                 "PaddlePaddle works well on {} {}s.".format(
--
GitLab
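
Usage sketch (editor's note, not part of the patch above): with this change, paddle.utils.run_check() also covers installs that are not built with CUDA or XPU but have a CustomDevice plugin registered. Below is a minimal sketch of exercising that path; the plugin and its type name "npu" are assumptions, any registered custom device type behaves the same way.

    import paddle

    # The new branch in run_check() triggers when at least one custom device
    # type is registered (and warns if more than one is, checking only the first).
    print(paddle.framework.core.get_all_custom_device_type())

    # run_check() now builds the device list from get_custom_device_count(),
    # runs the single-card static and dygraph checks on CustomPlace(name, 0),
    # and sets PADDLE_DISTRI_BACKEND=xccl before the multi-card check.
    paddle.utils.run_check()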
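The spawn.py changes let paddle.distributed.spawn derive both the process count and the communication backend from a custom device. A hedged sketch, assuming a plugin registered under the hypothetical type name "npu" and a user-defined train function:

    import paddle
    import paddle.distributed as dist

    def train(print_result=False):
        # Each spawned process joins the xccl process group; its visible
        # card comes from FLAGS_selected_npus set by the parent process.
        dist.init_parallel_env()
        data = paddle.randn([2, 4])
        out = paddle.nn.Linear(4, 4)(data)
        if print_result:
            print(out.numpy().sum())

    if __name__ == '__main__':
        # With the patch, nprocs defaults to get_custom_device_count("npu")
        # and backend defaults to 'xccl'; both are passed explicitly here.
        dist.spawn(train, args=(True,), nprocs=2, backend='xccl')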
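Device visibility for the spawned processes is controlled by the FLAGS_selected_<type>s environment variable (for example FLAGS_selected_npus=0,1), and the patch raises a RuntimeError when fewer devices are visible than processes requested. The following standalone sketch mirrors that validation; the helper name, the "npu" type name, and the total fallback are hypothetical, the real code queries core.get_custom_device_count() instead.

    import os

    def pick_selected_devices(nprocs, custom_device_name="npu", total=8):
        # Fall back to all devices when the flag is unset, otherwise
        # honour the comma-separated list, as the patched code does.
        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", "")
        if env_devices:
            devices = env_devices.split(',')
        else:
            devices = [str(x) for x in range(total)]
        if len(devices) < nprocs:
            raise RuntimeError(
                f"only {len(devices)} visible {custom_device_name} devices "
                f"for {nprocs} spawned processes"
            )
        return ",".join(devices[:nprocs])

    os.environ["FLAGS_selected_npus"] = "0,1,2,3"
    print(pick_selected_devices(nprocs=2))  # -> 0,1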
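On the launcher side, _prepare_trainer_env now assembles the per-trainer environment for the xccl backend from the cluster description. A sketch of what trainer 0 of a hypothetical two-card "npu" job on one host would receive; the endpoint values are illustrative only:

    # Built by the patched _prepare_trainer_env when backend == 'xccl';
    # FLAGS_selected_npus carries the card(s) assigned to this trainer.
    proc_env = {
        "FLAGS_selected_npus": "0",
        "PADDLE_TRAINER_ID": "0",
        "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6070",
        "PADDLE_TRAINERS_NUM": "2",
        "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6070,127.0.0.1:6071",
    }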