diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index a55a47943b3d7bc44d8b5d62ee0cf70124767703..4a334281e90d12ab4a34752eba719425749ad537 100755
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -757,7 +757,7 @@ def launch():
         check_backend(args.backend)
         distribute_mode = DistributeMode.COLLECTIVE
 
-    # assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown']
+    # assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown']
 
     if args.backend == 'gloo':
         logger.warning("launch start with CPUONLY mode")
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 2638a42a6c5cc7ca3c2a83124ad25631cec72565..a1a6cdb9c636f4648ab8bcc84dc6bb46bedc07b6 100755
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -1986,7 +1986,6 @@ def check_backend(backend):
         'nccl',
         'gloo',
         'bkcl',
-        'cncl',
         'auto',
         'heter',
         'xccl',
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 24fdff7a7b0309393b0008da82d8ec3282ace046..2046cf08eb06ec88f1dcbc6e946ee00c3f72b676 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -886,7 +886,7 @@ def _start_kv_server(port, http_server_d, size):
 def _is_cpuonly(backend):
     check_backend(backend)
     if (
-        backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl']
+        backend in ['auto', 'nccl', 'bkcl', 'heter']
         and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
     ) or backend == 'xccl':
 
diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index 38b679001dde14d039797ef1ecd48fefb9176cdf..2a4ede33866e2dc5bb3fbe3db7b860a2c57f6aa7 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -126,8 +126,6 @@ def _get_default_backend():
         return 'nccl'
     elif 'xpu' in device:
         return 'bkcl'
-    elif 'mlu' in device:
-        return 'cncl'
     elif 'cpu' in device:
         return 'gloo'
     else:
@@ -259,45 +257,6 @@ def _get_subprocess_env_list(nprocs, options):
                         "XPU_VISIBLE_DEVICES (%s)."
                         % (card_id, ",".join(env_devices_list))
                     )
-    elif options['backend'] == 'cncl':
-        args.selected_devices = options.get('mlus', None)
-        if args.selected_devices is None:
-            args.selected_devices = options.get('selected_devices', None)
-        env_devices = os.getenv("MLU_VISIBLE_DEVICES", None)
-        if env_devices is None or env_devices == "":
-            env_devices_list = [
-                str(x) for x in range(core.get_custom_device_count('mlu'))
-            ]
-        else:
-            env_devices_list = env_devices.split(',')
-        if args.selected_devices is None:
-            if len(env_devices_list) < nprocs:
-                raise RuntimeError(
-                    "the number of visible devices(%d) is less than the number "
-                    "of spawn processes(%d), please ensure that the correct "
-                    "`nprocs` argument is passed or the environment variable "
-                    "`MLU_VISIBLE_DEVICES` is correctly configured."
-                    % (len(env_devices_list), nprocs)
-                )
-            args.selected_devices = ",".join(
-                [str(env_devices_list[x]) for x in range(0, nprocs)]
-            )
-        else:
-            selected_device_list = args.selected_devices.split(',')
-            if len(selected_device_list) != nprocs:
-                raise ValueError(
-                    "The number of selected devices(%s) is not equal to "
-                    "the number of spawn processes(%d), please ensure that the "
-                    "correct `nprocs` and `mlus` arguments are passed."
-                    % (len(selected_device_list), nprocs)
-                )
-            for card_id in selected_device_list:
-                if card_id not in env_devices_list:
-                    raise ValueError(
-                        "The selected mlu card %s cannot found in "
-                        "MLU_VISIBLE_DEVICES (%s)."
-                        % (card_id, ",".join(env_devices_list))
-                    )
     elif options['backend'] == 'gloo':
         # TODO check gpu / xpu flag must not exist
         warnings.warn(
@@ -372,8 +331,6 @@ def _set_trainer_env(env_dict, backend):
         set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']})
     elif backend == 'bkcl':
         set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']})
-    elif backend == 'cncl':
-        set_flags({'FLAGS_selected_mlus': env_dict['FLAGS_selected_mlus']})
     else:
         # NOTE(xiongkun) why not raise Error ?
         # So far, we added support for CPU parallel, and will be applied when paddle is not
diff --git a/python/paddle/distributed/utils/launch_utils.py b/python/paddle/distributed/utils/launch_utils.py
index 50092500534ab5f5c867fb7c70a21863827c0e35..634188ab8e51e6dc7c5b6b092e8d83f5d2002d94 100644
--- a/python/paddle/distributed/utils/launch_utils.py
+++ b/python/paddle/distributed/utils/launch_utils.py
@@ -427,15 +427,6 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
         }
-    elif backend == 'cncl':
-        proc_env = {
-            "FLAGS_selected_mlus": "%s"
-            % ",".join([str(g) for g in trainer.gpus]),
-            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
-            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
-            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
-        }
     elif backend == 'gloo':
         # NOTE (xiongkun) default fall back into cpu only
         proc_env = {
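Note (not part of the patch): with 'cncl' dropped from check_backend, any caller that still passes backend='cncl' will be rejected by the validation shown in the launch_utils.py hunk. The sketch below is a minimal, hypothetical caller-side mirror of that check; VALID_BACKENDS lists only the values visible in this diff (the real list in check_backend may contain additional entries not shown here), and validate_backend is not a Paddle API.

    # Hypothetical sketch -- mirrors the backend values still visible in the
    # check_backend() hunk above; not part of this patch, and the real list
    # may contain entries that this diff does not show.
    VALID_BACKENDS = ('nccl', 'gloo', 'bkcl', 'auto', 'heter', 'xccl')


    def validate_backend(backend):
        # 'cncl' (the Cambricon MLU backend) is no longer accepted.
        if backend not in VALID_BACKENDS:
            raise ValueError(
                "expected one of %s, got %r"
                % (", ".join(VALID_BACKENDS), backend)
            )
        return backend


    validate_backend('nccl')  # ok
    validate_backend('cncl')  # raises ValueError after this change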