Unverified commit ea04bef8, authored by Kim Yann, committed by GitHub

rem cncl keyword in py (#52939)

Parent commit: d2b0d63f
......@@ -757,7 +757,7 @@ def launch():
check_backend(args.backend)
distribute_mode = DistributeMode.COLLECTIVE
# assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown']
# assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown']
if args.backend == 'gloo':
logger.warning("launch start with CPUONLY mode")
......
......@@ -1986,7 +1986,6 @@ def check_backend(backend):
'nccl',
'gloo',
'bkcl',
'cncl',
'auto',
'heter',
'xccl',
......
......@@ -886,7 +886,7 @@ def _start_kv_server(port, http_server_d, size):
def _is_cpuonly(backend):
check_backend(backend)
if (
backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl']
backend in ['auto', 'nccl', 'bkcl', 'heter']
and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
) or backend == 'xccl':
......
......@@ -126,8 +126,6 @@ def _get_default_backend():
return 'nccl'
elif 'xpu' in device:
return 'bkcl'
elif 'mlu' in device:
return 'cncl'
elif 'cpu' in device:
return 'gloo'
else:
......@@ -259,45 +257,6 @@ def _get_subprocess_env_list(nprocs, options):
"XPU_VISIBLE_DEVICES (%s)."
% (card_id, ",".join(env_devices_list))
)
elif options['backend'] == 'cncl':
args.selected_devices = options.get('mlus', None)
if args.selected_devices is None:
args.selected_devices = options.get('selected_devices', None)
env_devices = os.getenv("MLU_VISIBLE_DEVICES", None)
if env_devices is None or env_devices == "":
env_devices_list = [
str(x) for x in range(core.get_custom_device_count('mlu'))
]
else:
env_devices_list = env_devices.split(',')
if args.selected_devices is None:
if len(env_devices_list) < nprocs:
raise RuntimeError(
"the number of visible devices(%d) is less than the number "
"of spawn processes(%d), please ensure that the correct "
"`nprocs` argument is passed or the environment variable "
"`MLU_VISIBLE_DEVICES` is correctly configured."
% (len(env_devices_list), nprocs)
)
args.selected_devices = ",".join(
[str(env_devices_list[x]) for x in range(0, nprocs)]
)
else:
selected_device_list = args.selected_devices.split(',')
if len(selected_device_list) != nprocs:
raise ValueError(
"The number of selected devices(%s) is not equal to "
"the number of spawn processes(%d), please ensure that the "
"correct `nprocs` and `mlus` arguments are passed."
% (len(selected_device_list), nprocs)
)
for card_id in selected_device_list:
if card_id not in env_devices_list:
raise ValueError(
"The selected mlu card %s cannot found in "
"MLU_VISIBLE_DEVICES (%s)."
% (card_id, ",".join(env_devices_list))
)
elif options['backend'] == 'gloo':
# TODO check gpu / xpu flag must not exist
warnings.warn(
......@@ -372,8 +331,6 @@ def _set_trainer_env(env_dict, backend):
set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']})
elif backend == 'bkcl':
set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']})
elif backend == 'cncl':
set_flags({'FLAGS_selected_mlus': env_dict['FLAGS_selected_mlus']})
else:
# NOTE(xiongkun) why not raise Error ?
# So far, we added support for CPU parallel, and will be applied when paddle is not
......
......@@ -427,15 +427,6 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
}
elif backend == 'cncl':
proc_env = {
"FLAGS_selected_mlus": "%s"
% ",".join([str(g) for g in trainer.gpus]),
"PADDLE_TRAINER_ID": "%d" % trainer.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
}
elif backend == 'gloo':
# NOTE (xiongkun) default fall back into cpu only
proc_env = {
......
Markdown is supported
0% uploaded.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To leave a comment, please register.