未验证 提交 ea04bef8 编写于 作者: K Kim Yann 提交者: GitHub

rem cncl keyword in py (#52939)

上级 d2b0d63f
...@@ -757,7 +757,7 @@ def launch(): ...@@ -757,7 +757,7 @@ def launch():
check_backend(args.backend) check_backend(args.backend)
distribute_mode = DistributeMode.COLLECTIVE distribute_mode = DistributeMode.COLLECTIVE
# assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown'] # assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown']
if args.backend == 'gloo': if args.backend == 'gloo':
logger.warning("launch start with CPUONLY mode") logger.warning("launch start with CPUONLY mode")
......
...@@ -1986,7 +1986,6 @@ def check_backend(backend): ...@@ -1986,7 +1986,6 @@ def check_backend(backend):
'nccl', 'nccl',
'gloo', 'gloo',
'bkcl', 'bkcl',
'cncl',
'auto', 'auto',
'heter', 'heter',
'xccl', 'xccl',
......
...@@ -886,7 +886,7 @@ def _start_kv_server(port, http_server_d, size): ...@@ -886,7 +886,7 @@ def _start_kv_server(port, http_server_d, size):
def _is_cpuonly(backend): def _is_cpuonly(backend):
check_backend(backend) check_backend(backend)
if ( if (
backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl'] backend in ['auto', 'nccl', 'bkcl', 'heter']
and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()) and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu())
) or backend == 'xccl': ) or backend == 'xccl':
......
...@@ -126,8 +126,6 @@ def _get_default_backend(): ...@@ -126,8 +126,6 @@ def _get_default_backend():
return 'nccl' return 'nccl'
elif 'xpu' in device: elif 'xpu' in device:
return 'bkcl' return 'bkcl'
elif 'mlu' in device:
return 'cncl'
elif 'cpu' in device: elif 'cpu' in device:
return 'gloo' return 'gloo'
else: else:
...@@ -259,45 +257,6 @@ def _get_subprocess_env_list(nprocs, options): ...@@ -259,45 +257,6 @@ def _get_subprocess_env_list(nprocs, options):
"XPU_VISIBLE_DEVICES (%s)." "XPU_VISIBLE_DEVICES (%s)."
% (card_id, ",".join(env_devices_list)) % (card_id, ",".join(env_devices_list))
) )
elif options['backend'] == 'cncl':
args.selected_devices = options.get('mlus', None)
if args.selected_devices is None:
args.selected_devices = options.get('selected_devices', None)
env_devices = os.getenv("MLU_VISIBLE_DEVICES", None)
if env_devices is None or env_devices == "":
env_devices_list = [
str(x) for x in range(core.get_custom_device_count('mlu'))
]
else:
env_devices_list = env_devices.split(',')
if args.selected_devices is None:
if len(env_devices_list) < nprocs:
raise RuntimeError(
"the number of visible devices(%d) is less than the number "
"of spawn processes(%d), please ensure that the correct "
"`nprocs` argument is passed or the environment variable "
"`MLU_VISIBLE_DEVICES` is correctly configured."
% (len(env_devices_list), nprocs)
)
args.selected_devices = ",".join(
[str(env_devices_list[x]) for x in range(0, nprocs)]
)
else:
selected_device_list = args.selected_devices.split(',')
if len(selected_device_list) != nprocs:
raise ValueError(
"The number of selected devices(%s) is not equal to "
"the number of spawn processes(%d), please ensure that the "
"correct `nprocs` and `mlus` arguments are passed."
% (len(selected_device_list), nprocs)
)
for card_id in selected_device_list:
if card_id not in env_devices_list:
raise ValueError(
"The selected mlu card %s cannot found in "
"MLU_VISIBLE_DEVICES (%s)."
% (card_id, ",".join(env_devices_list))
)
elif options['backend'] == 'gloo': elif options['backend'] == 'gloo':
# TODO check gpu / xpu flag must not exist # TODO check gpu / xpu flag must not exist
warnings.warn( warnings.warn(
...@@ -372,8 +331,6 @@ def _set_trainer_env(env_dict, backend): ...@@ -372,8 +331,6 @@ def _set_trainer_env(env_dict, backend):
set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']}) set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']})
elif backend == 'bkcl': elif backend == 'bkcl':
set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']}) set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']})
elif backend == 'cncl':
set_flags({'FLAGS_selected_mlus': env_dict['FLAGS_selected_mlus']})
else: else:
# NOTE(xiongkun) why not raise Error ? # NOTE(xiongkun) why not raise Error ?
# So far, we added support for CPU parallel, and will be applied when paddle is not # So far, we added support for CPU parallel, and will be applied when paddle is not
......
...@@ -427,15 +427,6 @@ def _prepare_trainer_env(cluster, trainer, backend=None): ...@@ -427,15 +427,6 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
} }
elif backend == 'cncl':
proc_env = {
"FLAGS_selected_mlus": "%s"
% ",".join([str(g) for g in trainer.gpus]),
"PADDLE_TRAINER_ID": "%d" % trainer.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
}
elif backend == 'gloo': elif backend == 'gloo':
# NOTE (xiongkun) default fall back into cpu only # NOTE (xiongkun) default fall back into cpu only
proc_env = { proc_env = {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册