Unverified commit 0ba4a234 authored by Kai Song, committed by GitHub

[Custom Device]add run_check support for custom device (#56318)

* [Custom Device] add run_check support for custom device

* fix error msg

* fix typo

* update for all custom device

* fix

* add warning msg
Parent 2abf4326
@@ -110,6 +110,8 @@ def _get_default_nprocs():
        return core.get_xpu_device_count()
    elif 'cpu' in device:
        return multiprocessing.cpu_count()
    elif device in core.get_available_custom_device():
        return core.get_custom_device_count(device.split(":")[0])
    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +128,8 @@ def _get_default_backend():
        return 'bkcl'
    elif 'cpu' in device:
        return 'gloo'
    elif device in core.get_available_custom_device():
        return 'xccl'
    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +279,29 @@ def _get_subprocess_env_list(nprocs, options):
        assert (
            _get_trainers_num() == 1
        ), "CPUONLY spawn doesn't support multi-trainer"
    elif options['backend'] == 'xccl':
        args.selected_devices = None
        custom_device_name = core.get_all_custom_device_type()[0]
        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x)
                for x in range(core.get_custom_device_count(custom_device_name))
            ]
        else:
            env_devices_list = env_devices.split(',')
        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`FLAGS_selected_%ss` is correctly configured."
                % (len(env_devices_list), nprocs, custom_device_name)
            )
        args.selected_devices = ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)]
        )
    # set other inner args
    args.node_ip = options.get('node_ip', None)
......
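For reference, a standalone sketch of the device-selection logic added above, with hypothetical values in place of the real core queries (the plugin name "npu" and the hard-coded device count are assumptions, not part of this diff):

import os

custom_device_name = "npu"  # hypothetical custom device plugin name
nprocs = 2
visible_device_count = 4  # the real code asks core.get_custom_device_count(custom_device_name)

env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
if env_devices is None or env_devices == "":
    env_devices_list = [str(x) for x in range(visible_device_count)]
else:
    env_devices_list = env_devices.split(',')

if len(env_devices_list) < nprocs:
    raise RuntimeError("fewer visible devices than spawned processes")

selected_devices = ",".join(env_devices_list[:nprocs])
print(selected_devices)  # e.g. "0,1"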
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
            "PADDLE_DISTRI_BACKEND": backend, # only add here, other will be auto
        }
    elif backend == 'xccl':
        from paddle.framework import core
        custom_device_name = core.get_all_custom_device_type()[0]
        proc_env = {
            f"FLAGS_selected_{custom_device_name}s": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    else:
        raise ValueError("backend must be one of 'gloo', 'nccl', 'bkcl', 'xccl'")
......
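How a user might reach this code path once a custom device plugin is installed; a hedged sketch only, which assumes at least two visible custom devices and relies on the 'xccl' backend being resolved automatically by _get_default_backend above:

import paddle.distributed as dist

def train():
    # with a custom device plugin registered, spawn selects the 'xccl' backend
    dist.init_parallel_env()
    print("worker", dist.get_rank(), "is up")

if __name__ == '__main__':
    dist.spawn(train, nprocs=2)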
@@ -81,7 +81,7 @@ def _is_xpu_available():
        return False
def _run_dygraph_single(use_cuda, use_xpu):
def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
    """
    Testing the simple network in dygraph mode using one CPU/GPU/XPU/CustomDevice.
@@ -94,6 +94,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
        paddle.set_device('gpu')
    elif use_xpu:
        paddle.set_device('xpu')
    elif use_custom:
        paddle.set_device(custom_device_name)
    else:
        paddle.set_device('cpu')
    weight_attr = paddle.ParamAttr(
@@ -116,7 +118,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
    opt.step()
def _run_static_single(use_cuda, use_xpu):
def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
    """
    Testing the simple network with executor running directly, using one CPU/GPU/XPU/CustomDevice.
@@ -139,6 +141,8 @@ def _run_static_single(use_cuda, use_xpu):
            place = paddle.CUDAPlace(0)
        elif use_xpu:
            place = paddle.XPUPlace(0)
        elif use_custom:
            place = paddle.CustomPlace(custom_device_name, 0)
        else:
            place = paddle.CPUPlace()
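Outside of run_check, the same place can be constructed directly; a minimal sketch that assumes a plugin registered under the hypothetical name "npu":

import paddle

paddle.enable_static()
place = paddle.CustomPlace("npu", 0)  # counterpart of CUDAPlace(0) / XPUPlace(0)
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())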
......@@ -229,11 +233,21 @@ def run_check():
use_cuda = False
use_xpu = False
use_custom = False
custom_device_name = None
if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available()
elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
use_custom = True
if len(paddle.framework.core.get_all_custom_device_type()) > 1:
logging.warning(
"More than one kind of custom devices detected, but run check would only be executed on {}.".format(
paddle.framework.core.get_all_custom_device_type()[0]
)
)
if use_cuda:
device_str = "GPU"
@@ -241,17 +255,31 @@ def run_check():
    elif use_xpu:
        device_str = "XPU"
        device_list = paddle.static.xpu_places()
    elif use_custom:
        device_str = paddle.framework.core.get_all_custom_device_type()[0]
        custom_device_name = device_str
        device_list = list(
            range(
                paddle.framework.core.get_custom_device_count(
                    custom_device_name
                )
            )
        )
    else:
        device_str = "CPU"
        device_list = paddle.static.cpu_places(device_count=1)
    device_count = len(device_list)
    _run_static_single(use_cuda, use_xpu)
    _run_dygraph_single(use_cuda, use_xpu)
    _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
    _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
    print(f"PaddlePaddle works well on 1 {device_str}.")
    try:
        if len(device_list) > 1:
            if use_custom:
                import os
                os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
            _run_parallel(device_list)
            print(
                "PaddlePaddle works well on {} {}s.".format(
......
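With this change, the health check itself stays a one-liner; the device string in the report now comes from the registered custom device type instead of "GPU"/"XPU" (the output shown below is approximate):

import paddle

paddle.utils.run_check()
# Running verify PaddlePaddle program ...
# PaddlePaddle works well on 1 <custom device type>.
# PaddlePaddle works well on N <custom device type>s.  (when more than one device is visible)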