提交 dee5a10a 编写于 作者: M Megvii Engine Team

feat(distributed): auto detect device and backend when init group

GitOrigin-RevId: 90be2d5b4d97f1379b70ffdcbac61269c1d44848
上级 1bec737d
......@@ -12,6 +12,7 @@ from typing import Optional
from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
__all__ = [
"is_cuda_available",
......@@ -25,7 +26,7 @@ __all__ = [
def _valid_device(inp):
if isinstance(inp, str) and re.match("^[cxg]pu(\d+|\d+:\d+|x)$", inp):
if isinstance(inp, str) and re.match("^([cxg]pu|rocm)(\d+|\d+:\d+|x)$", inp):
return True
return False
......@@ -40,21 +41,24 @@ def _str2device_type(type_str: str, allow_unspec: bool = True):
return DeviceType.CAMBRICON
elif type_str == "ATLAS":
return DeviceType.ATLAS
elif type_str == "ROCM" or type_str == "AMDGPU":
return DeviceType.ROCM
else:
assert allow_unspec and str == "XPU", "device type can only be cpu, gpu or xpu"
return DeviceType.UNSPEC
# Device type names accepted by ``get_device_count``.
_device_type_set = {"cpu", "gpu", "xpu", "rocm"}


def get_device_count(device_type: str) -> int:
    """
    Gets number of devices installed on this system.

    :param device_type: device type, one of 'cpu', 'gpu', 'xpu' or 'rocm'
    :return: the count reported by ``CompNode._get_device_count`` for that type
    """
    # Validate against the module-level set so every accepted device type
    # is listed in one place.
    assert device_type in _device_type_set, "device must be one of {}".format(
        _device_type_set
    )
    device_type = _str2device_type(device_type)
    return CompNode._get_device_count(device_type, False)
......@@ -87,6 +91,14 @@ def is_atlas_available() -> bool:
return CompNode._get_device_count(t, False) > 0
def is_rocm_available() -> bool:
    """Returns whether at least one rocm device is available on this system."""
    rocm_type = _str2device_type("rocm")
    device_count = CompNode._get_device_count(rocm_type, False)
    return device_count > 0
def set_default_device(device: str = "xpux"):
r"""
Sets default computing node.
......@@ -151,3 +163,7 @@ def set_prealloc_config(
assert max_overhead >= 0
assert growth_factor >= 1
_set_prealloc_config(alignment, min_req, max_overhead, growth_factor, device_type)
def what_is_xpu():
    """Return the lower-cased ``name`` of the value given by ``_what_is_xpu()``."""
    xpu_type = _what_is_xpu()
    return xpu_type.name.lower()
......@@ -8,7 +8,7 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import List, Optional, Tuple
from ..device import set_default_device
from ..device import set_default_device, what_is_xpu
from .server import Client, Server
......@@ -23,6 +23,7 @@ class StaticData:
device = None
backend = None
next_stream = None
device_type = None
_sd = None
......@@ -78,19 +79,29 @@ class Group:
@property
def comp_node(self):
    """Computing node string of this group: ``"{device_type}{device}:{stream}"``.

    The device type and device index are read from the module-level ``_sd``
    state; the stream index comes from ``self.stream``.

    :raises AssertionError: if the group has no process ranks.
    """
    assert len(self.proc_ranks) > 0, "invalid group"
    # Use the recorded device type rather than a hard-coded "gpu" prefix so
    # that non-gpu device types produce the right node name.
    return "{}{}:{}".format(_sd.device_type, _sd.device, self.stream)
# Default group; created with an empty rank list here and later populated by
# ``WORLD.reset(...)`` inside ``init_process_group``.
WORLD = Group([])
# Default communicator backend for each physical device type; looked up by
# ``init_process_group`` when no backend is passed explicitly.
_device2backend = {
"gpu": "nccl",
"cuda": "nccl",
"rocm": "rccl",
}
# Backend names accepted by ``init_process_group``.
_backends = {"nccl", "rccl", "ucx"}
def init_process_group(
master_ip: str,
port: int,
world_size: int,
rank: int,
device: int,
backend: Optional[str] = "nccl",
backend: Optional[str] = None,
device_type: str = "xpu",
) -> None:
"""
Initialize the distributed process group and specify the device used in the current process
......@@ -102,6 +113,8 @@ def init_process_group(
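:param device: the device id to bind this process to.
:param backend: communicator backend, one of 'nccl', 'rccl' or 'ucx'; when ``None``, it is chosen from the device type.
:param device_type: device type, e.g. 'gpu' or 'rocm'; the default 'xpu' auto-detects the physical device type.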
"""
physical_device_type = what_is_xpu() if device_type == "xpu" else device_type
backend = _device2backend[physical_device_type] if backend is None else backend
if not isinstance(master_ip, str):
raise TypeError("Expect type str but got {}".format(type(master_ip)))
if not isinstance(port, int):
......@@ -112,8 +125,14 @@ def init_process_group(
raise TypeError("Expect type int but got {}".format(type(rank)))
if not isinstance(device, int):
raise TypeError("Expect type int but got {}".format(type(backend)))
if not isinstance(backend, str):
raise TypeError("Expect type str but got {}".format(type(backend)))
if backend not in _backends:
raise ValueError(
"backend should be one of {} but got {}".format(_backends, backend)
)
if physical_device_type not in _device2backend:
raise ValueError(
"{} is not a valid distributed device type".format(device_type)
)
global _sd
assert _sd is None, "init_process_group should be called only once"
......@@ -132,10 +151,11 @@ def init_process_group(
_sd.device = device
_sd.backend = backend
_sd.next_stream = 1
_sd.device_type = device_type
WORLD.reset(list(range(world_size)))
set_default_device("gpu{}".format(device))
set_default_device("{}{}".format(device_type, device))
def is_distributed() -> bool:
......@@ -182,7 +202,7 @@ def new_group(proc_ranks: List[int]) -> Group:
return Group(proc_ranks)
def group_barrier(group: Optional[Group] = WORLD) -> None:
def group_barrier(group: Group = WORLD) -> None:
"""Block until all ranks in the group reach this barrier."""
# if running with single node, skip it
if _sd is None:
......
......@@ -29,13 +29,19 @@ def _run_wrapped(
world_size,
rank,
dev,
device_type,
args,
kwargs,
queue: mp.Queue,
):
"""Init distributed process group and run wrapped function."""
init_process_group(
master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev
master_ip=master_ip,
port=port,
world_size=world_size,
rank=rank,
device=dev,
device_type=device_type,
)
if is_multimachine:
group_barrier()
......@@ -70,13 +76,17 @@ class launcher:
rank_start=0,
master_ip="localhost",
port=0,
device_type="xpu",
):
self.func = func
self.n_gpus = n_gpus if n_gpus is not None else get_device_count_by_fork("gpu")
self.n_gpus = (
n_gpus if n_gpus is not None else get_device_count_by_fork(device_type)
)
self.world_size = world_size if world_size is not None else self.n_gpus
self.rank_start = rank_start
self.master_ip = master_ip
self.port = port
self.device_type = device_type
# master node create server
if self.rank_start == 0:
self.server = Server(self.port)
......@@ -99,6 +109,7 @@ class launcher:
self.world_size,
dev + self.rank_start,
dev,
self.device_type,
args,
kwargs,
queue,
......
......@@ -62,8 +62,8 @@ void init_common(py::module m) {
return cn.get_mem_status_bytes();
})
.def("create_event", &CompNode::create_event, py::arg("flags") = 0ul)
.def("_set_default_device", &set_default_device)
.def("_get_default_device", &get_default_device)
.def_static("_set_default_device", &set_default_device)
.def_static("_get_default_device", &get_default_device)
.def("__str__", &CompNode::to_string_logical)
.def("__repr__", [](const CompNode& cn) {
return py::str("\"" + cn.to_string() + "\" from \"" + cn.to_string_logical() + "\"");
......@@ -179,6 +179,10 @@ void init_common(py::module m) {
m.def("set_prealloc_config", &CompNode::set_prealloc_config,
"specifies how to pre-allocate from raw dev allocator");
m.def("what_is_xpu", []{
return CompNode::Locator::parse("xpux").to_physical().type;
});
init_npy_num_bfloat16(m);
init_npy_num_intbx(m);
init_dtypes(m);
......
......@@ -16,6 +16,7 @@ import pytest
import megengine as mge
import megengine.distributed as dist
from megengine.core.ops.builtin import CollectiveComm, ParamPackConcat, ParamPackSplit
from megengine.device import get_default_device
from megengine.distributed.helper import (
get_device_count_by_fork,
param_pack_concat,
......@@ -87,7 +88,8 @@ def test_new_group():
assert group.size == 2
assert group.key == "2,0"
assert group.rank == ranks.index(rank)
assert group.comp_node == "gpu{}:2".format(rank)
dt = get_default_device()[:-1]
assert group.comp_node == "{}{}:2".format(dt, rank)
worker()
......
......@@ -236,12 +236,12 @@ def test_io_remote(shape):
def worker(val, shape):
rank = dist.get_rank()
if rank == 0: # remote send
x = tensor(val, device="gpu0")
x = tensor(val, device="xpu0")
remote_send(x, 1)
sync()
else: # remote recv
y = remote_recv(0, shape, np.float32)
assert y.device == "gpu1"
assert y.device == get_default_device()
np.testing.assert_almost_equal(val, y.numpy())
val = np.random.random_sample(shape).astype("float32")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册