Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
dee5a10a
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
dee5a10a
编写于
4年前
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(distributed): auto detect device and backend when init group
GitOrigin-RevId: 90be2d5b4d97f1379b70ffdcbac61269c1d44848
上级
1bec737d
master
HuaHua404-patch-1
HuaHua404-patch-2
HuaHua404-patch-3
HuaHua404-patch-4
add-ci
add-dingding-notification
add-tools
copybara-target
dev-support-lite-fork-debug-mode
docstring-reshape
release-1.10
release-1.11
release-1.11.1
release-1.12.0
release-1.12.1
release-1.12.2
release-1.12.3
release-1.12.4
release-1.12.5
release-1.13.0
release-1.13.1
release-1.5
release-1.6
release-1.7
release-1.8
release-1.9
revert-211-master
revert-410-docstring-zeros
revert-411-add-tools
test-try-import
tmp-test
try-import
v1.13.1
v1.13.0
v1.12.4
v1.12.3
v1.12.2
v1.12.1
v1.12.0
v1.11.1
v1.11.0
v1.10.0
v1.9.1
v1.9.0
v1.8.2
v1.8.1
v1.8.1.m1
v1.8.0
v1.7.2.m1
v1.7.1.m1
v1.7.0
v1.7.0.m1
v1.6.0
v1.6.0-rc1
v1.5.0
无相关合并请求
变更
6
显示空白变更内容
内联
并排
Showing
6 changed file
with
72 addition
and
19 deletion
+72
-19
imperative/python/megengine/device.py
imperative/python/megengine/device.py
+21
-5
imperative/python/megengine/distributed/group.py
imperative/python/megengine/distributed/group.py
+27
-7
imperative/python/megengine/distributed/launcher.py
imperative/python/megengine/distributed/launcher.py
+13
-2
imperative/python/src/common.cpp
imperative/python/src/common.cpp
+6
-2
imperative/python/test/unit/distributed/test_distributed.py
imperative/python/test/unit/distributed/test_distributed.py
+3
-1
imperative/python/test/unit/functional/test_functional_distributed.py
...ython/test/unit/functional/test_functional_distributed.py
+2
-2
未找到文件。
imperative/python/megengine/device.py
浏览文件 @
dee5a10a
...
...
@@ -12,6 +12,7 @@ from typing import Optional
from
.core._imperative_rt.common
import
CompNode
,
DeviceType
from
.core._imperative_rt.common
import
set_prealloc_config
as
_set_prealloc_config
from
.core._imperative_rt.common
import
what_is_xpu
as
_what_is_xpu
__all__
=
[
"is_cuda_available"
,
...
...
@@ -25,7 +26,7 @@ __all__ = [
def
_valid_device
(
inp
):
if
isinstance
(
inp
,
str
)
and
re
.
match
(
"^
[cxg]pu
(\d+|\d+:\d+|x)$"
,
inp
):
if
isinstance
(
inp
,
str
)
and
re
.
match
(
"^
([cxg]pu|rocm)
(\d+|\d+:\d+|x)$"
,
inp
):
return
True
return
False
...
...
@@ -40,21 +41,24 @@ def _str2device_type(type_str: str, allow_unspec: bool = True):
return
DeviceType
.
CAMBRICON
elif
type_str
==
"ATLAS"
:
return
DeviceType
.
ATLAS
elif
type_str
==
"ROCM"
or
type_str
==
"AMDGPU"
:
return
DeviceType
.
ROCM
else
:
assert
allow_unspec
and
str
==
"XPU"
,
"device type can only be cpu, gpu or xpu"
return
DeviceType
.
UNSPEC
_device_type_set
=
{
"cpu"
,
"gpu"
,
"xpu"
,
"rocm"
}
def
get_device_count
(
device_type
:
str
)
->
int
:
"""
Gets number of devices installed on this system.
:param device_type: device type, one of 'gpu' or 'cpu'
"""
device_type_set
=
(
"cpu"
,
"gpu"
)
assert
device_type
in
device_type_set
,
"device must be one of {}"
.
format
(
device_type_set
assert
device_type
in
_device_type_set
,
"device must be one of {}"
.
format
(
_device_type_set
)
device_type
=
_str2device_type
(
device_type
)
return
CompNode
.
_get_device_count
(
device_type
,
False
)
...
...
@@ -87,6 +91,14 @@ def is_atlas_available() -> bool:
return
CompNode
.
_get_device_count
(
t
,
False
)
>
0
def
is_rocm_available
()
->
bool
:
"""Returns whether rocm device is available on this system.
"""
t
=
_str2device_type
(
"rocm"
)
return
CompNode
.
_get_device_count
(
t
,
False
)
>
0
def
set_default_device
(
device
:
str
=
"xpux"
):
r
"""
Sets default computing node.
...
...
@@ -151,3 +163,7 @@ def set_prealloc_config(
assert
max_overhead
>=
0
assert
growth_factor
>=
1
_set_prealloc_config
(
alignment
,
min_req
,
max_overhead
,
growth_factor
,
device_type
)
def
what_is_xpu
():
return
_what_is_xpu
().
name
.
lower
()
This diff is collapsed.
Click to expand it.
imperative/python/megengine/distributed/group.py
浏览文件 @
dee5a10a
...
...
@@ -8,7 +8,7 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from
typing
import
List
,
Optional
,
Tuple
from
..device
import
set_default_device
from
..device
import
set_default_device
,
what_is_xpu
from
.server
import
Client
,
Server
...
...
@@ -23,6 +23,7 @@ class StaticData:
device
=
None
backend
=
None
next_stream
=
None
device_type
=
None
_sd
=
None
...
...
@@ -78,19 +79,29 @@ class Group:
@
property
def
comp_node
(
self
):
assert
len
(
self
.
proc_ranks
)
>
0
,
"invalid group"
return
"
gpu{}:{}"
.
format
(
_sd
.
device
,
self
.
stream
)
return
"
{}{}:{}"
.
format
(
_sd
.
device_type
,
_sd
.
device
,
self
.
stream
)
WORLD
=
Group
([])
_device2backend
=
{
"gpu"
:
"nccl"
,
"cuda"
:
"nccl"
,
"rocm"
:
"rccl"
,
}
_backends
=
{
"nccl"
,
"rccl"
,
"ucx"
}
def
init_process_group
(
master_ip
:
str
,
port
:
int
,
world_size
:
int
,
rank
:
int
,
device
:
int
,
backend
:
Optional
[
str
]
=
"nccl"
,
backend
:
Optional
[
str
]
=
None
,
device_type
:
str
=
"xpu"
,
)
->
None
:
"""
Initialize the distributed process group and specify the device used in the current process
...
...
@@ -102,6 +113,8 @@ def init_process_group(
:param device: the GPU device id to bind this process to.
:param backend: communicator backend, currently support 'nccl' and 'ucx'.
"""
physical_device_type
=
what_is_xpu
()
if
device_type
==
"xpu"
else
device_type
backend
=
_device2backend
[
physical_device_type
]
if
backend
is
None
else
backend
if
not
isinstance
(
master_ip
,
str
):
raise
TypeError
(
"Expect type str but got {}"
.
format
(
type
(
master_ip
)))
if
not
isinstance
(
port
,
int
):
...
...
@@ -112,8 +125,14 @@ def init_process_group(
raise
TypeError
(
"Expect type int but got {}"
.
format
(
type
(
rank
)))
if
not
isinstance
(
device
,
int
):
raise
TypeError
(
"Expect type int but got {}"
.
format
(
type
(
backend
)))
if
not
isinstance
(
backend
,
str
):
raise
TypeError
(
"Expect type str but got {}"
.
format
(
type
(
backend
)))
if
backend
not
in
_backends
:
raise
ValueError
(
"backend should be one of {} but got {}"
.
format
(
_backends
,
backend
)
)
if
physical_device_type
not
in
_device2backend
:
raise
ValueError
(
"{} is not a valid distributed device type"
.
format
(
device_type
)
)
global
_sd
assert
_sd
is
None
,
"init_process_group should be called only once"
...
...
@@ -132,10 +151,11 @@ def init_process_group(
_sd
.
device
=
device
_sd
.
backend
=
backend
_sd
.
next_stream
=
1
_sd
.
device_type
=
device_type
WORLD
.
reset
(
list
(
range
(
world_size
)))
set_default_device
(
"
gpu{}"
.
format
(
device
))
set_default_device
(
"
{}{}"
.
format
(
device_type
,
device
))
def
is_distributed
()
->
bool
:
...
...
@@ -182,7 +202,7 @@ def new_group(proc_ranks: List[int]) -> Group:
return
Group
(
proc_ranks
)
def
group_barrier
(
group
:
Optional
[
Group
]
=
WORLD
)
->
None
:
def
group_barrier
(
group
:
Group
=
WORLD
)
->
None
:
"""Block until all ranks in the group reach this barrier."""
# if running with single node, skip it
if
_sd
is
None
:
...
...
This diff is collapsed.
Click to expand it.
imperative/python/megengine/distributed/launcher.py
浏览文件 @
dee5a10a
...
...
@@ -29,13 +29,19 @@ def _run_wrapped(
world_size
,
rank
,
dev
,
device_type
,
args
,
kwargs
,
queue
:
mp
.
Queue
,
):
"""Init distributed process group and run wrapped function."""
init_process_group
(
master_ip
=
master_ip
,
port
=
port
,
world_size
=
world_size
,
rank
=
rank
,
device
=
dev
master_ip
=
master_ip
,
port
=
port
,
world_size
=
world_size
,
rank
=
rank
,
device
=
dev
,
device_type
=
device_type
,
)
if
is_multimachine
:
group_barrier
()
...
...
@@ -70,13 +76,17 @@ class launcher:
rank_start
=
0
,
master_ip
=
"localhost"
,
port
=
0
,
device_type
=
"xpu"
,
):
self
.
func
=
func
self
.
n_gpus
=
n_gpus
if
n_gpus
is
not
None
else
get_device_count_by_fork
(
"gpu"
)
self
.
n_gpus
=
(
n_gpus
if
n_gpus
is
not
None
else
get_device_count_by_fork
(
device_type
)
)
self
.
world_size
=
world_size
if
world_size
is
not
None
else
self
.
n_gpus
self
.
rank_start
=
rank_start
self
.
master_ip
=
master_ip
self
.
port
=
port
self
.
device_type
=
device_type
# master node create server
if
self
.
rank_start
==
0
:
self
.
server
=
Server
(
self
.
port
)
...
...
@@ -99,6 +109,7 @@ class launcher:
self
.
world_size
,
dev
+
self
.
rank_start
,
dev
,
self
.
device_type
,
args
,
kwargs
,
queue
,
...
...
This diff is collapsed.
Click to expand it.
imperative/python/src/common.cpp
浏览文件 @
dee5a10a
...
...
@@ -62,8 +62,8 @@ void init_common(py::module m) {
return
cn
.
get_mem_status_bytes
();
})
.
def
(
"create_event"
,
&
CompNode
::
create_event
,
py
::
arg
(
"flags"
)
=
0ul
)
.
def
(
"_set_default_device"
,
&
set_default_device
)
.
def
(
"_get_default_device"
,
&
get_default_device
)
.
def
_static
(
"_set_default_device"
,
&
set_default_device
)
.
def
_static
(
"_get_default_device"
,
&
get_default_device
)
.
def
(
"__str__"
,
&
CompNode
::
to_string_logical
)
.
def
(
"__repr__"
,
[](
const
CompNode
&
cn
)
{
return
py
::
str
(
"
\"
"
+
cn
.
to_string
()
+
"
\"
from
\"
"
+
cn
.
to_string_logical
()
+
"
\"
"
);
...
...
@@ -179,6 +179,10 @@ void init_common(py::module m) {
m
.
def
(
"set_prealloc_config"
,
&
CompNode
::
set_prealloc_config
,
"specifies how to pre-allocate from raw dev allocator"
);
m
.
def
(
"what_is_xpu"
,
[]{
return
CompNode
::
Locator
::
parse
(
"xpux"
).
to_physical
().
type
;
});
init_npy_num_bfloat16
(
m
);
init_npy_num_intbx
(
m
);
init_dtypes
(
m
);
...
...
This diff is collapsed.
Click to expand it.
imperative/python/test/unit/distributed/test_distributed.py
浏览文件 @
dee5a10a
...
...
@@ -16,6 +16,7 @@ import pytest
import
megengine
as
mge
import
megengine.distributed
as
dist
from
megengine.core.ops.builtin
import
CollectiveComm
,
ParamPackConcat
,
ParamPackSplit
from
megengine.device
import
get_default_device
from
megengine.distributed.helper
import
(
get_device_count_by_fork
,
param_pack_concat
,
...
...
@@ -87,7 +88,8 @@ def test_new_group():
assert
group
.
size
==
2
assert
group
.
key
==
"2,0"
assert
group
.
rank
==
ranks
.
index
(
rank
)
assert
group
.
comp_node
==
"gpu{}:2"
.
format
(
rank
)
dt
=
get_default_device
()[:
-
1
]
assert
group
.
comp_node
==
"{}{}:2"
.
format
(
dt
,
rank
)
worker
()
...
...
This diff is collapsed.
Click to expand it.
imperative/python/test/unit/functional/test_functional_distributed.py
浏览文件 @
dee5a10a
...
...
@@ -236,12 +236,12 @@ def test_io_remote(shape):
def
worker
(
val
,
shape
):
rank
=
dist
.
get_rank
()
if
rank
==
0
:
# remote send
x
=
tensor
(
val
,
device
=
"
g
pu0"
)
x
=
tensor
(
val
,
device
=
"
x
pu0"
)
remote_send
(
x
,
1
)
sync
()
else
:
# remote recv
y
=
remote_recv
(
0
,
shape
,
np
.
float32
)
assert
y
.
device
==
"gpu1"
assert
y
.
device
==
get_default_device
()
np
.
testing
.
assert_almost_equal
(
val
,
y
.
numpy
())
val
=
np
.
random
.
random_sample
(
shape
).
astype
(
"float32"
)
...
...
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
反馈
建议
客服
返回
顶部