Commit e1fba6ec
Authored Aug 31, 2020 by Megvii Engine Team
test(mge/distributed): add get_device_count_by_fork to fix distributed test skip
GitOrigin-RevId: 9ffd8a614932c5acfff1243ceed774d3e25fbce1
Parent: 60076f47
Showing 5 changed files with 56 additions and 20 deletions (+56, -20)
imperative/python/megengine/distributed/helper.py           +16 -0
imperative/python/test/integration/test_dp_correctness.py   +2  -0
imperative/python/test/unit/functional/test_tensor.py       +24 -3
imperative/python/test/unit/test_distributed.py             +5  -8
imperative/python/test/unit/test_module.py                  +9  -9
imperative/python/megengine/distributed/helper.py
@@ -7,8 +7,11 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import functools
+import multiprocessing as mp
 from typing import Callable
 
+from megengine.device import get_device_count
+
 from .group import group_barrier, is_distributed
@@ -26,3 +29,16 @@ def synchronized(func: Callable):
         return ret
 
     return wrapper
+
+
+def get_device_count_by_fork(device_type: str):
+    q = mp.Queue()
+
+    def worker(queue):
+        num = get_device_count(device_type)
+        queue.put(num)
+
+    p = mp.Process(target=worker, args=(q,))
+    p.start()
+    p.join()
+    return q.get()
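The new helper computes the device count in a short-lived child process and ships the result back through a queue, so the parent never queries the driver itself. My reading (not stated in the commit message) is that calling get_device_count("gpu") directly in the main pytest process would initialize CUDA there, which is unsafe once that process later forks distributed workers. A minimal usage sketch; the __main__ guard and printout are illustrative additions, not from the diff:

    from megengine.distributed.helper import get_device_count_by_fork

    if __name__ == "__main__":
        # Counted in a forked child; the parent's CUDA state stays untouched,
        # so it can still fork distributed worker processes safely afterwards.
        print("visible gpus:", get_device_count_by_fork("gpu"))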
imperative/python/test/integration/test_dp_correctness.py
@@ -21,6 +21,7 @@ import megengine as mge
 import megengine.distributed as dist
 import megengine.functional as F
 from megengine.device import get_default_device, set_default_device
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.functional.debug_param import set_conv_execution_strategy
 from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module
 from megengine.optimizer import SGD
@@ -196,6 +197,7 @@ def run_test(
     assert p.exitcode == 0
 
 
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
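A skipif condition is evaluated once, when pytest collects the module, in the main pytest process; that is exactly where a bare get_device_count call would be harmful, hence the fork-based variant. A hedged sketch of the marker pattern used throughout this commit (test name and body are hypothetical):

    import pytest

    from megengine.distributed.helper import get_device_count_by_fork

    # Evaluated at collection time, before any worker process is forked.
    @pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
    def test_requires_four_gpus():
        ...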
imperative/python/test/unit/functional/test_tensor.py
@@ -6,6 +6,8 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
+
 import numpy as np
 import pytest
@@ -13,6 +15,7 @@ import megengine.functional as F
 from megengine import Buffer, Parameter, is_cuda_available, tensor
 from megengine.core._trace_option import use_tensor_shape
 from megengine.core.tensor.utils import astensor1d
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.test import assertTensorClose
@@ -323,17 +326,35 @@ def copy_test(dst, src):
     assert np.allclose(data, y.numpy())
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_h2d():
     copy_test("cpu0", "gpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_d2h():
     copy_test("gpu0", "cpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 def test_copy_d2d():
     copy_test("gpu0", "gpu1")
     copy_test("gpu0:0", "gpu0:1")
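The body of copy_test is outside this hunk; judging from the call sites and the context line above, it moves data between the two device strings and compares the result against the source array. A hypothetical reconstruction, assuming F.copy does the device transfer and keeping the (dst, src) parameter order from the hunk header; the data shape and exact calls are my assumptions:

    import numpy as np

    import megengine.functional as F
    from megengine import tensor

    def copy_test(dst, src):
        # Hypothetical sketch: build data on `src`, copy it to `dst`,
        # and check the values survive the device round trip.
        data = np.random.random((2, 3)).astype(np.float32)
        x = tensor(data, device=src)
        y = F.copy(x, dst)
        assert np.allclose(data, y.numpy())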
imperative/python/test/unit/test_distributed.py
@@ -14,6 +14,7 @@ import pytest
 import megengine as mge
 import megengine.distributed as dist
+from megengine.distributed.helper import get_device_count_by_fork
 
 
 def _assert_q_empty(q):
@@ -36,6 +37,7 @@ def _assert_q_val(q, val):
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_init_process_group():
     world_size = 2
@@ -43,8 +45,6 @@ def test_init_process_group():
     server = dist.Server(port)
 
     def worker(rank, backend):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank, backend)
         assert dist.is_distributed() == True
         assert dist.get_rank() == rank
@@ -82,6 +82,7 @@ def test_init_process_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_new_group():
     world_size = 3
@@ -90,8 +91,6 @@ def test_new_group():
     server = dist.Server(port)
 
     def worker(rank):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         if rank in ranks:
             group = dist.new_group(ranks)
@@ -117,6 +116,7 @@ def test_new_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_group_barrier():
     world_size = 2
@@ -124,8 +124,6 @@ def test_group_barrier():
     server = dist.Server(port)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
@@ -154,6 +152,7 @@ def test_group_barrier():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_synchronized():
     world_size = 2
@@ -165,8 +164,6 @@ def test_synchronized():
         q.put(rank)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
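Across this file the pattern is the same: the in-worker guard `if mge.get_device_count("gpu") < world_size: return` is deleted, since with it a worker on an under-provisioned machine returned immediately and the test passed vacuously instead of being reported as skipped; the new module-level skipif performs the check up front. The launch side is not part of these hunks; a sketch of the per-rank process pattern such tests typically follow (run_workers is a hypothetical helper, not from the diff):

    import multiprocessing as mp

    def run_workers(worker, world_size):
        # One process per rank; a non-zero exit code fails the test.
        procs = [mp.Process(target=worker, args=(rank,)) for rank in range(world_size)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
            assert p.exitcode == 0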
imperative/python/test/unit/test_module.py
@@ -10,6 +10,14 @@ import platform
 import pytest
 
+import megengine as mge
+import megengine.distributed as dist
+from megengine import tensor
+from megengine.distributed.group import Group
+from megengine.distributed.helper import get_device_count_by_fork
+from megengine.module import SyncBatchNorm
+from megengine.test import assertTensorClose
+
 
 @pytest.mark.skipif(
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
@@ -17,6 +25,7 @@ import pytest
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_syncbn():
     import numpy as np
@@ -39,15 +48,6 @@ def test_syncbn():
     port = server.py_server_port
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        import megengine as mge
-        import megengine.distributed as dist
-        from megengine import tensor
-        from megengine.module import SyncBatchNorm
-        from megengine.distributed.group import Group
-        from megengine.test import assertTensorClose
-
-        if mge.get_device_count("gpu") < nr_ranks:
-            return
         dist.init_process_group("localhost", port, nr_ranks, rank, rank)
         group = Group([i for i in range(nr_ranks)])
         bn = SyncBatchNorm(nr_chan, eps=eps, momentum=momentum, group=group)