Commit e1fba6ec
Authored Aug 31, 2020 by Megvii Engine Team
test(mge/distributed): add get_device_count_by_fork to fix distributed test skip
GitOrigin-RevId: 9ffd8a614932c5acfff1243ceed774d3e25fbce1
Parent: 60076f47
Changes: 5 changed files with 56 additions and 20 deletions (+56 −20)
imperative/python/megengine/distributed/helper.py          +16 −0
imperative/python/test/integration/test_dp_correctness.py  +2 −0
imperative/python/test/unit/functional/test_tensor.py      +24 −3
imperative/python/test/unit/test_distributed.py            +5 −8
imperative/python/test/unit/test_module.py                 +9 −9
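Note on the approach (an interpretation of the change, not stated in the commit itself): pytest evaluates `skipif` conditions at collection time in the main process, and calling `get_device_count("gpu")` there may initialize the CUDA driver; a process holding a CUDA context cannot safely fork the worker processes these distributed tests spawn. Probing the device count in a short-lived forked child keeps the parent process CUDA-free, so GPU requirements can be checked up front and reported as proper skips.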
imperative/python/megengine/distributed/helper.py

```diff
@@ -7,8 +7,11 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import functools
+import multiprocessing as mp
 from typing import Callable
 
+from megengine.device import get_device_count
+
 from .group import group_barrier, is_distributed
@@ -26,3 +29,16 @@ def synchronized(func: Callable):
         return ret
 
     return wrapper
+
+
+def get_device_count_by_fork(device_type: str):
+    q = mp.Queue()
+
+    def worker(queue):
+        num = get_device_count(device_type)
+        queue.put(num)
+
+    p = mp.Process(target=worker, args=(q,))
+    p.start()
+    p.join()
+    return q.get()
```
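The new helper is small enough to restate as a standalone sketch. Everything below is illustrative: `_probe` is a hypothetical stand-in for `megengine.device.get_device_count`, the function name is mine, and the pattern assumes the fork start method (the Linux default), matching the helper's name.

```python
import multiprocessing as mp


def _probe(queue):
    # Hypothetical stand-in for get_device_count("gpu"): imagine this
    # call initializes the CUDA driver in whichever process runs it.
    queue.put(0)


def device_count_in_child():
    # Run the probe in a throwaway child process; any driver state it
    # creates dies with the child, and the parent stays safe to fork.
    q = mp.Queue()
    p = mp.Process(target=_probe, args=(q,))
    p.start()
    p.join()
    return q.get()


if __name__ == "__main__":
    print(device_count_in_child())  # -> 0 in this sketch
```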
imperative/python/test/integration/test_dp_correctness.py

```diff
@@ -21,6 +21,7 @@ import megengine as mge
 import megengine.distributed as dist
 import megengine.functional as F
 from megengine.device import get_default_device, set_default_device
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.functional.debug_param import set_conv_execution_strategy
 from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module
 from megengine.optimizer import SGD
@@ -196,6 +197,7 @@ def run_test(
     assert p.exitcode == 0
 
 
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
 )
```
imperative/python/test/unit/functional/test_tensor.py

```diff
@@ -6,6 +6,8 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
+
 import numpy as np
 import pytest
@@ -13,6 +15,7 @@ import megengine.functional as F
 from megengine import Buffer, Parameter, is_cuda_available, tensor
 from megengine.core._trace_option import use_tensor_shape
 from megengine.core.tensor.utils import astensor1d
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.test import assertTensorClose
@@ -323,17 +326,35 @@ def copy_test(dst, src):
     assert np.allclose(data, y.numpy())
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_h2d():
     copy_test("cpu0", "gpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_d2h():
     copy_test("gpu0", "cpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 def test_copy_d2d():
     copy_test("gpu0", "gpu1")
     copy_test("gpu0:0", "gpu0:1")
```
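Worth noting about the decorators above: a `skipif` condition is an ordinary expression, so it runs when the test module is imported, in the main pytest process; that is exactly why the condition must not touch the CUDA driver directly. A small self-contained sketch of that evaluation order, using a hypothetical `probe`:

```python
import pytest

CALLS = []


def probe():
    # Record the call so we can observe when the condition runs.
    CALLS.append("probe")
    return 0


@pytest.mark.skipif(probe() == 0, reason="no device in this sketch")
def test_needs_device():
    pass


# The probe has already run by import time, before any test executes.
assert CALLS == ["probe"]
```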
imperative/python/test/unit/test_distributed.py

```diff
@@ -14,6 +14,7 @@ import pytest
 import megengine as mge
 import megengine.distributed as dist
+from megengine.distributed.helper import get_device_count_by_fork
 
 
 def _assert_q_empty(q):
@@ -36,6 +37,7 @@ def _assert_q_val(q, val):
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_init_process_group():
     world_size = 2
@@ -43,8 +45,6 @@ def test_init_process_group():
     server = dist.Server(port)
 
     def worker(rank, backend):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank, backend)
         assert dist.is_distributed() == True
         assert dist.get_rank() == rank
@@ -82,6 +82,7 @@ def test_init_process_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_new_group():
     world_size = 3
@@ -90,8 +91,6 @@ def test_new_group():
     server = dist.Server(port)
 
     def worker(rank):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         if rank in ranks:
             group = dist.new_group(ranks)
@@ -117,6 +116,7 @@ def test_new_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_group_barrier():
     world_size = 2
@@ -124,8 +124,6 @@ def test_group_barrier():
     server = dist.Server(port)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
@@ -154,6 +152,7 @@ def test_group_barrier():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_synchronized():
     world_size = 2
@@ -165,8 +164,6 @@ def test_synchronized():
         q.put(rank)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
```
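The deletions in this file all follow one pattern: an in-worker guard (`if mge.get_device_count("gpu") < world_size: return`) becomes a collection-time `skipif`. The difference matters for reporting: the old guard made the test pass vacuously on machines without enough GPUs, while the decorator makes pytest show a skip. A hedged before/after sketch with a hypothetical `gpu_count`:

```python
import pytest


def gpu_count():
    # Hypothetical stand-in for get_device_count_by_fork("gpu").
    return 1


# Before: the guard lives inside the test body, so a 1-GPU machine
# reports PASSED without exercising any distributed code.
def test_barrier_vacuous():
    if gpu_count() < 2:
        return  # silently "passes"


# After: the requirement is checked once, up front, and the test is
# reported as SKIPPED -- the "distributed test skip" the title fixes.
@pytest.mark.skipif(gpu_count() < 2, reason="need more gpu device")
def test_barrier_skipped():
    pass
```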
imperative/python/test/unit/test_module.py

```diff
@@ -10,6 +10,14 @@ import platform
 import pytest
 
+import megengine as mge
+import megengine.distributed as dist
+from megengine import tensor
+from megengine.distributed.group import Group
+from megengine.distributed.helper import get_device_count_by_fork
+from megengine.module import SyncBatchNorm
+from megengine.test import assertTensorClose
+
 
 @pytest.mark.skipif(
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
@@ -17,6 +25,7 @@ import pytest
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_syncbn():
     import numpy as np
@@ -39,15 +48,6 @@ def test_syncbn():
     port = server.py_server_port
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        import megengine as mge
-        import megengine.distributed as dist
-        from megengine import tensor
-        from megengine.module import SyncBatchNorm
-        from megengine.distributed.group import Group
-        from megengine.test import assertTensorClose
-
-        if mge.get_device_count("gpu") < nr_ranks:
-            return
         dist.init_process_group("localhost", port, nr_ranks, rank, rank)
         group = Group([i for i in range(nr_ranks)])
         bn = SyncBatchNorm(nr_chan, eps=eps, momentum=momentum, group=group)
```