Commit e1fba6ec authored by Megvii Engine Team

test(mge/distributed): add get_device_count_by_fork to fix distributed test skip

GitOrigin-RevId: 9ffd8a614932c5acfff1243ceed774d3e25fbce1
Parent commit: 60076f47
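Summary of the change: the distributed tests used to check mge.get_device_count("gpu") inside each worker process and silently return when too few GPUs were present, so such tests "passed" without actually running. This commit adds a get_device_count_by_fork helper that queries the device count in a forked child process, presumably so the pytest process itself never initializes the GPU driver before the test workers fork, and uses it in @pytest.mark.skipif decorators so multi-GPU tests are reported as skipped instead. Below is a minimal sketch of the pattern, assuming megengine is importable in the test environment; the test name test_needs_two_gpus is made up for illustration, while the helper and the decorator usage mirror the code added in the diff.

import multiprocessing as mp

import pytest

from megengine.device import get_device_count


def get_device_count_by_fork(device_type: str) -> int:
    """Count devices of ``device_type`` in a short-lived forked child process
    (likely so the parent process never touches the GPU driver before the
    distributed workers fork)."""

    def worker(queue):
        queue.put(get_device_count(device_type))

    q = mp.Queue()
    p = mp.Process(target=worker, args=(q,))
    p.start()
    p.join()
    return q.get()


# Skip at collection time instead of returning early inside each worker process;
# the test name below is hypothetical.
@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
def test_needs_two_gpus():
    ...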
@@ -7,8 +7,11 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import functools
+import multiprocessing as mp
 from typing import Callable
 
+from megengine.device import get_device_count
+
 from .group import group_barrier, is_distributed
@@ -26,3 +29,16 @@ def synchronized(func: Callable):
         return ret
 
     return wrapper
+
+
+def get_device_count_by_fork(device_type: str):
+    q = mp.Queue()
+
+    def worker(queue):
+        num = get_device_count(device_type)
+        queue.put(num)
+
+    p = mp.Process(target=worker, args=(q,))
+    p.start()
+    p.join()
+    return q.get()
@@ -21,6 +21,7 @@ import megengine as mge
 import megengine.distributed as dist
 import megengine.functional as F
 from megengine.device import get_default_device, set_default_device
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.functional.debug_param import set_conv_execution_strategy
 from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module
 from megengine.optimizer import SGD
@@ -196,6 +197,7 @@ def run_test(
     assert p.exitcode == 0
 
 
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="windows disable MGB_ENABLE_OPR_MM"
...
@@ -6,6 +6,8 @@
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import platform
+
 import numpy as np
 import pytest
@@ -13,6 +15,7 @@ import megengine.functional as F
 from megengine import Buffer, Parameter, is_cuda_available, tensor
 from megengine.core._trace_option import use_tensor_shape
 from megengine.core.tensor.utils import astensor1d
+from megengine.distributed.helper import get_device_count_by_fork
 from megengine.test import assertTensorClose
@@ -323,17 +326,35 @@ def copy_test(dst, src):
     assert np.allclose(data, y.numpy())
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_h2d():
     copy_test("cpu0", "gpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") == 0, reason="CUDA is disabled")
 def test_copy_d2h():
     copy_test("gpu0", "cpu0")
 
 
-@pytest.mark.skipif(not is_cuda_available(), reason="CUDA is disabled")
+@pytest.mark.skipif(
+    platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
+)
+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
+)
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 def test_copy_d2d():
     copy_test("gpu0", "gpu1")
     copy_test("gpu0:0", "gpu0:1")
...
@@ -14,6 +14,7 @@ import pytest
 import megengine as mge
 import megengine.distributed as dist
+from megengine.distributed.helper import get_device_count_by_fork
 
 
 def _assert_q_empty(q):
@@ -36,6 +37,7 @@ def _assert_q_val(q, val):
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_init_process_group():
     world_size = 2
@@ -43,8 +45,6 @@ def test_init_process_group():
     server = dist.Server(port)
 
     def worker(rank, backend):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank, backend)
         assert dist.is_distributed() == True
         assert dist.get_rank() == rank
@@ -82,6 +82,7 @@ def test_init_process_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_new_group():
     world_size = 3
@@ -90,8 +91,6 @@ def test_new_group():
     server = dist.Server(port)
 
     def worker(rank):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         if rank in ranks:
             group = dist.new_group(ranks)
@@ -117,6 +116,7 @@ def test_new_group():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_group_barrier():
     world_size = 2
@@ -124,8 +124,6 @@ def test_group_barrier():
     server = dist.Server(port)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
@@ -154,6 +152,7 @@ def test_group_barrier():
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 2, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_synchronized():
     world_size = 2
@@ -165,8 +164,6 @@ def test_synchronized():
         q.put(rank)
 
     def worker(rank, q):
-        if mge.get_device_count("gpu") < world_size:
-            return
         dist.init_process_group("localhost", port, world_size, rank, rank)
         dist.group_barrier()
         if rank == 0:
...
@@ -10,6 +10,14 @@ import platform
 import pytest
 
+import megengine as mge
+import megengine.distributed as dist
+from megengine import tensor
+from megengine.distributed.group import Group
+from megengine.distributed.helper import get_device_count_by_fork
+from megengine.module import SyncBatchNorm
+from megengine.test import assertTensorClose
+
 
 @pytest.mark.skipif(
     platform.system() == "Darwin", reason="do not imp GPU mode at macos now"
@@ -17,6 +25,7 @@ import pytest
 @pytest.mark.skipif(
     platform.system() == "Windows", reason="do not imp GPU mode at Windows now"
 )
+@pytest.mark.skipif(get_device_count_by_fork("gpu") < 4, reason="need more gpu device")
 @pytest.mark.isolated_distributed
 def test_syncbn():
     import numpy as np
@@ -39,15 +48,6 @@ def test_syncbn():
     port = server.py_server_port
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        import megengine as mge
-        import megengine.distributed as dist
-        from megengine import tensor
-        from megengine.module import SyncBatchNorm
-        from megengine.distributed.group import Group
-        from megengine.test import assertTensorClose
-
-        if mge.get_device_count("gpu") < nr_ranks:
-            return
         dist.init_process_group("localhost", port, nr_ranks, rank, rank)
         group = Group([i for i in range(nr_ranks)])
         bn = SyncBatchNorm(nr_chan, eps=eps, momentum=momentum, group=group)
...