# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import multiprocessing as mp
import queue

from ..core._imperative_rt.core2 import sync
from ..logger import get_logger
from .group import group_barrier, init_process_group
from .helper import get_device_count_by_fork
from .server import Client, Server

WARN_SUBPROCESS_EXIT_WITHOUT_RETURN = (
    "subprocess exited with code 0 but did not return a value"
)


def _run_wrapped(
    func,
    is_multimachine,
    master_ip,
    port,
    world_size,
    rank,
    dev,
    args,
    kwargs,
    queue: mp.Queue,
):
    """Init distributed process group and run wrapped function."""
    init_process_group(
        master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev
    )
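    # in multi-machine mode, wait until every process has joined the group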
    if is_multimachine:
        group_barrier()
    ret = func(*args, **kwargs)
    queue.put((dev, ret))
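    # wait for all asynchronous computation issued by func to finish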
    sync()
    if is_multimachine:
        group_barrier()


class launcher:
    """Decorator for launching multiple processes in single-machine multi-gpu training.

    :param func: the function you want to launch in distributed mode.
    :param n_gpus: how many devices each node has.
    :param world_size: how many devices there are in total.
    :param rank_start: the starting rank number for this node.
    :param master_ip: IP address of the master node (where rank 0 runs).
    :param port: server port for the distributed server.
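
    Example: a minimal usage sketch (the worker function and its arguments
    below are illustrative, not part of this module):

    .. code-block:: python

        @launcher(n_gpus=2)
        def worker(data):
            ...  # executed once per device, each process with its own rank

        results = worker(some_data)  # per-device return values, indexed by dev
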
    """

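    # Allow both `@launcher` and `@launcher(n_gpus=..., ...)`: when called
    # with keyword arguments only, return a partial that waits for the
    # decorated function.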
    def __new__(cls, *args, **kwargs):
        if not args:
            return functools.partial(cls, **kwargs)
        return super().__new__(cls)

    def __init__(
        self,
        func,
        n_gpus=None,
        world_size=None,
        rank_start=0,
        master_ip="localhost",
        port=0,
    ):
        self.func = func
        self.n_gpus = n_gpus if n_gpus is not None else get_device_count_by_fork("gpu")
        self.world_size = world_size if world_size is not None else self.n_gpus
        self.rank_start = rank_start
        self.master_ip = master_ip
        self.port = port
        # the master node creates the distributed server
        if self.rank_start == 0:
            self.server = Server(self.port)
            self.port = self.server.py_server_port
        else:
            assert self.port != 0, "you have to assign a port for the distributed server"

    def __call__(self, *args, **kwargs):
        procs = []
        queue = mp.Queue(self.n_gpus)
        results = [None] * self.n_gpus
        for dev in range(self.n_gpus):
            p = mp.Process(
                target=_run_wrapped,
                args=(
                    self.func,
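                    # is_multimachine: the world spans more devices than this node has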
                    self.world_size > self.n_gpus,
                    self.master_ip,
                    self.port,
                    self.world_size,
                    dev + self.rank_start,
                    dev,
                    args,
                    kwargs,
                    queue,
                ),
            )
            p.start()
            procs.append(p)

        devs = list(range(self.n_gpus))

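        # helper: kill every worker that is still alive once one has failed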
        def terminate():
            for dev in devs:
                procs[dev].terminate()
            devs.clear()

        result_count = 0
        while len(devs) > 0:
            left = []
            # poll every remaining process, spending about one second in total
            time_to_wait = 1.0 / len(devs)
            for dev in devs:
                procs[dev].join(time_to_wait)
                code = procs[dev].exitcode
                # terminate processes if one of them has failed
                if code is not None and code != 0:
                    terminate()
                assert (
                    code == 0 or code is None
                ), "subprocess {} exited with code {}".format(dev + self.rank_start, code)
                if code is None:
                    left.append(dev)

                # DO NOT delete this: multiprocessing.Queue has a small buffer;
                # fetch data early to avoid deadlock
                if not queue.empty():
                    result_count += 1
                    dev, ret = queue.get_nowait()
                    results[dev] = ret
            devs = left

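        # drain any results still buffered in the queue after all workers exit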
        while not queue.empty():
            result_count += 1
            dev, ret = queue.get_nowait()
            results[dev] = ret

        if result_count < self.n_gpus:
            get_logger().warning(WARN_SUBPROCESS_EXIT_WITHOUT_RETURN)

        return results