process_group.py 6.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

15 16
from collections import OrderedDict

17 18
import paddle
import paddle.fluid.core as core
19

20 21
from ..collective import _get_global_env
from ..collective import _new_ring_id
J
Jiabin Yang 已提交
22
from ...fluid.framework import _non_static_mode
23
from ...fluid.layers.tensor import fill_constant
24
from paddle.fluid.framework import _enable_legacy_dygraph
25 26 27


def get_all_process_groups():
    """Return every process group held in the module-level registry.

    Returns:
        The ``values()`` view of ``_g_process_group_map``; it includes the
        reserved group stored under key 0.
    """
    # Read-only access, so no ``global`` declaration is required.
    return _g_process_group_map.values()
30 31


32
def get_process_group(group_id, g_process_group_map=None):
    """Look up a process group by its registry key.

    Args:
        group_id: Key of the wanted group.
        g_process_group_map: Optional mapping to search instead of the
            module-level registry. Only ``None`` selects the module-level
            registry; an empty mapping is searched as given.

    Returns:
        The group stored under ``group_id``, or ``None`` when the key is
        absent.
    """
    if g_process_group_map is not None:
        return g_process_group_map.get(group_id, None)
    return _g_process_group_map.get(group_id, None)
39 40


J
JZ-LIANG 已提交
41
def get_world_process_group():
    """Return the group stored under key 0 of the module-level registry.

    Raises:
        KeyError: If the registry has no entry for key 0.
    """
    world_key = 0
    return _g_process_group_map[world_key]


46 47 48
def clear_all_process_groups():
    """Discard every registered group and recreate the reserved group 0.

    The registry is rebound to a fresh, empty dict and a new empty
    ``ProcessGroup`` with id 1000 is stored under key 0.
    """
    global _g_process_group_map
    # The registry has to be emptied *before* the reserved group is built:
    # ProcessGroup.__init__ rejects id 1000 while key 0 is still registered.
    _g_process_group_map = {}
    world_group = ProcessGroup(1000, [])
    _g_process_group_map[0] = world_group
50 51 52


def new_process_group(ranks, group_id=None):
    """Return the process group for ``ranks``, creating it when necessary.

    Every registered group except the one under key 0 is compared against
    ``ranks``; the comparison is order-sensitive. If one matches it is
    returned unchanged, otherwise a new ``ProcessGroup`` is created and
    stored in the module-level registry.

    Args:
        ranks: Sequence of global ranks belonging to the group.
        group_id: Id for a newly created group. When ``None`` an id is
            derived from ``_new_ring_id()``, the current number of registered
            groups and an offset of 1000.

    Returns:
        The matching existing group or the newly registered one.
    """
    # A key constructed from ranks is used for avoiding duplication. The
    # separator is essential: without it [1, 23] and [12, 3] would both
    # collapse to "123" and the wrong group would be handed back.
    new_key = '_'.join(map(str, ranks))
    for pg_id, pg in _g_process_group_map.items():
        if pg_id != 0 and '_'.join(map(str, pg.ranks)) == new_key:
            return pg

    # If not matching the existing one, construct a new process group
    if group_id is None:
        # Note: our process group may interfere with the original
        # implementation so the created group id should start from the
        # original _new_ring_id()
        num_groups = len(_g_process_group_map)
        group_id = _new_ring_id() + num_groups + 1000

    new_pg = ProcessGroup(group_id, ranks)
    _g_process_group_map[group_id] = new_pg
    return new_pg
70 71 72


# This implementation borrows heavily from
# Paddle/python/paddle/distributed/collective.py. Fleet also has a collective
# helper, which uses ops to initialize communication, in
# Paddle/python/paddle/distributed/fleet/meta_optimizers/common.py. We follow
# the first one because it seems simpler. This should be enhanced to manage the
# process membership and the instantiation process in a more general way. In
# the future, the process group may handle the communication implementation
# choice.
class ProcessGroup:
    """A group of global ranks identified by a group (ring) id.

    Group id 1000 denotes the group reserved for all ranks, which the
    module-level registry stores under key 0. Constructing any other group
    adds that group's ranks to the reserved group.

    Two groups compare equal when their ids are equal, and the hash is
    derived from the id alone so that it agrees with equality and stays
    stable while ranks are added.
    """

    def __init__(self, group_id, ranks):
        """Create a group that has not been instantiated yet.

        Args:
            group_id: Id of the group; 1000 is reserved for the all-ranks
                group.
            ranks: List of global ranks that belong to the group. The list is
                stored as given (not copied).

        Raises:
            AssertionError: If ``group_id`` is 1000 while the registry
                already holds a group under key 0.
        """
        # Raised explicitly (rather than via ``assert``) so the check also
        # runs under ``python -O``; the exception type is unchanged.
        if group_id == 1000 and get_process_group(0) is not None:
            raise AssertionError(
                "Process group id 1000 is reserved for all ranks."
            )
        self._group_id = group_id
        self._ranks = ranks
        # Add the current ranks into group 0
        if group_id != 1000:
            _g_process_group_map[0].add_ranks(ranks)
        self._is_instantiate = False

    @property
    def id(self):
        """The group id given at construction."""
        return self._group_id

    @property
    def ranks(self):
        """The list of global ranks in this group."""
        return self._ranks

    @property
    def nranks(self):
        """Number of ranks in this group."""
        return len(self._ranks)

    def add_ranks(self, new_ranks):
        """Merge ``new_ranks`` into the group.

        Nothing changes when every rank in ``new_ranks`` is already a member.
        Otherwise the rank list is replaced by the sorted union of the old
        and new ranks. Sorting matters because ``local_rank`` is the position
        in this list and plain ``set`` iteration order is unspecified. The
        previously held list object is left unmodified.

        Raises:
            AssertionError: If new ranks would be added after the group has
                been instantiated.
        """
        current = set(self._ranks)
        incoming = set(new_ranks)
        if incoming <= current:
            return
        if self.is_instantiate():
            raise AssertionError(
                "Cannot add new ranks after instantiating the process group"
            )
        self._ranks = sorted(current | incoming)

    def local_rank(self, global_rank):
        """Return the position of ``global_rank`` within ``ranks``.

        Raises:
            AssertionError: If ``global_rank`` is not a member of the group.
        """
        try:
            return self.ranks.index(global_rank)
        except ValueError:
            raise AssertionError(
                "Rank {} doesn't belong to this group".format(global_rank)
            ) from None

    def is_instantiate(self):
        """Return whether ``instantiate`` has already completed."""
        return self._is_instantiate

    def instantiate(self):
        """Initialise communication for this group (runs at most once).

        For a group with at least two ranks this builds a
        ``core.ParallelStrategy``, initialises an NCCL context on the group's
        ring id, and then issues an all_reduce and an alltoall in dygraph
        mode before switching back to static mode. Smaller groups are only
        marked as instantiated.

        Raises:
            AssertionError: If Paddle is not compiled with CUDA, or if the
                current global rank is not part of the group.
        """
        if self._is_instantiate:
            return
        ring_id = self.id
        genv = _get_global_env()
        global_rank = genv.rank

        if self.nranks >= 2:
            strategy = core.ParallelStrategy()
            strategy.nranks = self.nranks
            strategy.local_rank = self.local_rank(global_rank)
            strategy.trainer_endpoints = [
                genv.trainer_endpoints[i] for i in self.ranks
            ]
            strategy.current_endpoint = genv.current_endpoint
            strategy.nrings = 1

            if not core.is_compiled_with_cuda():
                raise AssertionError("No CUDA device found")
            place = core.CUDAPlace(genv.device_id)
            core.NCCLParallelContext(strategy, place).init_with_ring_id(
                ring_id
            )

            # TODO(shenliang03): This is a temporary solution to solve the problem of
            # hang caused by cross-creation of new_group
            paddle.disable_static()
            _enable_legacy_dygraph()
            paddle.set_device(
                'gpu:%d' % paddle.distributed.ParallelEnv().dev_id
            )
            tmp = (
                paddle.to_tensor([1], dtype="int32")
                if _non_static_mode()
                else fill_constant([0], dtype="int32", value="1")
            )
            paddle.distributed.all_reduce(tmp, sync_op=True, group=self)
            paddle.distributed.wait(tmp, group=self)

            # TODO(shenliang03) AlltoAll create communicator
            alltoall_tmp = paddle.empty(
                shape=[self.nranks, self.nranks], dtype="int32"
            )
            # The result is not needed; the call is made for its effect on
            # the ring (see the TODO above).
            paddle._legacy_C_ops.alltoall(
                alltoall_tmp, 'use_calc_stream', True, 'ring_id', ring_id
            )
            paddle.device.cuda.synchronize()
            paddle.enable_static()
        self._is_instantiate = True

    def is_member(self):
        """Always returns True."""
        return True

    def __eq__(self, other):
        """Groups are equal when both are ``ProcessGroup`` with the same id."""
        return isinstance(other, ProcessGroup) and self.id == other.id

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return "id: {}, nranks: {}, ranks: {}.".format(
            self.id, self.nranks, ", ".join(map(str, self.ranks))
        )

    def __hash__(self):
        # Hash only what __eq__ compares. Hashing the string form (which
        # contains the mutable rank list) let equal groups hash differently
        # and changed a group's hash whenever ranks were added.
        return hash(self.id)

198

199
# Module-level registry of process groups, keyed by group id.
# Key 0 is reserved for the group that represents all ranks. That group starts
# out empty and ranks are added to it automatically as other groups are
# constructed.
_g_process_group_map = OrderedDict()
# The registry must exist (and must not yet contain key 0) before the reserved
# group is built, because ProcessGroup.__init__ looks key 0 up in it.
_g_process_group_map[0] = ProcessGroup(1000, [])