diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index c0feadb68838d62fc9adad6d82877498d611a22b..8b1b6dc0ff8588a6b6af670b289e45a0cf014677 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -160,6 +160,46 @@ def get_group(id=0):
     return gm[group] if group in gm else None
 
 
+def barrier(group=None):
+    """
+
+    Barrier among all participators in the group.
+
+    Args:
+        group (Group): The group instance returned by new_group, or None for the global default group.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.distributed import init_parallel_env
+
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            init_parallel_env()
+            paddle.distributed.barrier()
+    """
+    if group is not None and not group.is_member():
+        return
+
+    ring_id = 0 if group is None else group.id
+
+    op_type = 'barrier'
+    temp = fill_constant([1], dtype="int32", value="1")
+    if in_dygraph_mode():
+        return core.ops.barrier(temp, temp, 'ring_id', ring_id)
+    if not isinstance(ring_id, int):
+        raise ValueError("The type of 'group' for barrier must be int.")
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [temp]},
+        attrs={'ring_id': ring_id})
+
+
 def new_group(ranks=None, backend=None):
     """
 
@@ -220,7 +260,8 @@ def new_group(ranks=None, backend=None):
         core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id)
     else:
         assert False, ("no cuda device found")
-
+    # need to barrier to construct group
+    barrier(gp)
     return gp
 
 
@@ -838,46 +879,6 @@ def _mp_allreduce(tensor,
        raise NotImplementedError("No support _mp_allreduce in dygraph mode.")
 
 
-def barrier(group=None):
-    """
-
-    Barrier among all participators in the group.
-
-    Args:
-        group (Group): The group instance return by new_group or None for global default group.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            from paddle.distributed import init_parallel_env
-
-            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
-            init_parallel_env()
-            paddle.distributed.barrier()
-    """
-    if group is not None and not group.is_member():
-        return
-
-    ring_id = 0 if group is None else group.id
-
-    op_type = 'barrier'
-    temp = fill_constant([1], dtype="int32", value="1")
-    if in_dygraph_mode():
-        return core.ops.barrier(temp, temp, 'ring_id', ring_id)
-    if not isinstance(ring_id, int):
-        raise ValueError("The type of 'group' for barrier must be int.")
-    helper = LayerHelper(op_type, **locals())
-    helper.append_op(
-        type=op_type,
-        inputs={'X': [temp]},
-        outputs={'Out': [temp]},
-        attrs={'ring_id': ring_id})
-
-
 def _parallel_linear(x,
                      num_rows,
                      num_cols,
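
Not part of the patch: a minimal usage sketch of the relocated `barrier()`, assuming a two-GPU launch via `paddle.distributed.launch`; the script name `demo.py` and the `ranks=[0, 1]` group are illustrative only.

```python
# Hypothetical two-GPU sketch (not in the patch) of the behavior the patch relies on:
# new_group() now ends with barrier(gp), so group construction itself
# synchronizes the ranks that belong to the new group.
# Launch with, e.g.: python -m paddle.distributed.launch --gpus 0,1 demo.py
import paddle
from paddle.distributed import init_parallel_env

paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
init_parallel_env()

# Global barrier on the default group (ring_id 0), as in the docstring example.
paddle.distributed.barrier()

# With the added barrier(gp), member ranks do not return from new_group()
# before all of them have constructed the communicator.
group = paddle.distributed.new_group(ranks=[0, 1])

# Barrier restricted to the newly created group; non-member ranks return immediately.
paddle.distributed.barrier(group)
```
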