Unverified · Commit 99504cbb authored by LiYuRio, committed by GitHub

move broadcast, reduce, send, recv, reduce_scatter, scatter, alltoall (#47255)

Parent ef67c8a8
......@@ -233,6 +233,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& opts) {
return Broadcast(inputs, outputs, opts, true);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& opts,
bool sync_op) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
......@@ -442,6 +450,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const ReduceOptions& opts) {
return Reduce(inputs, outputs, opts, true);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const ReduceOptions& opts,
bool sync_op) {
std::shared_ptr<ReduceGlooTask> task;
auto tag = next_tag();
auto context = get_context();
......@@ -497,6 +513,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts) {
return Scatter(in_tensors, out_tensors, opts, true);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions& opts,
bool sync_op) {
std::shared_ptr<ScatterGlooTask> task;
auto tag = next_tag();
auto context = get_context();
......
......@@ -113,6 +113,12 @@ class ProcessGroupGloo : public ProcessGroup {
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
const BroadcastOptions& opts,
bool sync_op) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& inputs,
std::vector<phi::DenseTensor>& outputs,
......@@ -131,11 +137,23 @@ class ProcessGroupGloo : public ProcessGroup {
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts,
bool sync_op) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&,
bool sync_op) override;
std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
......
......@@ -27,31 +27,33 @@ from paddle.distributed.fleet.dataset import InMemoryDataset # noqa: F401
from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401
from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401
from .collective import broadcast # noqa: F401
from .collective import all_reduce # noqa: F401
from .collective import reduce # noqa: F401
from .collective import all_gather # noqa: F401
from .collective import all_gather_object # noqa: F401
from .collective import scatter # noqa: F401
from .collective import barrier # noqa: F401
from .collective import ReduceOp # noqa: F401
from .collective import split # noqa: F401
from .collective import new_group # noqa: F401
from .collective import alltoall # noqa: F401
from .collective import recv # noqa: F401
from .collective import get_group # noqa: F401
from .collective import send # noqa: F401
from .collective import wait # noqa: F401
from .collective import is_initialized # noqa: F401
from .collective import destroy_process_group # noqa: F401
from .collective import alltoall_single # noqa: F401
from .collective import isend # noqa: F401
from .collective import irecv # noqa: F401
from .collective import batch_isend_irecv # noqa: F401
from .collective import P2POp # noqa: F401
from .collective import reduce_scatter # noqa: F401
from .communication import stream # noqa: F401
from .communication import (
stream,
ReduceOp,
all_reduce,
alltoall,
alltoall_single,
broadcast,
reduce,
send,
scatter,
isend,
recv,
irecv,
batch_isend_irecv,
P2POp,
reduce_scatter,
is_initialized,
destroy_process_group,
get_group,
) # noqa: F401
from .auto_parallel import shard_op # noqa: F401
from .auto_parallel import shard_tensor # noqa: F401
......@@ -109,5 +111,4 @@ __all__ = [ # noqa
"irecv",
"reduce_scatter",
"rpc",
"stream",
]
......@@ -17,7 +17,6 @@ import os
import pickle
import io
import datetime
import time
from ..fluid.layer_helper import LayerHelper
from ..fluid.framework import in_dygraph_mode
from ..fluid.framework import _non_static_mode
......@@ -26,7 +25,6 @@ from ..fluid.layers.tensor import fill_constant
import paddle
import paddle.fluid.core as core
from paddle import _legacy_C_ops
import contextlib
from .fleet.layers.mpu.mp_ops import split # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401
from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401
......@@ -39,9 +37,7 @@ from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy # noqa: F401
from .fleet.layers.mpu.mp_ops import _linear # noqa: F401
from .fleet.layers.mpu.mp_ops import _parallel_linear # noqa: F401
from .fleet.layers.mpu.mp_ops import _parallel_embedding # noqa: F401
from .communication.group import Group, _add_new_group
from .communication.all_reduce import all_reduce # noqa: F401
from .communication.reduce import _get_reduce_op, ReduceOp
from .communication.group import Group, _add_new_group, is_initialized
__all__ = []
......@@ -144,30 +140,6 @@ def _new_ring_id():
return len(_get_group_map()) + max(_get_global_env().nrings, 9)
def get_group(id=0):
"""
Get group instance by group id.
Args:
id (int): the group id. Default value is 0.
Returns:
Group: the group instance.
Examples:
.. code-block:: python
...
gid = paddle.distributed.new_group([2,4,6])
paddle.distributed.get_group(gid.id)
"""
gm = _get_group_map()
return gm[id] if id in gm else None
def _new_process_group_impl(
backend,
store,
......@@ -298,45 +270,6 @@ def _set_custom_gid(gid):
_custom_gid = gid
def _barrier_by_tcp_store(group_name, store, timeout):
global_rank = paddle.distributed.get_rank()
global_world_size = paddle.distributed.get_world_size()
if global_world_size < 2:
return
barrier_prefix = "Barrier/" + group_name + "/"
is_master = global_rank == 0
def _check_keys_ready(wait_keys):
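# Poll the TCP store until every key in wait_keys has been set to 1 by its
# worker; raise RuntimeError if the configured timeout elapses first.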
start_time = time.time()
while len(wait_keys) > 0:
time.sleep(0.1)
elapse_time = time.time() - start_time
if datetime.timedelta(seconds=elapse_time) > timeout:
raise RuntimeError(
"Timeout while initializing process group {}."
"Keys {} are not ready sinck rank {} is waiting them."
"Two reason may cause this error:\n 1. The create process group api should be called by all ranks.\n"
" 2. Try to increase the waiting time.\n".format(
group_name, wait_keys, global_rank
)
)
wait_keys = list(
filter(lambda key: int(store.get(key)) != 1, wait_keys)
)
# all the workers set their exiting key and exit
# the master will wait for all workers' exiting key, ensure to exit in the end
if is_master:
wait_keys = [
barrier_prefix + str(rank) for rank in range(1, global_world_size)
]
_check_keys_ready(wait_keys)
else:
store.add(barrier_prefix + str(global_rank), 1)
def new_group(ranks=None, backend=None, timeout=_default_timeout):
"""
......@@ -479,77 +412,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
return gp
def is_initialized():
"""
Check whether the distributed environment has been initialized
Returns:
`True` if distributed environment has been initialized, otherwise `False`.
Examples:
.. code-block:: python
# required: distributed
import paddle
print(paddle.distributed.is_initialized())
# False
paddle.distributed.init_parallel_env()
print(paddle.distributed.is_initialized())
# True
"""
global _group_map_by_name
return _default_group_name in _group_map_by_name
def destroy_process_group(group=None):
"""
Destroy a given group for communication
Args:
group (ProcessGroup, optional): The group to be destroyed. If group is None, all process groups, including
the default group, will be destroyed and the distributed
environment will be deinitialized.
Returns : None
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
group = dist.new_group([0, 1])
dist.destroy_process_group(group)
print(dist.is_initialized())
# True
dist.destroy_process_group()
print(dist.is_initialized())
# False
"""
global _group_map
global _group_map_by_name
pg = _get_default_group() if group is None else group
assert _group_map.get(pg.id, None) is not None, "Invalid group."
if group is None:
_group_map.clear()
_group_map_by_name.clear()
_group_map_backend.clear()
else:
del _group_map[pg.id]
del _group_map_by_name[pg.name]
del _group_map_backend[pg]
def wait(tensor, group=None, use_calc_stream=True):
"""
......@@ -620,256 +482,6 @@ def _sync_comm_stream(tensor, ring_id=0):
)
def broadcast(tensor, src, group=None, sync_op=True):
"""
Broadcast a tensor from the source to all others.
As shown below, one process is started with a GPU and GPU0 owns data 0. Through the broadcast operator,
data 0 will be sent to all GPUs from GPU0.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/broadcast.png
:width: 800
:alt: broadcast
:align: center
Args:
tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
else:
data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
dist.broadcast(data, src=1)
print(data)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs)
"""
if group is not None and not group.is_member():
return
if not isinstance(src, int):
raise ValueError("src should be int.")
if in_dygraph_mode():
group = _get_default_group() if group is None else group
gsrc = group.get_group_rank(src)
assert gsrc >= 0, "src rank out of group, need global rank"
task = group.process_group.broadcast(tensor, gsrc)
if sync_op:
task.wait()
return None
else:
return task
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
assert gsrc >= 0, "src rank out of group, need global rank"
if _non_static_mode():
return _legacy_C_ops.c_broadcast(
tensor,
tensor,
'root',
gsrc,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
)
op_type = 'c_broadcast'
check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'broadcast',
)
helper = LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
outputs={'Out': [tensor]},
attrs={
'root': gsrc,
'use_calc_stream': use_calc_stream,
'ring_id': ring_id,
},
)
def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
"""
Reduce a tensor to the destination from all others. As shown below, one process is started with a GPU and the data of this process is represented
by its group rank. The destination of the reduce operator is GPU0 and the reduce operation is sum. Through the reduce operator,
GPU0 will own the sum of the data from all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/reduce.png
:width: 800
:alt: reduce
:align: center
Args:
tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank id.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
else:
data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
dist.reduce(data, dst=0)
print(data)
# [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
return
if in_dygraph_mode():
op_type = _get_reduce_op(op, "reduce")
group = _get_default_group() if group is None else group
gdst = group.get_group_rank(dst)
assert gdst >= 0, "dst rank out of group, need global rank"
task = group.process_group.reduce(tensor, gdst, op_type)
if sync_op:
task.wait()
return None
else:
return task
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
gdst = dst if group is None else group.get_group_rank(dst)
assert gdst >= 0, "dst rank out of group, need global rank"
if _non_static_mode():
if op == ReduceOp.SUM:
return _legacy_C_ops.c_reduce_sum(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.MAX:
return _legacy_C_ops.c_reduce_max(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.MIN:
return _legacy_C_ops.c_reduce_min(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.PROD:
return _legacy_C_ops.c_reduce_prod(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
else:
raise ValueError("Unknown parameter: {}.".format(op))
op_type = 'c_reduce'
check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'reduce',
)
if op == ReduceOp.SUM:
op_type = 'c_reduce_sum'
elif op == ReduceOp.MAX:
op_type = 'c_reduce_max'
elif op == ReduceOp.MIN:
op_type = 'c_reduce_min'
elif op == ReduceOp.PROD:
op_type = 'c_reduce_prod'
helper = LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'use_calc_stream': use_calc_stream,
'root_id': gdst,
},
)
def all_gather(tensor_list, tensor, group=None, sync_op=True):
"""
......@@ -1090,873 +702,3 @@ def all_gather_object(object_list, obj, group=None):
object_list.append(
_convert_tensor_to_object(tensor, list_len_of_tensor[i])
)
def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
"""
Scatter a tensor to all participators. As shown below, one process is started with a GPU and the source of the scatter
is GPU0. Through the scatter operator, the data in GPU0 will be sent evenly to all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/scatter.png
:width: 800
:alt: scatter
:align: center
Args:
tensor (Tensor): The output Tensor. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None.
src (int): The source rank id. Default value is 0.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data1 = paddle.to_tensor([7, 8, 9])
data2 = paddle.to_tensor([10, 11, 12])
dist.scatter(data1, src=1)
else:
data1 = paddle.to_tensor([1, 2, 3])
data2 = paddle.to_tensor([4, 5, 6])
dist.scatter(data1, tensor_list=[data1, data2], src=1)
print(data1, data2)
# [1, 2, 3] [10, 11, 12] (2 GPUs, out for rank 0)
# [4, 5, 6] [4, 5, 6] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
return
if not isinstance(src, int):
raise ValueError("src should be int.")
if in_dygraph_mode():
group = _get_default_group() if group is None else group
gsrc = group.get_group_rank(src)
rank = group.rank
nranks = group.nranks
else:
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
rank = _get_global_group().rank if group is None else group.rank
nranks = _get_global_group().nranks if group is None else group.nranks
assert gsrc >= 0, "src rank out of group, need global rank"
if rank != gsrc:
tensor_list = []
for _ in range(nranks):
tensor_list.append(tensor)
temp = paddle.concat(tensor_list, axis=0)
if in_dygraph_mode():
task = group.process_group.scatter(temp, tensor, gsrc)
if sync_op:
task.wait()
return None
else:
return task
use_calc_stream = sync_op
if _non_static_mode():
return _legacy_C_ops.c_scatter(
temp,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'nranks',
nranks,
'root',
gsrc,
)
op_type = 'c_scatter'
check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'scatter',
)
helper = LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [temp]},
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'root': gsrc,
'use_calc_stream': use_calc_stream,
'nranks': nranks,
},
)
def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
"""
Scatter tensors in in_tensor_list to all participators evenly and gather the result tensors in out_tensor_list.
As shown below, the in_tensor_list in GPU0 includes 0_0 and 0_1, and GPU1 includes 1_0 and 1_1.
Through the alltoall operator, 0_0 in GPU0 will be sent to GPU0 and 0_1 to GPU1, while 1_0 in GPU1 will be sent to GPU0 and 1_1 to GPU1.
Finally, the out_tensor_list in GPU0 includes 0_0 and 1_0, and GPU1 includes 0_1 and 1_1.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/alltoall.png
:width: 800
:alt: alltoall
:align: center
Args:
in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the
data type of the input Tensors.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
out_tensor_list = []
if dist.get_rank() == 0:
data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]])
else:
data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]])
data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]])
dist.alltoall([data1, data2], out_tensor_list)
print(out_tensor_list)
# [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
# [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
return
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
else:
ring_id = 0 if group is None else group.id
temp = paddle.concat(in_tensor_list, axis=0)
nranks = len(in_tensor_list)
if in_dygraph_mode():
if len(out_tensor_list) == 0:
tensor_shape = list(in_tensor_list[0].shape)
tensor_shape[0] *= nranks
out = paddle.empty(tensor_shape, in_tensor_list[0].dtype)
else:
out = paddle.concat(out_tensor_list, axis=0)
task = group.process_group.alltoall(temp, out)
task.wait()
out_tensor_list.clear()
out_tensor_list.extend(paddle.split(out, nranks, 0))
return
use_calc_stream = sync_op
if _non_static_mode():
out = _legacy_C_ops.alltoall(
temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
else:
op_type = 'alltoall'
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(
dtype=in_tensor_list[0].dtype
)
if not isinstance(in_tensor_list, list):
raise ValueError(
"The type of 'in_tensor_list' for all_to_all " "should be list."
)
for elem in in_tensor_list:
check_variable_and_dtype(
elem,
'in_tensor_list',
['float16', 'float32', 'float64', 'int32', 'int64'],
'all_to_all',
)
if not isinstance(out_tensor_list, list):
raise ValueError(
"The type of 'out_tensor_list' for all_to_all "
"should be list."
)
if len(out_tensor_list) != 0:
raise ValueError(
"The 'out_tensor_list' for all_to_all " "must be an empty list."
)
helper.append_op(
type=op_type,
inputs={'X': [temp]},
outputs={'Out': [out]},
attrs={
'ring_id': ring_id,
'use_calc_stream': use_calc_stream,
},
)
out_tensor_list.extend(paddle.split(out, nranks, 0))
def alltoall_single(
in_tensor,
out_tensor,
in_split_sizes=None,
out_split_sizes=None,
group=None,
sync_op=True,
):
"""
Scatter a single input tensor to all participators and gather the received tensors in out_tensor.
Note:
``alltoall_single`` is only supported in eager mode.
Args:
in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
must be divisible by group size and ``in_tensor`` will be scattered evenly to all participators. Default: None.
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
must be divisible by group size and ``out_tensor`` will be gathered evenly from all participators. Default: None.
group (Group, optional): The group instance returned by ``new_group`` or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None, if ``sync_op`` is set to ``True``; ``Task`` of ``group``, if ``sync_op`` is set to ``False``.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
size = dist.get_world_size()
# case 1 (2 GPUs)
data = paddle.arange(2, dtype='int64') + rank * 2
# data for rank 0: [0, 1]
# data for rank 1: [2, 3]
output = paddle.empty([2], dtype='int64')
dist.alltoall_single(data, output)
print(output)
# output for rank 0: [0, 2]
# output for rank 1: [1, 3]
# case 2 (2 GPUs)
in_split_sizes = [i + 1 for i in range(size)]
# in_split_sizes for rank 0: [1, 2]
# in_split_sizes for rank 1: [1, 2]
out_split_sizes = [rank + 1 for i in range(size)]
# out_split_sizes for rank 0: [1, 1]
# out_split_sizes for rank 1: [2, 2]
data = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank
# data for rank 0: [[0., 0.], [0., 0.], [0., 0.]]
# data for rank 1: [[1., 1.], [1., 1.], [1., 1.]]
output = paddle.empty([(rank + 1) * size, size], dtype='float32')
group = dist.new_group([0, 1])
task = dist.alltoall_single(data,
output,
in_split_sizes,
out_split_sizes,
sync_op=False,
group=group)
task.wait()
print(output)
# output for rank 0: [[0., 0.], [1., 1.]]
# output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
"""
if group is not None and not group.is_member():
return
assert in_dygraph_mode(), "Only support alltoall_single in eager mode."
# _check_single_tensor
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
in_split_sizes = [] if in_split_sizes is None else in_split_sizes
out_split_sizes = [] if out_split_sizes is None else out_split_sizes
task = group.process_group.alltoall_single(
in_tensor, out_tensor, in_split_sizes, out_split_sizes
)
if sync_op:
task.wait()
return
else:
return task
def _get_group_rank(global_rank, group=None):
return global_rank if group is None else group.get_group_rank(global_rank)
def send(tensor, dst=0, group=None, sync_op=True):
"""
Send a tensor to the receiver.
Args:
tensor (Tensor): The Tensor to send. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
dist.send(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
dist.recv(data, src=0)
print(data)
# [7, 8, 9] (2 GPUs)
"""
if group is not None and not group.is_member():
return
dst = _get_group_rank(dst, group)
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
task = group.process_group.send(tensor, dst)
if sync_op:
task.wait()
return None
else:
return task
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
if _non_static_mode():
return _legacy_C_ops.send_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
dst,
)
op_type = 'send_v2'
check_variable_and_dtype(
tensor,
'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'],
'send',
)
helper = LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
attrs={
'ring_id': ring_id,
'peer': dst,
'use_calc_stream': use_calc_stream,
},
)
def recv(tensor, src=0, group=None, sync_op=True):
"""
Receive a tensor from the sender.
Args:
tensor (Tensor): The Tensor to receive. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
dist.send(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
dist.recv(data, src=0)
print(data)
# [7, 8, 9] (2 GPUs)
"""
if group is not None and not group.is_member():
return
src = _get_group_rank(src, group)
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
task = group.process_group.recv(tensor, src)
if sync_op:
task.wait()
return None
else:
return task
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
if _non_static_mode():
return _legacy_C_ops.recv_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
src,
'dtype',
tensor.dtype,
'out_shape',
tensor.shape,
)
op_type = 'recv_v2'
check_variable_and_dtype(
tensor,
'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'],
'recv',
)
helper = LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'peer': src,
'out_shape': tensor.shape,
'dtype': tensor.dtype,
'use_calc_stream': use_calc_stream,
},
)
def _check_single_tensor(tensor, tensor_name):
if not isinstance(tensor, (core.eager.Tensor, paddle.Tensor)):
raise RuntimeError(
"Invalid function argument. Expected parameter {}"
"to be of type paddle.Tensor, but it's {}".format(
tensor_name, type(tensor)
)
)
def _check_tensor_list(tensor_list, tensor_name):
if not isinstance(tensor_list, list) or not all(
isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list
):
raise RuntimeError(
"Invalid function argument. Expected parameter {}"
"to be of type paddle.Tensor".format(tensor_name)
)
def isend(tensor, dst, group=None):
"""
Send a tensor asynchronously.
Args:
tensor (Tensor): The Tensor to send. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
Returns:
A distributed task object.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
task = dist.isend(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
task = dist.irecv(data, src=0)
task.wait()
print(data)
# [7, 8, 9] (2 GPUs)
"""
_check_single_tensor(tensor, "tensor")
if group is not None and not group.is_member():
return
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
group_dst_rank = group.get_group_rank(dst)
assert group_dst_rank >= 0, "dst rank out of group, need global rank"
return group.process_group.send(tensor, group_dst_rank)
else:
raise RuntimeError("Only support eager dygraph mode.")
def irecv(tensor, src=None, group=None):
"""
Receive a tensor from the sender asynchronously.
Args:
tensor (Tensor): The Tensor to receive. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
Returns:
A distributed task object.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
task = dist.isend(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
task = dist.irecv(data, src=0)
task.wait()
print(data)
# [7, 8, 9] (2 GPUs)
"""
_check_single_tensor(tensor, "tensor")
if group is not None and not group.is_member():
return
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
group_src_rank = group.get_group_rank(src)
assert group_src_rank >= 0, "src rank out of group, need global rank"
return group.process_group.recv(tensor, group_src_rank)
else:
raise RuntimeError("Only support eager dygraph mode.")
class P2POp(object):
"""
A class that describes a point-to-point operation for ``batch_isend_irecv``.
It holds the type of the P2P operation, the communication buffer, the peer rank and the
group. Instances of this class will be passed to
``paddle.distributed.batch_isend_irecv`` for point-to-point communication.
Args:
op (callable): A function to send data to or receive data from a peer process.
The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
tensor (Tensor): Tensor to send or receive.
peer (int): The destination or source rank.
group (Group, optional): The group instance returned by new_group or None for the global
default group. Default: None.
"""
def __init__(self, op, tensor, peer, group=None):
if op not in [isend, irecv]:
raise RuntimeError(
"Invalid ``op`` function. Expected ``op`` "
"to be of type ``paddle.distributed.isend`` or "
"``paddle.distributed.irecv``."
)
_check_single_tensor(tensor, "tensor")
self.op = op
self.tensor = tensor
self.peer = peer
self.group = _get_default_group() if group is None else group
@contextlib.contextmanager
def _with_batch_p2p_guard(backend):
if backend == "nccl":
core.ProcessGroupNCCL.group_start()
try:
yield
finally:
if backend == "nccl":
core.ProcessGroupNCCL.group_end()
def _check_p2p_op_list(p2p_op_list):
"""
Helper to check that the ``p2p_op_list`` is a list of P2POp instances and
all ops use the same backend.
"""
if not isinstance(p2p_op_list, list) or not all(
isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list
):
raise RuntimeError(
"Invalid ``p2p_op_list``. Each op is expected to "
"to be of type ``paddle.distributed.P2POp``."
)
backend = _group_map_backend[p2p_op_list[0].group]
if not all(
backend == _group_map_backend[p2p_op.group] for p2p_op in p2p_op_list
):
raise RuntimeError("All groups need to use the same backend.")
def batch_isend_irecv(p2p_op_list):
"""
Send or receive a batch of tensors asynchronously and return a list of requests.
Process each of the point-to-point operations in ``p2p_op_list`` and return the
corresponding tasks. Only NCCL is currently supported.
Args:
p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is
``paddle.distributed.P2POp``). The order of the isend/irecv in the list
matters and it needs to match with corresponding isend/irecv on the
remote end.
Returns:
A list of distributed tasks returned by calling the corresponding
op in the op_list.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
world_size = dist.get_world_size()
send_t = paddle.arange(2) + rank
# paddle.tensor([0, 1]) # Rank-0
# paddle.tensor([1, 2]) # Rank-1
recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size)
tasks = dist.batch_isend_irecv([send_op, recv_op])
for task in tasks:
task.wait()
print(recv_t)
# paddle.tensor([1, 2]) # Rank-0
# paddle.tensor([0, 1]) # Rank-1
"""
_check_p2p_op_list(p2p_op_list)
group = p2p_op_list[0].group
if group is not None and not group.is_member():
return
if in_dygraph_mode():
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
tasks = []
with _with_batch_p2p_guard(backend):
for p2p_op in p2p_op_list:
op = p2p_op.op
tensor = p2p_op.tensor
peer = p2p_op.peer
comm_group = p2p_op.group
task = op(tensor, peer, comm_group)
if task is not None:
tasks.append(task)
return tasks
else:
raise RuntimeError("Don't support static graph mode currently.")
def reduce_scatter(
tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True
):
"""
Reduces, then scatters a list of tensors to all processes in a group
Args:
tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (Group, optional): The group instance returned by new_group or None for the global
default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Async task handle, if sync_op is set to False.
None, if sync_op is True or if the current rank is not part of the group.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data1 = paddle.to_tensor([0, 1])
data2 = paddle.to_tensor([2, 3])
else:
data1 = paddle.to_tensor([4, 5])
data2 = paddle.to_tensor([6, 7])
dist.reduce_scatter(data1, [data1, data2])
print(data1)
# [4, 6] (2 GPUs, out for rank 0)
# [8, 10] (2 GPUs, out for rank 1)
"""
_check_single_tensor(tensor, "tensor")
_check_tensor_list(tensor_list, "tensor_list")
if group is not None and not group.is_member():
return
if in_dygraph_mode():
op_type = _get_reduce_op(op, "reduce_scatter")
group = _get_default_group() if group is None else group
backend = _group_map_backend[group]
assert backend != 'gloo', "backend gloo is not supported yet"
temp = paddle.concat(tensor_list, axis=0)
task = group.process_group._reduce_scatter_base(tensor, temp, op_type)
if sync_op:
task.wait()
return None
else:
return task
else:
raise RuntimeError("Don't support static graph mode currently.")
def _reduce_scatter_base(
output, input, op=ReduceOp.SUM, group=None, sync_op=True
):
"""
Reduces, then scatters a flattened tensor to all processes in a group.
Args:
output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Async task handle, if sync_op is set to False.
None, if sync_op is True or if the current rank is not part of the group.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
data = paddle.arange(4) + rank
# [0, 1, 2, 3] (2 GPUs, for rank 0)
# [1, 2, 3, 4] (2 GPUs, for rank 1)
output = paddle.empty(shape=[2], dtype=data.dtype)
dist.collective._reduce_scatter_base(output, data)
print(output)
# [1, 3] (2 GPUs, out for rank 0)
# [5, 7] (2 GPUs, out for rank 1)
"""
_check_single_tensor(output, "output")
_check_single_tensor(input, "input")
if group is not None and not group.is_member():
return
if in_dygraph_mode():
op_type = _get_reduce_op(op, "_reduce_scatter_base")
group = _get_default_group() if group is None else group
task = group.process_group._reduce_scatter_base(output, input, op_type)
if sync_op:
task.wait()
return None
else:
return task
else:
raise RuntimeError("Don't support static graph mode currently.")
......@@ -11,3 +11,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .all_reduce import all_reduce
from .broadcast import broadcast
from .reduce import reduce, ReduceOp
from .send import send, isend
from .recv import recv, irecv
from .scatter import scatter
from .batch_isend_irecv import batch_isend_irecv, P2POp
from .reduce_scatter import reduce_scatter
from .all_to_all import alltoall, alltoall_single
from .group import is_initialized, destroy_process_group, get_group
__all__ = [
"ReduceOp",
"all_reduce",
"alltoall",
"alltoall_single",
"broadcast",
"reduce",
"send",
"scatter",
"isend",
"recv",
"irecv",
"batch_isend_irecv",
"P2POp",
"reduce_scatter",
"is_initialized",
"destroy_process_group",
"get_group",
]
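# Illustrative usage sketch (not part of this commit): after the move, these entry
# points remain reachable from the top-level package, e.g.
#
#     import paddle
#     import paddle.distributed as dist
#
#     dist.init_parallel_env()
#     data = paddle.to_tensor([1, 2, 3])
#     dist.broadcast(data, src=0)  # now re-exported from paddle.distributed.communication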
......@@ -14,7 +14,7 @@
import paddle
import paddle.fluid.framework as framework
from paddle.distributed.communication import stream as stream
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.reduce import ReduceOp
......@@ -63,6 +63,8 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True):
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
if op == ReduceOp.SUM:
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
"""
Scatter tensors in in_tensor_list to all participators evenly and gather the result tensors in out_tensor_list.
As shown below, the in_tensor_list in GPU0 includes 0_0 and 0_1, and GPU1 includes 1_0 and 1_1.
Through the alltoall operator, 0_0 in GPU0 will be sent to GPU0 and 0_1 to GPU1, while 1_0 in GPU1 will be sent to GPU0 and 1_1 to GPU1.
Finally, the out_tensor_list in GPU0 includes 0_0 and 1_0, and GPU1 includes 0_1 and 1_1.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/alltoall.png
:width: 800
:alt: alltoall
:align: center
Args:
in_tensor_list (List[Tensor]): List of tensors to scatter one per rank. The data type of each tensor
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
out_tensor_list (List[Tensor]): List of tensors to be gathered one per rank. The data type of each tensor should be the same as the input tensors.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
out_tensor_list = []
if dist.get_rank() == 0:
data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]])
else:
data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]])
data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]])
dist.alltoall([data1, data2], out_tensor_list)
print(out_tensor_list)
# [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
# [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.alltoall(
out_tensor_list, in_tensor_list, group, sync_op, False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
temp = paddle.concat(in_tensor_list, axis=0)
nranks = len(in_tensor_list)
use_calc_stream = sync_op
out = paddle._legacy_C_ops.alltoall(
temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
out_tensor_list.extend(paddle.split(out, nranks, 0))
def alltoall_single(
in_tensor,
out_tensor,
in_split_sizes=None,
out_split_sizes=None,
group=None,
sync_op=True,
):
"""
Scatter a single input tensor to all participators and gather the received tensors in out_tensor.
Note:
``alltoall_single`` is only supported in eager mode.
Args:
in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
must be divisible by group size and ``in_tensor`` will be scattered evenly to all participators. Default: None.
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
must be divisible by group size and ``out_tensor`` will be gathered evenly from all participators. Default: None.
group (Group, optional): The group instance returned by ``new_group`` or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
size = dist.get_world_size()
# case 1 (2 GPUs)
data = paddle.arange(2, dtype='int64') + rank * 2
# data for rank 0: [0, 1]
# data for rank 1: [2, 3]
output = paddle.empty([2], dtype='int64')
dist.alltoall_single(data, output)
print(output)
# output for rank 0: [0, 2]
# output for rank 1: [1, 3]
# case 2 (2 GPUs)
in_split_sizes = [i + 1 for i in range(size)]
# in_split_sizes for rank 0: [1, 2]
# in_split_sizes for rank 1: [1, 2]
out_split_sizes = [rank + 1 for i in range(size)]
# out_split_sizes for rank 0: [1, 1]
# out_split_sizes for rank 1: [2, 2]
data = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank
# data for rank 0: [[0., 0.], [0., 0.], [0., 0.]]
# data for rank 1: [[1., 1.], [1., 1.], [1., 1.]]
output = paddle.empty([(rank + 1) * size, size], dtype='float32')
group = dist.new_group([0, 1])
task = dist.alltoall_single(data,
output,
in_split_sizes,
out_split_sizes,
sync_op=False,
group=group)
task.wait()
print(output)
# output for rank 0: [[0., 0.], [1., 1.]]
# output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
"""
if not framework._in_legacy_dygraph():
return stream.alltoall_single(
out_tensor,
in_tensor,
out_split_sizes,
in_split_sizes,
group,
sync_op,
False,
)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import paddle.distributed as dist
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
)
class P2POp(object):
"""
A class that describes a point-to-point operation for ``batch_isend_irecv``.
It holds the type of the P2P operation, the communication buffer, the peer rank and the
group. Instances of this class will be passed to
``paddle.distributed.batch_isend_irecv`` for point-to-point communication.
Args:
op (callable): A function to send data to or receive data from a peer process.
The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
tensor (Tensor): Tensor to send or receive.
peer (int): The destination or source rank.
group (Group, optional): The group instance returned by new_group or None for the global
default group. Default: None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
world_size = dist.get_world_size()
send_t = paddle.arange(2) + rank
# paddle.tensor([0, 1]) # Rank-0
# paddle.tensor([1, 2]) # Rank-1
recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size)
"""
def __init__(self, op, tensor, peer, group=None):
if op not in [dist.isend, dist.irecv]:
raise RuntimeError(
"Invalid ``op`` function. Expected ``op`` "
"to be of type ``paddle.distributed.isend`` or "
"``paddle.distributed.irecv``."
)
self.op = op
self.tensor = tensor
self.peer = peer
self.group = _get_global_group() if group is None else group
@contextlib.contextmanager
def _with_batch_p2p_guard(backend):
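# For the NCCL backend, the batched isend/irecv calls issued inside this guard are
# wrapped in a group_start()/group_end() pair so they are submitted together; other
# backends pass through unchanged.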
if backend == "NCCL":
core.ProcessGroupNCCL.group_start()
try:
yield
finally:
if backend == "NCCL":
core.ProcessGroupNCCL.group_end()
def _check_p2p_op_list(p2p_op_list):
"""
Helper to check that the ``p2p_op_list`` is a list of P2POp instances and
all ops use the same backend.
"""
if not isinstance(p2p_op_list, list) or not all(
isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list
):
raise RuntimeError(
"Invalid ``p2p_op_list``. Each op is expected to "
"to be of type ``paddle.distributed.P2POp``."
)
backend = p2p_op_list[0].group.backend
if not all(backend == p2p_op.group.backend for p2p_op in p2p_op_list):
raise RuntimeError("All groups need to use the same backend.")
def batch_isend_irecv(p2p_op_list):
"""
Send or receive a batch of tensors asynchronously and return a list of requests.
Process each of the point-to-point operations in ``p2p_op_list`` and return the
corresponding tasks. Only NCCL is currently supported.
Args:
p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is
``paddle.distributed.P2POp``). The order of the isend/irecv in the list
matters and it needs to match with corresponding isend/irecv on the
remote end.
Returns:
A list of distributed tasks returned by calling the corresponding
op in the op_list.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
world_size = dist.get_world_size()
send_t = paddle.arange(2) + rank
# paddle.tensor([0, 1]) # Rank-0
# paddle.tensor([1, 2]) # Rank-1
recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size)
tasks = dist.batch_isend_irecv([send_op, recv_op])
for task in tasks:
task.wait()
print(recv_t)
# paddle.tensor([1, 2]) # Rank-0
# paddle.tensor([0, 1]) # Rank-1
"""
_check_p2p_op_list(p2p_op_list)
group = p2p_op_list[0].group
if _warn_cur_rank_not_in_group(group):
return
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
backend = group.backend
tasks = []
with _with_batch_p2p_guard(backend):
for p2p_op in p2p_op_list:
op = p2p_op.op
tensor = p2p_op.tensor
peer = p2p_op.peer
comm_group = p2p_op.group
task = op(tensor, peer, comm_group)
if task is not None:
tasks.append(task)
return tasks
else:
raise RuntimeError("Don't support static graph mode currently.")
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
def broadcast(tensor, src, group=None, sync_op=True):
"""
Broadcast a tensor from the source to all others.
As shown below, one process is started with a GPU and GPU0 owns data 0. Through the broadcast operator,
data 0 will be sent to all GPUs from GPU0.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/broadcast.png
:width: 800
:alt: broadcast
:align: center
Args:
tensor (Tensor): The tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank in global view.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
else:
data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
dist.broadcast(data, src=1)
print(data)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.broadcast(
tensor,
src,
group=group,
sync_op=sync_op,
use_calc_stream=False,
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
assert gsrc >= 0, "src rank out of group, need global rank"
return paddle._legacy_C_ops.c_broadcast(
tensor,
tensor,
'root',
gsrc,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
)
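For reference, a minimal sketch (not part of this commit) of the asynchronous path the new wrapper exposes: with sync_op=False the call returns the task produced by stream.broadcast, which the caller waits on explicitly.
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
if dist.get_rank() == 0:
    data = paddle.to_tensor([4, 5, 6])
else:
    data = paddle.to_tensor([0, 0, 0])
task = dist.broadcast(data, src=0, sync_op=False)  # returns a task instead of blocking
task.wait()  # synchronize explicitly before reading the result
print(data)  # [4, 5, 6] on every rank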
......@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import paddle.distributed as dist
class Group:
"""
......@@ -50,6 +53,10 @@ class Group:
def world_size(self):
return self._world_size
@property
def backend(self):
return self._pg.name()
@property
def id(self):
return self._id
......@@ -94,3 +101,129 @@ def _add_new_group(group):
"The group with id {} already exist.".format(group.id)
)
_GroupManager.group_map_by_id[group.id] = group
def _is_global_group(group):
return group.id == _GroupManager.global_group_id
def _warn_cur_rank_not_in_group(group):
global_rank = dist.get_rank()
if group and not group.is_member():
warnings.warn(
"Current global rank {} is not in group {}".format(
global_rank, group.name
)
)
return True
return False
def _get_or_throw_group_rank(global_rank, group):
group_rank = group.get_group_rank(global_rank)
assert (
group_rank >= 0
), "The input rank {} can not be found inside the group {}".format(
global_rank, group.name
)
return group_rank
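A minimal sketch (hypothetical, for illustration only) of how a collective wrapper is expected to use the two helpers above, mirroring the membership check and global-to-group rank translation previously done inline in collective.py:
def _example_wrapper(tensor, dst, group=None):  # hypothetical, not part of this commit
    if _warn_cur_rank_not_in_group(group):  # warn and no-op when this rank is outside the group
        return
    group = _get_global_group() if group is None else group
    group_dst = _get_or_throw_group_rank(dst, group)  # map the global rank to the in-group rank
    ...  # issue the actual communication using group_dst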
def is_initialized():
"""
Check whether the distributed environment has been initialized
Returns:
`True` if distributed environment has been initialized, otherwise `False`.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
print(paddle.distributed.is_initialized())
# False
paddle.distributed.init_parallel_env()
print(paddle.distributed.is_initialized())
# True
"""
return _GroupManager.global_group_id in _GroupManager.group_map_by_id
def destroy_process_group(group=None):
"""
Destroy a given group for communication
Args:
group (Group, optional): The group to be destroyed. If group is None, all process groups, including
the default group, will be destroyed and the distributed
environment will be deinitialized.
Returns : None
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
group = dist.new_group([0, 1])
dist.destroy_process_group(group)
print(dist.is_initialized())
# True
dist.destroy_process_group()
print(dist.is_initialized())
# False
"""
group = _get_global_group() if group is None else group
assert (
group.id in _GroupManager.group_map_by_id
), "Destroy group with id {} is invalid.".format(group.id)
if _is_global_group(group):
_GroupManager.group_map_by_id.clear()
else:
del _GroupManager.group_map_by_id[group.id]
def get_group(id=0):
"""
Get group instance by group id.
Args:
id (int): the group id. Default value is 0.
Returns:
Group: the group instance.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
gid = paddle.distributed.new_group([2,4,6])
paddle.distributed.get_group(gid.id)
"""
if id in _GroupManager.group_map_by_id:
return _GroupManager.group_map_by_id[id]
warnings.warn("Group {} is not initialized.".format(id))
return None
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
def recv(tensor, src=0, group=None, sync_op=True):
"""
Receive a tensor from the sender.
Args:
tensor (Tensor): The tensor to receive. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
dist.send(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
dist.recv(data, src=0)
print(data)
# [7, 8, 9] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.recv(
tensor, src=src, group=group, sync_op=sync_op, use_calc_stream=False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
gsrc = src if group is None else group.get_group_rank(src)
ring_id = 0 if group is None else group.id
return paddle._legacy_C_ops.recv_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
src,
'dtype',
tensor.dtype,
'out_shape',
tensor.shape,
)
def irecv(tensor, src=None, group=None):
"""
Receive a tensor from the sender.
Args:
tensor (Tensor): The Tensor to receive. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
src (int): The source rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
task = dist.isend(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
task = dist.irecv(data, src=0)
task.wait()
print(data)
# [7, 8, 9] (2 GPUs)
"""
return recv(tensor, src, group, sync_op=False)
......@@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.fluid.core as core
import paddle.distributed.communication.stream as stream
class ReduceOp:
......@@ -66,12 +68,121 @@ def _get_reduce_op(reduce_op, func_name):
return core.ReduceOp.PRODUCT
else:
if reduce_op == ReduceOp.SUM:
return 'c_allreduce_sum'
return 'c_{}_sum'.format(func_name)
elif reduce_op == ReduceOp.MAX:
return 'c_allreduce_max'
return 'c_{}_max'.format(func_name)
elif reduce_op == ReduceOp.MIN:
return 'c_allreduce_min'
return 'c_{}_min'.format(func_name)
elif reduce_op == ReduceOp.PROD:
return 'c_allreduce_prod'
return 'c_{}_prod'.format(func_name)
else:
return 'c_{}'.format(func_name)
raise ValueError("Unknown reduce_op type for {}.".format(func_name))
def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
"""
Reduce a tensor to the destination from all others. As shown below, each process is started with one GPU and the data of each process is represented
by its group rank. The destination of the reduce operator is GPU0 and the reduce operation is sum. Through the reduce operator,
GPU0 will own the sum of the data from all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/reduce.png
:width: 800
:alt: reduce
:align: center
Args:
tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank id.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
else:
data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
dist.reduce(data, dst=0)
print(data)
# [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.reduce(
tensor,
dst=dst,
op=op,
group=group,
sync_op=sync_op,
use_calc_stream=False,
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
gdst = dst if group is None else group.get_group_rank(dst)
assert gdst >= 0, "dst rank out of group, need global rank"
if op == ReduceOp.SUM:
return paddle._legacy_C_ops.c_reduce_sum(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.MAX:
return paddle._legacy_C_ops.c_reduce_max(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.MIN:
return paddle._legacy_C_ops.c_reduce_min(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
elif op == ReduceOp.PROD:
return paddle._legacy_C_ops.c_reduce_prod(
tensor,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'root_id',
gdst,
)
else:
raise ValueError("Unknown parameter: {}.".format(op))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.reduce import ReduceOp
from paddle.distributed.communication.stream.reduce_scatter import (
_reduce_scatter_base as _reduce_scatter_base_stream,
)
def reduce_scatter(
tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True
):
"""
Reduces, then scatters a list of tensors to all processes in a group
Args:
tensor (Tensor): The output tensor on each rank. The result will overwrite this tensor after communication. Support
float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type.
tensor_list (List[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default.
group (Group, optional): Communicate in which group. If none is given, use the global group as default.
sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data1 = paddle.to_tensor([0, 1])
data2 = paddle.to_tensor([2, 3])
else:
data1 = paddle.to_tensor([4, 5])
data2 = paddle.to_tensor([6, 7])
dist.reduce_scatter(data1, [data1, data2])
print(data1)
# [4, 6] (2 GPUs, out for rank 0)
# [8, 10] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.reduce_scatter(
tensor,
tensor_list,
op=op,
group=group,
sync_op=sync_op,
use_calc_stream=False,
)
def _reduce_scatter_base(
output, input, op=ReduceOp.SUM, group=None, sync_op=True
):
"""
Reduces, then scatters a flattened tensor to all processes in a group.
Args:
output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Async task handle if sync_op is set to False.
None if sync_op is True or if the current rank is not part of the group.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
rank = dist.get_rank()
data = paddle.arange(4) + rank
# [0, 1, 2, 3] (2 GPUs, for rank 0)
# [1, 2, 3, 4] (2 GPUs, for rank 1)
output = paddle.empty(shape=[2], dtype=data.dtype)
dist.collective._reduce_scatter_base(output, data)
print(output)
# [1, 3] (2 GPUs, out for rank 0)
# [5, 7] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return _reduce_scatter_base_stream(
output,
input,
op=op,
group=group,
sync_op=sync_op,
use_calc_stream=False,
)
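# NOTE(editor): a small shape sketch for _reduce_scatter_base, assuming 2 ranks and
# drawn from the docstring example above: the flattened input holds one contiguous
# chunk per rank; each chunk is reduced across ranks and rank i keeps chunk i.
#
#     # input on rank 0: [0, 1, 2, 3]    input on rank 1: [1, 2, 3, 4]
#     # chunk 0 sums to [1, 3] (kept by rank 0); chunk 1 sums to [5, 7] (kept by rank 1)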
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.group import _get_global_group
def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
"""
Scatter a tensor to all participators. As shown below, each process is started with one GPU and the source of the scatter
is GPU0. Through the scatter operator, the data in GPU0 will be evenly distributed to all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/scatter.png
:width: 800
:alt: scatter
:align: center
Args:
tensor (Tensor): The output Tensor. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None.
src (int): The source rank id. Default value is 0.
group (Group, optional): The group instance returned by new_group or None for the global default group.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
None.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data1 = paddle.to_tensor([7, 8, 9])
data2 = paddle.to_tensor([10, 11, 12])
dist.scatter(data1, src=1)
else:
data1 = paddle.to_tensor([1, 2, 3])
data2 = paddle.to_tensor([4, 5, 6])
dist.scatter(data1, tensor_list=[data1, data2], src=1)
print(data1, data2)
# [1, 2, 3] [10, 11, 12] (2 GPUs, out for rank 0)
# [4, 5, 6] [4, 5, 6] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.scatter(tensor, tensor_list, src, group, sync_op)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
rank = _get_global_group().rank if group is None else group.rank
nranks = _get_global_group().nranks if group is None else group.nranks
assert gsrc >= 0, "src rank out of group, need global rank"
if rank != gsrc:
tensor_list = []
for _ in range(nranks):
tensor_list.append(tensor)
temp = paddle.concat(tensor_list, axis=0)
use_calc_stream = sync_op
return framework._legacy_C_ops.c_scatter(
temp,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'nranks',
nranks,
'root',
gsrc,
)
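# NOTE(editor): in the legacy branch above, non-src ranks have no real input, so a
# placeholder list of `nranks` copies of the output tensor is built purely to give
# `paddle.concat` (and the c_scatter op) an input of the expected total size; only the
# tensor_list concatenated on the src rank is actually distributed.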
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
def send(tensor, dst=0, group=None, sync_op=True):
"""
Send a tensor to the receiver.
Args:
tensor (Tensor): The Tensor to send. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank id.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
sync_op (bool, optional): Whether this op is a sync op. The default value is True.
Returns:
Return a task object.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
dist.send(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
dist.recv(data, src=0)
print(data)
# [7, 8, 9] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.send(
tensor, dst=dst, group=group, sync_op=sync_op, use_calc_stream=False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
gdst = dst if group is None else group.get_group_rank(dst)
assert gdst >= 0, "dst rank out of group, need global rank"
ring_id = 0 if group is None else group.id
return paddle._legacy_C_ops.send_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
gdst,
)
def isend(tensor, dst, group=None):
"""
Send a tensor asynchronously.
Args:
tensor (Tensor): The Tensor to send. Its data type
should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
dst (int): The destination rank.
group (Group, optional): The group instance returned by new_group or None for the global default group. Default: None.
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
if dist.get_rank() == 0:
data = paddle.to_tensor([7, 8, 9])
task = dist.isend(data, dst=1)
else:
data = paddle.to_tensor([1, 2, 3])
task = dist.irecv(data, src=0)
task.wait()
print(data)
# [7, 8, 9] (2 GPUs)
"""
return send(tensor, dst, group, sync_op=False)
......@@ -14,8 +14,7 @@
from .all_gather import all_gather
from .all_reduce import all_reduce
from .alltoall import alltoall
from .alltoall_single import alltoall_single
from .all_to_all import alltoall, alltoall_single
from .broadcast import broadcast
from .reduce import reduce
from .reduce_scatter import reduce_scatter
......
......@@ -16,11 +16,14 @@ import paddle.fluid.framework as framework
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp
from paddle.distributed.communication.group import _get_global_group
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
)
def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream):
op_type = _get_reduce_op(op, "all_reduce")
op_type = _get_reduce_op(op, "allreduce")
group = _get_global_group() if group is None else group
if use_calc_stream:
......@@ -50,7 +53,7 @@ def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream):
'all_reduce',
)
op_type = _get_reduce_op(op, "all_reduce")
op_type = _get_reduce_op(op, "allreduce")
ring_id = 0 if group is None else group.id
if not isinstance(ring_id, int):
......@@ -107,10 +110,8 @@ def all_reduce(
out = data.numpy()
# [[5, 7, 9], [5, 7, 9]]
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -122,6 +123,7 @@ def all_reduce(
tensor, op, group, sync_op, use_calc_stream
)
else:
assert group is None, "Group can not be used in static mode for now."
return _all_reduce_in_static_mode(
tensor, op, group, sync_op, use_calc_stream
)
......@@ -14,7 +14,13 @@
import paddle
import paddle.fluid.framework as framework
from paddle.distributed import collective
import paddle.distributed as dist
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
)
def _check_tensor_shape(tensor, shape, nranks=1):
......@@ -34,10 +40,9 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1):
)
def _alltoall_tensor_in_dygraph(
def _all_to_all_tensor_in_dygraph(
out_tensor, in_tensor, group, sync_op, use_calc_stream
):
group = collective._get_default_group() if group is None else group
_check_tensor_shape(out_tensor, in_tensor.shape, group.nranks)
......@@ -53,11 +58,9 @@ def _alltoall_tensor_in_dygraph(
return task
def _alltoall_in_dygraph(
def _all_to_all_in_dygraph(
out_tensor_list, in_tensor_list, group, sync_op, use_calc_stream
):
group = collective._get_default_group() if group is None else group
if len(in_tensor_list) == 0:
raise RuntimeError("The input tensor_list should not be empty.")
......@@ -84,6 +87,59 @@ def _alltoall_in_dygraph(
return task
def _all_to_all_in_static_mode(
out_tensor_or_tensor_list,
in_tensor_or_tensor_list,
group,
sync_op,
use_calc_stream,
):
op_type = 'alltoall'
ring_id = 0 if group is None else group.id
nranks = dist.get_world_size()
helper = layer_helper.LayerHelper(op_type, **locals())
in_tensor = in_tensor_or_tensor_list
if isinstance(in_tensor_or_tensor_list, list):
if len(in_tensor_or_tensor_list) == 0:
raise RuntimeError("The input tensor_list should not be empty.")
in_tensor = paddle.concat(in_tensor_or_tensor_list, axis=0)
out_tensor = out_tensor_or_tensor_list
if isinstance(out_tensor_or_tensor_list, list):
if len(out_tensor_or_tensor_list) != 0:
raise ValueError(
"The 'out_tensor_list' for all_to_all " "must be an empty list."
)
out_tensor = helper.create_variable_for_type_inference(
dtype=in_tensor.dtype
)
data_feeder.check_variable_and_dtype(
in_tensor,
'in_tensor',
['float16', 'float32', 'float64', 'int32', 'int64'],
'all_to_all',
)
helper.append_op(
type=op_type,
inputs={'X': [in_tensor]},
outputs={'Out': [out_tensor]},
attrs={
'ring_id': ring_id,
'use_calc_stream': sync_op,
},
)
# NOTE(liyurui): If the argument `out_tensor_or_tensor_list` is a tensor_list,
# we need to split the result. So we should wait for the result of all_to_all
# before splitting if the communication is not on the calc stream.
if isinstance(out_tensor_or_tensor_list, list):
if not sync_op:
dist.wait(out_tensor, use_calc_stream=False)
out_tensor_or_tensor_list.extend(paddle.split(out_tensor, nranks, 0))
return None
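# NOTE(editor): a shape sketch for the static-mode path above, assuming 2 ranks and a
# tensor-list input: per-rank tensors are concatenated along dim 0 before the alltoall
# op runs, and the result is split back into `nranks` pieces afterwards.
#
#     # in_tensor_or_tensor_list = [t0, t1], each of shape [2, 3]
#     # concat -> [4, 3] -> alltoall -> [4, 3] -> split -> two tensors of shape [2, 3]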
def alltoall(
out_tensor_or_tensor_list,
in_tensor_or_tensor_list,
......@@ -109,9 +165,6 @@ def alltoall(
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode now.
Examples:
.. code-block:: python
......@@ -133,10 +186,8 @@ def alltoall(
# [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
# [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -149,10 +200,11 @@ def alltoall(
raise RuntimeError("The input should be specified.")
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list)
in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list)
if out_is_tensor and in_is_tensor:
return _alltoall_tensor_in_dygraph(
return _all_to_all_tensor_in_dygraph(
out_tensor_or_tensor_list,
in_tensor_or_tensor_list,
group,
......@@ -160,7 +212,7 @@ def alltoall(
use_calc_stream,
)
elif not out_is_tensor and not in_is_tensor:
return _alltoall_in_dygraph(
return _all_to_all_in_dygraph(
out_tensor_or_tensor_list,
in_tensor_or_tensor_list,
group,
......@@ -171,7 +223,137 @@ def alltoall(
raise RuntimeError(
"The output and input should be both tensor or tensor list."
)
else:
assert group is None, "Group can not be used in static mode for now."
return _all_to_all_in_static_mode(
out_tensor_or_tensor_list,
in_tensor_or_tensor_list,
group,
sync_op,
use_calc_stream,
)
def _alltoall_single_in_dygraph(
out_tensor,
in_tensor,
out_split_sizes,
in_split_sizes,
group,
sync_op,
use_calc_stream,
):
if out_split_sizes is None:
out_split_sizes = []
if in_split_sizes is None:
in_split_sizes = []
if use_calc_stream:
return group.process_group.alltoall_single_on_calc_stream(
in_tensor, out_tensor, in_split_sizes, out_split_sizes
)
task = group.process_group.alltoall_single(
in_tensor, out_tensor, in_split_sizes, out_split_sizes, sync_op
)
if sync_op:
task.wait()
return task
def alltoall_single(
out_tensor,
in_tensor,
out_split_sizes=None,
in_split_sizes=None,
group=None,
sync_op=True,
use_calc_stream=False,
):
"""
Split the input tensor and scatter the split pieces to the out tensor across devices.
Args:
out_tensor(Tensor): The output tensor. Its data type should be the same as the input.
in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible
by the group size and out_tensor will be gathered evenly from all participators. If none is given, use an empty list as default.
in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible
by the group size and in_tensor will be scattered evenly to all participators. If none is given, use an empty list as default.
group (Group, optional): Communicate in which group. If none is given, use the global group as default.
sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
use_calc_stream (bool, optional): Indicate whether the communication is done on the calculation stream. If none is given, use false as default. This
option is designed for high performance demands; do not turn it on unless you clearly understand its meaning.
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode now.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
local_rank = dist.get_rank()
# case 1
output = paddle.empty([2], dtype="int64")
if local_rank == 0:
data = paddle.to_tensor([0, 1])
else:
data = paddle.to_tensor([2, 3])
task = dist.stream.alltoall_single(output, data, sync_op=False)
task.wait()
out = output.numpy()
# [0, 2] (2 GPUs, out for rank 0)
# [1, 3] (2 GPUs, out for rank 1)
# case 2
size = dist.get_world_size()
output = paddle.empty([(local_rank + 1) * size, size], dtype='float32')
if local_rank == 0:
data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]])
else:
data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]])
out_split_sizes = [local_rank + 1 for i in range(size)]
in_split_sizes = [i + 1 for i in range(size)]
task = dist.stream.alltoall_single(output,
data,
out_split_sizes,
in_split_sizes,
sync_op=False)
task.wait()
out = output.numpy()
# [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0)
# [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1)
"""
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
"use_calc_stream can only be true in sync op behavior."
)
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
return _alltoall_single_in_dygraph(
out_tensor,
in_tensor,
out_split_sizes,
in_split_sizes,
group,
sync_op,
use_calc_stream,
)
raise RuntimeError(
"paddle.distributed.stream.alltoall is only supported in dygraph mode now."
"paddle.distributed.stream.alltoall_single is only supported in dygraph mode now."
)
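# NOTE(editor): a brief sketch of the split-size contract for alltoall_single, assuming
# 2 ranks: in_split_sizes[j] rows of in_tensor are sent to rank j and out_split_sizes[i]
# rows are received from rank i, so on every rank
#
#     # sum(in_split_sizes)  == in_tensor.shape[0]
#     # sum(out_split_sizes) == out_tensor.shape[0]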
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.distributed import collective
def _alltoall_single_in_dygraph(
out_tensor,
in_tensor,
out_split_sizes,
in_split_sizes,
group,
sync_op,
use_calc_stream,
):
group = collective._get_default_group() if group is None else group
if out_split_sizes is None:
out_split_sizes = []
if in_split_sizes is None:
in_split_sizes = []
if use_calc_stream:
return group.process_group.alltoall_single_on_calc_stream(
in_tensor, out_tensor, in_split_sizes, out_split_sizes
)
task = group.process_group.alltoall_single(
in_tensor, out_tensor, in_split_sizes, out_split_sizes, sync_op
)
if sync_op:
task.wait()
return task
def alltoall_single(
out_tensor,
in_tensor,
out_split_sizes=None,
in_split_sizes=None,
group=None,
sync_op=True,
use_calc_stream=False,
):
"""
Split the input tensor and scatter the split pieces to the out tensor across devices.
Args:
out_tensor(Tensor): The output tensor. Its data type should be the same as the input.
in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible
by the group size and out_tensor will be gathered evenly from all participators. If none is given, use an empty list as default.
in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible
by the group size and in_tensor will be scattered evenly to all participators. If none is given, use an empty list as default.
group (Group, optional): Communicate in which group. If none is given, use the global group as default.
sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
use_calc_stream (bool, optional): Indicate whether the communication is done on the calculation stream. If none is given, use false as default. This
option is designed for high performance demands; do not turn it on unless you clearly understand its meaning.
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode now.
Examples:
.. code-block:: python
# required: distributed
import paddle
import paddle.distributed as dist
dist.init_parallel_env()
local_rank = dist.get_rank()
# case 1
output = paddle.empty([2], dtype="int64")
if local_rank == 0:
data = paddle.to_tensor([0, 1])
else:
data = paddle.to_tensor([2, 3])
task = dist.stream.alltoall_single(output, data, sync_op=False)
task.wait()
out = output.numpy()
# [0, 2] (2 GPUs, out for rank 0)
# [1, 3] (2 GPUs, out for rank 1)
# case 2
size = dist.get_world_size()
output = paddle.empty([(local_rank + 1) * size, size], dtype='float32')
if local_rank == 0:
data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]])
else:
data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]])
out_split_sizes = [local_rank + 1 for i in range(size)]
in_split_sizes = [i + 1 for i in range(size)]
task = dist.stream.alltoall_single(output,
data,
out_split_sizes,
in_split_sizes,
sync_op=False)
task.wait()
out = output.numpy()
# [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0)
# [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if not sync_op and use_calc_stream:
raise RuntimeError(
"use_calc_stream can only be true in sync op behavior."
)
if framework.in_dygraph_mode():
return _alltoall_single_in_dygraph(
out_tensor,
in_tensor,
out_split_sizes,
in_split_sizes,
group,
sync_op,
use_calc_stream,
)
raise RuntimeError(
"paddle.distributed.stream.alltoall_single is only supported in dygraph mode now."
)
......@@ -13,29 +13,74 @@
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.distributed import collective
def _broadcast_in_dygraph(tensor, src, group, sync_op, use_calc_stream):
group = collective._get_default_group() if group is None else group
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
_get_or_throw_group_rank,
)
def _broadcast_in_dygraph(
tensor, src_rank_in_group, group, sync_op, use_calc_stream
):
if use_calc_stream:
return group.process_group.broadcast_on_calc_stream(tensor, src)
return group.process_group.broadcast_on_calc_stream(
tensor, src_rank_in_group
)
task = group.process_group.broadcast(tensor, src, sync_op)
task = group.process_group.broadcast(tensor, src_rank_in_group, sync_op)
if sync_op:
task.wait()
return task
def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
def _broadcast_in_static_mode(
tensor, src_rank_in_group, group, sync_op, use_calc_stream
):
data_feeder.check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'broadcast',
)
op_type = 'c_broadcast'
helper = layer_helper.LayerHelper(op_type, **locals())
ring_id = 0 if group is None else group.id
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
outputs={'Out': [tensor]},
attrs={
'root': src_rank_in_group,
'use_calc_stream': sync_op,
'ring_id': ring_id,
},
)
return None
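# NOTE(editor): the static-mode branch above appends the same c_broadcast op that the
# legacy dygraph path invokes directly via paddle._legacy_C_ops.c_broadcast elsewhere in
# this change; `root` is the source rank inside the group and `ring_id` selects the
# communication ring (0 for the global group).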
def broadcast(tensor, src, group=None, sync_op=True, use_calc_stream=False):
"""
Broadcast a tensor to all devices.
Args:
tensor (Tensor): The tensor to broadcast. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type.
src (int, optional): Rank of the source device. If none is given, use `0` as default.
src (int): Rank of the source device.
group (Group, optional): Communicate in which group. If none is given, use the global group as default.
sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This
......@@ -65,10 +110,8 @@ def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
out = data.numpy()
# [[1, 2, 3], [1, 2, 3]] (2 GPUs)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -76,10 +119,14 @@ def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
)
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
return _broadcast_in_dygraph(
tensor, src, group, sync_op, use_calc_stream
tensor, src_rank_in_group, group, sync_op, use_calc_stream
)
raise RuntimeError(
"paddle.distributed.stream.broadcast is only supported in dygraph mode now."
else:
assert group is None, "Group can not be used in static mode for now."
return _broadcast_in_static_mode(
tensor, src, group, sync_op, use_calc_stream
)
......@@ -13,21 +13,56 @@
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.distributed import collective
def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream):
group = collective._get_default_group() if group is None else group
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
_get_or_throw_group_rank,
)
def _recv_in_dygraph(
tensor, src_rank_in_group, group, sync_op, use_calc_stream
):
if use_calc_stream:
return group.process_group.recv_on_calc_stream(tensor, src)
return group.process_group.recv_on_calc_stream(
tensor, src_rank_in_group
)
task = group.process_group.recv(tensor, src, sync_op)
task = group.process_group.recv(tensor, src_rank_in_group, sync_op)
if sync_op:
task.wait()
return task
def _recv_in_static_mode(
tensor, src_rank_in_group, group, sync_op, use_calc_stream
):
op_type = 'recv_v2'
data_feeder.check_variable_and_dtype(
tensor,
'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'],
'recv',
)
ring_id = 0 if group is None else group.id
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'peer': src_rank_in_group,
'out_shape': tensor.shape,
'dtype': tensor.dtype,
'use_calc_stream': sync_op,
},
)
return None
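# NOTE(editor): unlike send, the receiving side has no input variable, so the recv_v2 op
# above carries `out_shape` and `dtype` attributes; the receiver shapes the output
# from the metadata of the tensor passed in by the caller.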
def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
"""
......@@ -44,9 +79,6 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode now.
Examples:
.. code-block:: python
......@@ -66,10 +98,8 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
out = data.numpy()
# [[4, 5, 6], [4, 5, 6]] (2 GPUs)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -77,8 +107,14 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
)
if framework.in_dygraph_mode():
return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream)
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
raise RuntimeError(
"paddle.distributed.stream.recv is only supported in dygraph mode now."
return _recv_in_dygraph(
tensor, src_rank_in_group, group, sync_op, use_calc_stream
)
else:
assert group is None, "Group can not be used in static mode for now."
return _recv_in_static_mode(
tensor, src, group, sync_op, use_calc_stream
)
......@@ -13,23 +13,70 @@
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.distributed.communication.group import _get_global_group
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
_get_or_throw_group_rank,
)
from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp
def _reduce_in_dygraph(tensor, dst, op, group, sync_op, use_calc_stream):
def _reduce_in_dygraph(
tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
):
op_type = _get_reduce_op(op, "reduce")
group = _get_global_group() if group is None else group
if use_calc_stream:
return group.process_group.reduce_on_calc_stream(tensor, dst, op_type)
return group.process_group.reduce_on_calc_stream(
tensor, dst_rank_in_group, op_type
)
task = group.process_group.reduce(tensor, dst, op_type, sync_op)
task = group.process_group.reduce(
tensor, dst_rank_in_group, op_type, sync_op
)
if sync_op:
task.wait()
return task
def _reduce_in_static_mode(
tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
):
data_feeder.check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'reduce',
)
op_type = _get_reduce_op(op, "reduce")
ring_id = 0 if group is None else group.id
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'use_calc_stream': sync_op,
'root_id': dst_rank_in_group,
},
)
return None
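# NOTE(editor): the op type appended above comes from _get_reduce_op (e.g. 'c_reduce_sum'
# for ReduceOp.SUM) and `root_id` is the destination rank that keeps the reduced result,
# mirroring the legacy paddle._legacy_C_ops.c_reduce_* calls shown earlier in this change.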
def reduce(
tensor,
dst=0,
......@@ -77,10 +124,8 @@ def reduce(
# [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -88,10 +133,13 @@ def reduce(
)
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
dst_rank_in_group = _get_or_throw_group_rank(dst, group)
return _reduce_in_dygraph(
tensor, dst, op, group, sync_op, use_calc_stream
tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
)
raise RuntimeError(
"paddle.distributed.stream.reduce is only supported in dygraph mode now."
else:
assert group is None, "Group can not be used in static mode for now."
return _reduce_in_static_mode(
tensor, dst, op, group, sync_op, use_calc_stream
)
......@@ -14,7 +14,10 @@
import paddle
import paddle.fluid.framework as framework
from paddle.distributed.communication.group import _get_global_group
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
)
from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp
......@@ -104,7 +107,7 @@ def reduce_scatter(
Args:
tensor (Tensor): The output tensor on each rank. The result will overwrite this tensor after communication. Support
float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type.
tensor_list (List[Tensor]]): The input to scatter.
tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter.
If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default.
group (Group, optional): Communicate in which group. If none is given, use the global group as default.
......@@ -137,10 +140,8 @@ def reduce_scatter(
# [4, 6] (2 GPUs, out for rank 0)
# [8, 10] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -220,10 +221,8 @@ def _reduce_scatter_base(
# [1, 2, 3] (2 GPUs, out for rank 0)
# [4, 5, 6] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......
......@@ -12,10 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import paddle
import paddle.distributed as dist
import paddle.fluid.framework as framework
from paddle.distributed import collective
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
_get_or_throw_group_rank,
)
def _check_tensor_shape(tensor, shape, nranks=1):
......@@ -38,26 +45,19 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1):
def _scatter_tensor_in_dygraph(
out_tensor, in_tensor, src, group, sync_op, use_calc_stream
out_tensor, in_tensor, src_rank_in_group, group, sync_op, use_calc_stream
):
group = collective._get_default_group() if group is None else group
src_rank = group.get_group_rank(src)
if src_rank == -1:
raise RuntimeError("Src rank out of group.")
nranks = group.nranks
rank = dist.get_rank()
if rank == src_rank:
if group.rank == src_rank_in_group:
_check_tensor_shape(out_tensor, in_tensor.shape, nranks)
if use_calc_stream:
return group.process_group.scatter_tensor_on_calc_stream(
in_tensor, out_tensor, src
in_tensor, out_tensor, src_rank_in_group
)
task = group.process_group.scatter_tensor(
in_tensor, out_tensor, src, sync_op
in_tensor, out_tensor, src_rank_in_group, sync_op
)
if sync_op:
task.wait()
......@@ -66,17 +66,10 @@ def _scatter_tensor_in_dygraph(
def _scatter_in_dygraph(
tensor, tensor_list, src, group, sync_op, use_calc_stream
tensor, tensor_list, src_rank_in_group, group, sync_op, use_calc_stream
):
group = collective._get_default_group() if group is None else group
src_rank = group.get_group_rank(src)
if src_rank == -1:
raise RuntimeError("Src rank out of group.")
nranks = group.nranks
rank = dist.get_rank()
if rank == src_rank:
if group.rank == src_rank_in_group:
if len(tensor_list) == 0:
raise RuntimeError(
"The tensor_list should not be empty on src rank."
......@@ -87,16 +80,76 @@ def _scatter_in_dygraph(
if use_calc_stream:
return group.process_group.scatter_on_calc_stream(
tensor_list, tensor, src
tensor_list, tensor, src_rank_in_group
)
task = group.process_group.scatter(tensor_list, tensor, src, sync_op)
task = group.process_group.scatter(
tensor_list, tensor, src_rank_in_group, sync_op
)
if sync_op:
task.wait()
return task
def _scatter_in_static_mode(
tensor,
tensor_or_tensor_list,
src_rank_in_group,
group,
sync_op,
use_calc_stream,
):
nranks = dist.get_world_size() if group is None else group.nranks
rank = dist.get_rank()
input_tensor = tensor_or_tensor_list
if isinstance(tensor_or_tensor_list, list):
tensor_list = tensor_or_tensor_list
if rank == src_rank_in_group:
if len(tensor_list) == 0:
raise RuntimeError(
"The tensor_list should not be empty on src rank."
)
else:
tensor_list = [tensor for _ in range(nranks)]
input_tensor = paddle.concat(tensor_list, axis=0)
ring_id = 0 if group is None else group.id
data_feeder.check_variable_and_dtype(
tensor,
'tensor',
[
'float16',
'float32',
'float64',
'int32',
'int64',
'int8',
'uint8',
'bool',
],
'scatter',
)
op_type = 'c_scatter'
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [input_tensor]},
outputs={'Out': [tensor]},
attrs={
'ring_id': ring_id,
'root': src_rank_in_group,
'use_calc_stream': sync_op,
'nranks': nranks,
},
)
return None
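# NOTE(editor): as in the legacy scatter path earlier in this change, non-src ranks build
# a placeholder list of `nranks` copies of the output tensor so that the concatenated
# input fed to c_scatter has the expected total size; only the src rank's data is used.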
def scatter(
tensor,
tensor_or_tensor_list=None,
......@@ -146,25 +199,34 @@ def scatter(
# [1, 2, 3] (2 GPUs, out for rank 0)
# [4, 5, 6] (2 GPUs, out for rank 1)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
"use_calc_stream can only be true in sync op behavior."
)
if tensor_or_tensor_list is None:
raise RuntimeError("The input should be specified.")
# NOTE(liyurui): Only the source rank needs to specify the tensor_or_tensor_list argument.
# Other ranks which pass this argument in will be ignored with a warning.
# If a tensor_list is passed in, we need to concat it into a tensor before invoking the C++ API.
# If a tensor is passed in, concat is not needed.
# The type passed in by non-src ranks is meaningless, for it will be ignored.
if src != dist.get_rank():
if tensor_or_tensor_list is not None:
warnings.warn(
"Specific `tensor_or_tensor_list` is meaningless for rank which is not src."
)
tensor_or_tensor_list = []
if framework.in_dygraph_mode():
group = _get_global_group() if group is None else group
src_rank_in_group = _get_or_throw_group_rank(src, group)
if paddle.is_tensor(tensor_or_tensor_list):
return _scatter_tensor_in_dygraph(
tensor,
tensor_or_tensor_list,
src,
src_rank_in_group,
group,
sync_op,
use_calc_stream,
......@@ -173,12 +235,19 @@ def scatter(
return _scatter_in_dygraph(
tensor,
tensor_or_tensor_list,
src,
src_rank_in_group,
group,
sync_op,
use_calc_stream,
)
else:
assert group is None, "Group can not be used in static mode for now."
raise RuntimeError(
"paddle.distributed.stream.scatter is only supported in dygraph mode now."
return _scatter_in_static_mode(
tensor,
tensor_or_tensor_list,
src,
group,
sync_op,
use_calc_stream,
)
......@@ -13,21 +13,55 @@
# limitations under the License.
import paddle.fluid.framework as framework
from paddle.distributed import collective
def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream):
group = collective._get_default_group() if group is None else group
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
_get_global_group,
_warn_cur_rank_not_in_group,
_get_or_throw_group_rank,
)
def _send_in_dygraph(
tensor, dst_rank_in_group, group, sync_op, use_calc_stream
):
if use_calc_stream:
return group.process_group.send_on_calc_stream(tensor, dst)
return group.process_group.send_on_calc_stream(
tensor, dst_rank_in_group
)
task = group.process_group.send(tensor, dst, sync_op)
task = group.process_group.send(tensor, dst_rank_in_group, sync_op)
if sync_op:
task.wait()
return task
def _send_in_static_mode(
tensor, dst_rank_in_group, group, sync_op, use_calc_stream
):
op_type = 'send_v2'
data_feeder.check_variable_and_dtype(
tensor,
'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'],
'send',
)
ring_id = 0 if group is None else group.id
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
type=op_type,
inputs={'X': [tensor]},
attrs={
'ring_id': ring_id,
'peer': dst_rank_in_group,
'use_calc_stream': sync_op,
},
)
return None
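# NOTE(editor): send_v2 has no output variable; `peer` names the receiving rank, and the
# matching recv_v2 op (see _recv_in_static_mode in this change) reconstructs the tensor
# on the other side from its own `out_shape`/`dtype` attributes.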
def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
"""
......@@ -44,9 +78,6 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
Returns:
Return a task object.
Warning:
This API only supports the dygraph mode now.
Examples:
.. code-block:: python
......@@ -66,10 +97,8 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
out = data.numpy()
# [[4, 5, 6], [4, 5, 6]] (2 GPUs)
"""
if group is not None and not group.is_member():
raise RuntimeError(
"The group should not be None and all ranks which invoke this operation should be the member of this group."
)
if _warn_cur_rank_not_in_group(group):
return
if not sync_op and use_calc_stream:
raise RuntimeError(
......@@ -77,8 +106,14 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
)
if framework.in_dygraph_mode():
return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream)
group = _get_global_group() if group is None else group
dst_rank_in_group = _get_or_throw_group_rank(dst, group)
raise RuntimeError(
"paddle.distributed.stream.send is only supported in dygraph mode now."
return _send_in_dygraph(
tensor, dst_rank_in_group, group, sync_op, use_calc_stream
)
else:
assert group is None, "Group can not be used in static mode for now."
return _send_in_static_mode(
tensor, dst, group, sync_op, use_calc_stream
)
......@@ -22,7 +22,7 @@ from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.dygraph import layers
from paddle.distributed import collective
from ....communication.reduce import ReduceOp
from ....communication.reduce import ReduceOp, _get_reduce_op
from paddle.fluid.data_feeder import check_dtype
import paddle.fluid.dygraph_utils as dygraph_utils
......@@ -61,7 +61,7 @@ def _c_identity(tensor, group=None):
@staticmethod
def backward(ctx, dy):
op_type = collective._get_reduce_op(ReduceOp.SUM, "_c_identity")
op_type = _get_reduce_op(ReduceOp.SUM, "_c_identity")
group.process_group.allreduce_on_calc_stream(dy, op_type)
return dy
......@@ -254,7 +254,7 @@ def _mp_allreduce(
ctx.ring_id = group.id
if use_calc_stream:
op_type = collective._get_reduce_op(op, "_mp_allreduce")
op_type = _get_reduce_op(op, "_mp_allreduce")
group.process_group.allreduce_on_calc_stream(
tensor, op_type
)
......
......@@ -27,13 +27,13 @@ import numpy as np
from collections import OrderedDict
import paddle
import paddle.distributed as dist
from paddle.fluid import core
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.distributed.collective import (
_get_global_group,
new_group,
broadcast,
wait,
)
......@@ -169,7 +169,7 @@ class ShardingOptimizerStage2(Optimizer):
"""
for p in self._local_params:
broadcast(
dist.broadcast(
p, src=self._global_root_rank, group=self.group, sync_op=True
)
......@@ -456,7 +456,7 @@ class ShardingOptimizerStage2(Optimizer):
# Exchange all the shards with the other ranks
for dtype_per_rank in self.param_storages.values():
for dst_rank, internal_storage in dtype_per_rank.items():
broadcast(
dist.broadcast(
tensor=internal_storage.buffer,
src=self.group.ranks[dst_rank],
group=self.group,
......
......@@ -28,6 +28,7 @@ import warnings
from collections import OrderedDict
import paddle
import paddle.distributed as dist
from paddle.fluid import core
from paddle.optimizer import Optimizer
from paddle.fluid.clip import ClipGradByGlobalNorm
......@@ -38,7 +39,6 @@ HybridParallelClipGrad = (
)
from paddle.distributed.collective import (
_get_global_group,
broadcast,
new_group,
)
......@@ -206,12 +206,12 @@ class GroupShardedOptimizerStage2(Optimizer):
"""
for p in self._local_params:
broadcast(
dist.broadcast(
p, src=self._global_root_rank, group=self._group, sync_op=True
)
if self._dp_group:
broadcast(
dist.broadcast(
p,
src=self._dp_group.ranks[0],
group=self._dp_group,
......@@ -562,7 +562,7 @@ class GroupShardedOptimizerStage2(Optimizer):
else:
for dtype_per_rank in self.param_storages.values():
for dst_rank, internal_storage in dtype_per_rank.items():
broadcast(
dist.broadcast(
tensor=internal_storage.buffer,
src=self._group.ranks[dst_rank],
group=self._group,
......@@ -590,7 +590,7 @@ class GroupShardedOptimizerStage2(Optimizer):
if x.trainable:
group = self._broadcast_groups[group_idx]
group_idx = (group_idx + 1) % self._number_of_broadcast_groups
task = broadcast(
task = dist.broadcast(
tensor=x,
src=group.ranks[self._param2rank[x.name]],
group=group,
......
......@@ -27,6 +27,7 @@ from functools import reduce
from types import MethodType
import paddle
import paddle.distributed as dist
from paddle import nn
from paddle.distributed import collective
from paddle.distributed.utils.log_utils import get_logger
......@@ -324,12 +325,12 @@ class GroupShardedStage2(nn.Layer):
"""
for buffer in self._layer.buffers(include_sublayers=True):
collective.broadcast(
dist.broadcast(
buffer, self._global_root_rank, self._group, sync_op=True
)
if self._dp_group and self._dp_group.nranks > 1:
collective.broadcast(
dist.broadcast(
buffer,
self._dp_group.ranks[0],
self._dp_group,
......@@ -402,7 +403,7 @@ class GroupShardedStage2(nn.Layer):
# Synchronize the reduced parameter gradient asynchronously
self._sharding_optimizers[0]._update_task(
collective.reduce(
dist.reduce(
tensor=param.grad,
dst=self._group.ranks[dst_rank],
group=self._group,
......@@ -415,7 +416,7 @@ class GroupShardedStage2(nn.Layer):
not self._reduce_overlap
), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.'
# TODO(wuhuachao):after the new communication lib upgrading, overlapping the comm of dp + stage2.
collective.all_reduce(
dist.all_reduce(
tensor=param.grad,
group=self._dp_group,
sync_op=True,
......@@ -469,7 +470,7 @@ class GroupShardedStage2(nn.Layer):
grad_storage.sent = True
# Synchronize the reduced parameter gradient asynchronously
self._sharding_optimizers[0]._update_task(
collective.reduce(
dist.reduce(
tensor=grad_storage.buffer,
dst=self._group.ranks[grad_storage.destination],
group=self._group,
......@@ -482,7 +483,7 @@ class GroupShardedStage2(nn.Layer):
not self._reduce_overlap
), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.'
# TODO(wuhuachao):after the new communication lib upgrading, overlapping the comm of dp + stage2.
collective.all_reduce(
dist.all_reduce(
tensor=grad_storage.buffer,
group=self._dp_group,
sync_op=True,
......
......@@ -18,6 +18,7 @@ from types import MethodType
from collections import OrderedDict
import paddle
import paddle.distributed as dist
from paddle import nn
from paddle.autograd import PyLayer
import paddle.fluid.core as core
......@@ -196,7 +197,7 @@ class GroupShardedStage3(nn.Layer):
"""
for p in self._layer.parameters():
collective.broadcast(
dist.broadcast(
p, src=self._global_root_rank, group=self._group, sync_op=True
)
......@@ -493,7 +494,7 @@ class GroupShardedStage3(nn.Layer):
"""
for buffer in self._layer.buffers(include_sublayers=True):
collective.broadcast(
dist.broadcast(
buffer, self._global_root_rank, self._group, sync_op=True
)
......@@ -536,7 +537,7 @@ class GroupShardedStage3(nn.Layer):
# 2.Handle unslice param
for grad_storage in self._grad_storages.values():
grad_storage.buffer.scale_(scale=self._world_size_scaling)
collective.all_reduce(tensor=grad_storage.buffer, group=self._group)
dist.all_reduce(tensor=grad_storage.buffer, group=self._group)
if self._offload:
for param in list(self._unslice_params):
param._clear_data()
......@@ -600,7 +601,7 @@ class GroupShardedStage3(nn.Layer):
if param.name in self._task_flow.full_grad.keys():
full_grad = self._task_flow.full_grad[param.name]
# Only support sync allreduce current rank's layer now
collective.all_reduce(tensor=full_grad, group=self._group)
dist.all_reduce(tensor=full_grad, group=self._group)
start, end = self._param2buffer[param.name][self._rank]
if param.bw_storage is None:
......
......@@ -30,8 +30,9 @@ from collections import deque
from types import MethodType
import paddle
import paddle.distributed as dist
from paddle import nn
from paddle.distributed import collective as dist
from paddle.distributed import collective as collective
from paddle.distributed.collective import _get_global_group
from ...utils.internal_storage import GradStorage
......@@ -92,7 +93,7 @@ class ShardingStage2(nn.Layer):
# Communication related attributes
self._group = (
dist.new_group(_get_global_group().ranks)
collective.new_group(_get_global_group().ranks)
if group is None
else group
)
......@@ -317,7 +318,7 @@ class ShardingStage2(nn.Layer):
buffer, self._global_root_rank, self._group, sync_op=True
)
# Multi stream operation will be supported later
dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
collective.wait(tensor=buffer, group=self._group, use_calc_stream=True)
def __getattr__(self, name):
"""Forward missing attributes to wrapped layer."""
......@@ -381,7 +382,7 @@ class ShardingStage2(nn.Layer):
)
# Multi stream operation will be supported later
dist.wait(
collective.wait(
tensor=param.grad,
group=self._group,
use_calc_stream=True,
......@@ -447,7 +448,7 @@ class ShardingStage2(nn.Layer):
)
# Multi stream operation will be supported later
dist.wait(
collective.wait(
tensor=grad_storage.buffer,
group=self._group,
use_calc_stream=True,
......
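In the legacy ShardingStage2 file the aliasing is reversed: dist now refers to paddle.distributed, while group creation and stream waiting stay on paddle.distributed.collective. A small sketch of that split, with _get_global_group() taken from the import shown above and an illustrative tensor:

    import paddle
    import paddle.distributed as dist
    from paddle.distributed import collective
    from paddle.distributed.collective import _get_global_group

    dist.init_parallel_env()

    # Build a group over the same ranks as the default global group.
    group = collective.new_group(_get_global_group().ranks)

    # Public op through paddle.distributed, internal wait through collective.
    buffer = paddle.ones([4], dtype='float32')
    dist.broadcast(buffer, src=0, group=group, sync_op=True)
    collective.wait(tensor=buffer, group=group, use_calc_stream=True)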
......@@ -18,12 +18,13 @@ from types import MethodType
from collections import OrderedDict
import paddle
import paddle.distributed as dist
from paddle import nn
from paddle.autograd import PyLayer
import paddle.fluid.core as core
from paddle.fluid.framework import ParamBase
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.distributed import collective as dist
from paddle.distributed import collective
from paddle.distributed.collective import _get_global_group
from .sharding_utils import Type, ShardingClipGrad, device_guard
......@@ -101,7 +102,7 @@ class ShardingStage3(nn.Layer):
# Communication group establishment
self._group = (
dist.new_group(_get_global_group().ranks)
collective.new_group(_get_global_group().ranks)
if group is None
else group
)
......@@ -183,7 +184,7 @@ class ShardingStage3(nn.Layer):
)
# Multi stream operation will be supported later
dist.wait(tensor=p, group=self._group, use_calc_stream=True)
collective.wait(tensor=p, group=self._group, use_calc_stream=True)
def _clear_gradients(self):
assert len(self._trainable_params.keys()) > 0
......@@ -484,7 +485,7 @@ class ShardingStage3(nn.Layer):
buffer, self._global_root_rank, self._group, sync_op=True
)
# Multi stream operation will be supported later
dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
collective.wait(tensor=buffer, group=self._group, use_calc_stream=True)
def __getattr__(self, name):
"""Forward missing attributes to wrapped layer."""
......@@ -528,7 +529,7 @@ class ShardingStage3(nn.Layer):
dist.all_reduce(
tensor=grad_storage.buffer, group=self._group, sync_op=True
)
dist.wait(
collective.wait(
tensor=grad_storage.buffer,
group=self._group,
use_calc_stream=True,
......@@ -600,7 +601,7 @@ class ShardingStage3(nn.Layer):
dist.all_reduce(
tensor=full_grad, group=self._group, sync_op=True
)
dist.wait(
collective.wait(
tensor=full_grad, group=self._group, use_calc_stream=True
)
......@@ -945,7 +946,7 @@ def _allgather_buffer(
# Allgather current layer in the 1st step synchronously
if sync_wait:
with paddle.amp.auto_cast(enable=False):
dist.wait(
collective.wait(
tensor=full_param,
group=group,
use_calc_stream=use_calc_stream,
......
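The ShardingStage3 hunks repeat one pattern for gradient reduction: a synchronous all_reduce on the communication stream followed by collective.wait to block the calculation stream. Sketched in isolation, with an illustrative tensor and the default group standing in for self._group:

    import paddle
    import paddle.distributed as dist
    from paddle.distributed import collective

    dist.init_parallel_env()

    full_grad = paddle.ones([4], dtype='float32')

    # Reduce on the communication stream ...
    dist.all_reduce(tensor=full_grad, group=None, sync_op=True)
    # ... then make the calculation stream wait for the result.
    collective.wait(tensor=full_grad, group=None, use_calc_stream=True)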
......@@ -17,6 +17,7 @@ import unittest
import paddle
import numpy as np
import paddle.distributed as dist
from paddle.distributed.communication.reduce_scatter import _reduce_scatter_base
class TestCollectiveReduceScatter(unittest.TestCase):
......@@ -75,9 +76,7 @@ class TestCollectiveReduceScatter(unittest.TestCase):
# [1, 2, 3, 4] # Rank-1
output = paddle.empty(shape=[2], dtype=input.dtype)
task = paddle.distributed.collective._reduce_scatter_base(
output, input, sync_op=False
)
task = _reduce_scatter_base(output, input, sync_op=False)
task.wait()
......
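The test now imports _reduce_scatter_base from paddle.distributed.communication.reduce_scatter instead of reaching into paddle.distributed.collective. A sketch of the asynchronous usage it exercises, assuming a two-rank job so an input of length 4 scatters into shards of length 2:

    import paddle
    import paddle.distributed as dist
    from paddle.distributed.communication.reduce_scatter import _reduce_scatter_base

    dist.init_parallel_env()

    # With two ranks, each rank receives half of the element-wise sum.
    input = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
    output = paddle.empty(shape=[2], dtype=input.dtype)

    task = _reduce_scatter_base(output, input, sync_op=False)
    task.wait()  # block until the asynchronous reduce_scatter has finished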
......@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.distributed as dist
from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import core, layers
from paddle.distributed import collective
class ClipGradForMOEByGlobalNorm(ClipGradBase):
......@@ -185,9 +185,9 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
moe_params_grads, sum_dtype
)
if global_norm_var_moe is not None:
collective.all_reduce(
dist.all_reduce(
global_norm_var_moe,
op=collective.ReduceOp.SUM,
op=dist.ReduceOp.SUM,
group=self.moe_group,
)
......
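Finally, the MOE gradient-clipping hunk sums the expert-parallel contribution to the squared global norm through the public API. A hedged stand-alone version, with an illustrative tensor in place of global_norm_var_moe and the default group in place of self.moe_group:

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()

    # Illustrative per-rank contribution to the squared global norm.
    global_norm_var_moe = paddle.to_tensor([1.0])

    dist.all_reduce(global_norm_var_moe, op=dist.ReduceOp.SUM, group=None)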