Commit 99504cbb (unverified)
Authored on Nov 04, 2022 by LiYuRio; committed via GitHub on Nov 04, 2022
move broadcast, reduce, send, recv, reduce_scatter, scatter, alltoall (#47255)
Parent: ef67c8a8
Showing 34 changed files with 1,960 additions and 1,586 deletions (+1960 -1586)
paddle/fluid/distributed/collective/ProcessGroupGloo.cc  +24 -0
paddle/fluid/distributed/collective/ProcessGroupGloo.h  +18 -0
python/paddle/distributed/__init__.py  +20 -19
python/paddle/distributed/collective.py  +1 -1259
python/paddle/distributed/communication/__init__.py  +30 -0
python/paddle/distributed/communication/all_reduce.py  +3 -1
python/paddle/distributed/communication/all_to_all.py  +161 -0
python/paddle/distributed/communication/batch_isend_irecv.py  +177 -0
python/paddle/distributed/communication/broadcast.py  +85 -0
python/paddle/distributed/communication/group.py  +133 -0
python/paddle/distributed/communication/recv.py  +111 -0
python/paddle/distributed/communication/reduce.py  +115 -4
python/paddle/distributed/communication/reduce_scatter.py  +122 -0
python/paddle/distributed/communication/scatter.py  +94 -0
python/paddle/distributed/communication/send.py  +110 -0
python/paddle/distributed/communication/stream/__init__.py  +1 -2
python/paddle/distributed/communication/stream/all_reduce.py  +9 -7
python/paddle/distributed/communication/stream/all_to_all.py  +359 -0
python/paddle/distributed/communication/stream/alltoall_single.py  +0 -144
python/paddle/distributed/communication/stream/broadcast.py  +64 -17
python/paddle/distributed/communication/stream/recv.py  +54 -18
python/paddle/distributed/communication/stream/reduce.py  +61 -13
python/paddle/distributed/communication/stream/reduce_scatter.py  +9 -10
python/paddle/distributed/communication/stream/scatter.py  +104 -35
python/paddle/distributed/communication/stream/send.py  +53 -18
python/paddle/distributed/fleet/layers/mpu/mp_ops.py  +3 -3
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py  +3 -3
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py  +5 -5
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py  +7 -6
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py  +5 -4
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py  +6 -5
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py  +8 -7
python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py  +2 -3
python/paddle/incubate/distributed/models/moe/grad_clip.py  +3 -3
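For orientation, here is a minimal usage sketch of the public API surface this commit consolidates under the new `paddle.distributed.communication` package (the 2-GPU launch and the tensor values are illustrative assumptions, not part of the diff; the re-exports match the updated python/paddle/distributed/__init__.py below):

# Hedged sketch: assumes a 2-GPU job started with `python -m paddle.distributed.launch`.
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
data = paddle.to_tensor([dist.get_rank()] * 2)
dist.all_reduce(data)        # implementation now lives in paddle/distributed/communication/all_reduce.py
dist.broadcast(data, src=0)  # implementation now lives in paddle/distributed/communication/broadcast.py
print(data)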
paddle/fluid/distributed/collective/ProcessGroupGloo.cc

@@ -233,6 +233,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const BroadcastOptions& opts) {
  return Broadcast(inputs, outputs, opts, true);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const BroadcastOptions& opts,
    bool sync_op) {
  auto root = opts.source_rank;
  std::unique_ptr<BroadcastGlooTask> task;
  auto tag = next_tag();
  ...

@@ -442,6 +450,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const ReduceOptions& opts) {
  return Reduce(inputs, outputs, opts, true);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
    std::vector<phi::DenseTensor>& inputs,
    std::vector<phi::DenseTensor>& outputs,
    const ReduceOptions& opts,
    bool sync_op) {
  std::shared_ptr<ReduceGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  ...

@@ -497,6 +513,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts) {
  return Scatter(in_tensors, out_tensors, opts, true);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
    std::vector<phi::DenseTensor>& in_tensors,
    std::vector<phi::DenseTensor>& out_tensors,
    const ScatterOptions& opts,
    bool sync_op) {
  std::shared_ptr<ScatterGlooTask> task;
  auto tag = next_tag();
  auto context = get_context();
  ...
paddle/fluid/distributed/collective/ProcessGroupGloo.h

@@ -113,6 +113,12 @@ class ProcessGroupGloo : public ProcessGroup {
      std::vector<phi::DenseTensor>& outputs,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<phi::DenseTensor>& inputs,
      std::vector<phi::DenseTensor>& outputs,
      const BroadcastOptions& opts,
      bool sync_op) override;

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<phi::DenseTensor>& inputs,
      std::vector<phi::DenseTensor>& outputs,
  ...

@@ -131,11 +137,23 @@ class ProcessGroupGloo : public ProcessGroup {
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts,
      bool sync_op) override;

  std::shared_ptr<ProcessGroup::Task> Reduce(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
      const ScatterOptions&,
      bool sync_op) override;

  std::shared_ptr<ProcessGroup::Task> Scatter(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors,
  ...
python/paddle/distributed/__init__.py

@@ -27,31 +27,33 @@ from paddle.distributed.fleet.dataset import InMemoryDataset  # noqa: F401
from paddle.distributed.fleet.dataset import QueueDataset  # noqa: F401
from paddle.distributed.fleet.base.topology import ParallelMode  # noqa: F401
from .collective import broadcast  # noqa: F401
from .collective import all_reduce  # noqa: F401
from .collective import reduce  # noqa: F401
from .collective import all_gather  # noqa: F401
from .collective import all_gather_object  # noqa: F401
from .collective import scatter  # noqa: F401
from .collective import barrier  # noqa: F401
from .collective import ReduceOp  # noqa: F401
from .collective import split  # noqa: F401
from .collective import new_group  # noqa: F401
from .collective import alltoall  # noqa: F401
from .collective import recv  # noqa: F401
from .collective import get_group  # noqa: F401
from .collective import send  # noqa: F401
from .collective import wait  # noqa: F401
from .collective import is_initialized  # noqa: F401
from .collective import destroy_process_group  # noqa: F401
from .collective import alltoall_single  # noqa: F401
from .collective import isend  # noqa: F401
from .collective import irecv  # noqa: F401
from .collective import batch_isend_irecv  # noqa: F401
from .collective import P2POp  # noqa: F401
from .collective import reduce_scatter  # noqa: F401
from .communication import stream  # noqa: F401
from .communication import (  # noqa: F401
    stream,
    ReduceOp,
    all_reduce,
    alltoall,
    alltoall_single,
    broadcast,
    reduce,
    send,
    scatter,
    isend,
    recv,
    irecv,
    batch_isend_irecv,
    P2POp,
    reduce_scatter,
    is_initialized,
    destroy_process_group,
    get_group,
)
from .auto_parallel import shard_op  # noqa: F401
from .auto_parallel import shard_tensor  # noqa: F401
...

@@ -109,5 +111,4 @@ __all__ = [  # noqa
    "irecv",
    "reduce_scatter",
    "rpc",
    "stream",
]
python/paddle/distributed/collective.py

This diff is collapsed.
python/paddle/distributed/communication/__init__.py

@@ -11,3 +11,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .all_reduce import all_reduce
from .broadcast import broadcast
from .reduce import reduce, ReduceOp
from .send import send, isend
from .recv import recv, irecv
from .scatter import scatter
from .batch_isend_irecv import batch_isend_irecv, P2POp
from .reduce_scatter import reduce_scatter
from .all_to_all import alltoall, alltoall_single
from .group import is_initialized, destroy_process_group, get_group

__all__ = [
    "ReduceOp",
    "all_reduce",
    "alltoall",
    "alltoall_single",
    "broadcast",
    "reduce",
    "send",
    "scatter",
    "isend",
    "recv",
    "irecv",
    "batch_isend_irecv",
    "P2POp",
    "reduce_scatter",
    "is_initialized",
    "destroy_process_group",
    "get_group",
]
python/paddle/distributed/communication/all_reduce.py

@@ -14,7 +14,7 @@
import paddle
import paddle.fluid.framework as framework
from paddle.distributed.communication import stream as stream
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.reduce import ReduceOp
...

@@ -63,6 +63,8 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True):
        )

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return

    use_calc_stream = sync_op
    ring_id = 0 if group is None else group.id
    if op == ReduceOp.SUM:
...
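The hunk above shows the dispatch pattern every wrapper in the new communication package follows: delegate to the stream-level API outside the legacy dygraph, otherwise fall back to the old C++ op. A simplified, hypothetical sketch of that structure follows; `example_all_reduce` and `_legacy_collective_op` are illustrative names only and not part of this commit.

# Hedged sketch of the dispatch pattern shared by the moved wrappers
# (all_reduce, broadcast, reduce, send, recv, scatter, alltoall).
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.reduce import ReduceOp


def _legacy_collective_op(tensor, use_calc_stream, ring_id):
    # Stand-in for the old paddle._legacy_C_ops call, which the diff elides here.
    raise NotImplementedError


def example_all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True):
    if not framework._in_legacy_dygraph():
        # New unified path: delegate to the stream-level API.
        return stream.all_reduce(
            tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=False
        )
    # Legacy-dygraph fallback, kept only until the old dygraph is removed.
    if group is not None and not group.is_member():
        return
    ring_id = 0 if group is None else group.id
    return _legacy_collective_op(tensor, sync_op, ring_id)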
python/paddle/distributed/communication/all_to_all.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream


def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
    """
    Scatter tensors in in_tensor_list to all participators averagely and gather the result tensors in out_tensor_list.
    As shown below, the in_tensor_list in GPU0 includes 0_0 and 0_1, and GPU1 includes 1_0 and 1_1.
    Through alltoall operator, the 0_0 in GPU0 will be sent to GPU0 and 0_1 to GPU1, 1_0 in GPU1 sent to GPU0 and 1_1 to GPU1.
    Finally the out_tensor_list in GPU0 includes 0_0 and 1_0, and GPU1 includes 0_1 and 1_1.
    .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/alltoall.png
        :width: 800
        :alt: alltoall
        :align: center
    Args:
        in_tensor_list (List[Tensor]): List of tensors to scatter one per rank. The data type of each tensor
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        out_tensor_list (List[Tensor]): List of tensors to be gathered one per rank. The data type of each tensor should be the same as the input tensors.
        group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            out_tensor_list = []
            if dist.get_rank() == 0:
                data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
                data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]])
            else:
                data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]])
                data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]])
            dist.alltoall([data1, data2], out_tensor_list)
            print(out_tensor_list)
            # [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
            # [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
    """
    if not framework._in_legacy_dygraph():
        return stream.alltoall(out_tensor_list, in_tensor_list, group, sync_op, False)

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    ring_id = 0 if group is None else group.id
    temp = paddle.concat(in_tensor_list, axis=0)
    nranks = len(in_tensor_list)
    use_calc_stream = sync_op
    out = paddle._legacy_C_ops.alltoall(
        temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
    )
    out_tensor_list.extend(paddle.split(out, nranks, 0))


def alltoall_single(
    in_tensor,
    out_tensor,
    in_split_sizes=None,
    out_split_sizes=None,
    group=None,
    sync_op=True,
):
    """
    Scatter a single input tensor to all participators and gather the received tensors in out_tensor.
    Note:
        ``alltoall_single`` is only supported in eager mode.
    Args:
        in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
        in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
            must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None.
        out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
            must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None.
        group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            rank = dist.get_rank()
            size = dist.get_world_size()
            # case 1 (2 GPUs)
            data = paddle.arange(2, dtype='int64') + rank * 2
            # data for rank 0: [0, 1]
            # data for rank 1: [2, 3]
            output = paddle.empty([2], dtype='int64')
            dist.alltoall_single(data, output)
            print(output)
            # output for rank 0: [0, 2]
            # output for rank 1: [1, 3]
            # case 2 (2 GPUs)
            in_split_sizes = [i + 1 for i in range(size)]
            # in_split_sizes for rank 0: [1, 2]
            # in_split_sizes for rank 1: [1, 2]
            out_split_sizes = [rank + 1 for i in range(size)]
            # out_split_sizes for rank 0: [1, 1]
            # out_split_sizes for rank 1: [2, 2]
            data = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank
            # data for rank 0: [[0., 0.], [0., 0.], [0., 0.]]
            # data for rank 1: [[1., 1.], [1., 1.], [1., 1.]]
            output = paddle.empty([(rank + 1) * size, size], dtype='float32')
            group = dist.new_group([0, 1])
            task = dist.alltoall_single(data,
                                        output,
                                        in_split_sizes,
                                        out_split_sizes,
                                        sync_op=False,
                                        group=group)
            task.wait()
            print(output)
            # output for rank 0: [[0., 0.], [1., 1.]]
            # output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
    """
    if not framework._in_legacy_dygraph():
        return stream.alltoall_single(
            out_tensor,
            in_tensor,
            out_split_sizes,
            in_split_sizes,
            group,
            sync_op,
            False,
        )
python/paddle/distributed/communication/batch_isend_irecv.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib

import paddle.distributed as dist
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.distributed.communication.group import (
    _get_global_group,
    _warn_cur_rank_not_in_group,
)


class P2POp(object):
    """
    A class that makes point-to-point operations for "batch_isend_irecv".
    This class creates the type of P2P operation, communication buffer, peer rank,
    Group. Instances of this class will be passed to
    ``paddle.distributed.batch_isend_irecv`` for point-to-point communication.
    Args:
        op (callable): A function to send data to or receive data from a peer process.
            The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
        tensor (Tensor): Tensor to send or receive.
        peer (int): The destination or source rank.
        group (Group, optional): The group instance return by new_group or None for global
            default group. Default: None.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            send_t = paddle.arange(2) + rank
            # paddle.tensor([0, 1])  # Rank-0
            # paddle.tensor([1, 2])  # Rank-1
            recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
            send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
            recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size)
    """

    def __init__(self, op, tensor, peer, group=None):
        if op not in [dist.isend, dist.irecv]:
            raise RuntimeError(
                "Invalid ``op`` function. Expected ``op`` "
                "to be of type ``paddle.distributed.isend`` or "
                "``paddle.distributed.irecv``."
            )
        self.op = op
        self.tensor = tensor
        self.peer = peer
        self.group = _get_global_group() if group is None else group


@contextlib.contextmanager
def _with_batch_p2p_guard(backend):
    if backend == "NCCL":
        core.ProcessGroupNCCL.group_start()
    try:
        yield
    finally:
        if backend == "NCCL":
            core.ProcessGroupNCCL.group_end()


def _check_p2p_op_list(p2p_op_list):
    """
    Helper to check that the ``p2p_op_list`` is a list of P2POp instances and
    all ops use the same backend.
    """
    if not isinstance(p2p_op_list, list) or not all(
        isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list
    ):
        raise RuntimeError(
            "Invalid ``p2p_op_list``. Each op is expected to "
            "to be of type ``paddle.distributed.P2POp``."
        )
    backend = p2p_op_list[0].group.backend
    if not all(backend == p2p_op.group.backend for p2p_op in p2p_op_list):
        raise RuntimeError("All groups need to use the same backend.")


def batch_isend_irecv(p2p_op_list):
    """
    Send or Receive a batch of tensors asynchronously and return a list of requests.
    Process each of the point-to-point operations in ``p2p_op_list`` and return the
    corresponding tasks. NCCL are currently supported.
    Args:
        p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is
            ``paddle.distributed.P2POp``). The order of the isend/irecv in the list
            matters and it needs to match with corresponding isend/irecv on the
            remote end.
    Returns:
        A list of distributed tasks returned by calling the corresponding
        op in the op_list.
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            send_t = paddle.arange(2) + rank
            # paddle.tensor([0, 1])  # Rank-0
            # paddle.tensor([1, 2])  # Rank-1
            recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
            send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
            recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size)
            tasks = dist.batch_isend_irecv([send_op, recv_op])
            for task in tasks:
                task.wait()
            print(recv_t)
            # paddle.tensor([1, 2])  # Rank-0
            # paddle.tensor([0, 1])  # Rank-1
    """
    _check_p2p_op_list(p2p_op_list)
    group = p2p_op_list[0].group
    if _warn_cur_rank_not_in_group(group):
        return

    if framework.in_dygraph_mode():
        group = _get_global_group() if group is None else group
        backend = group.backend
        tasks = []
        with _with_batch_p2p_guard(backend):
            for p2p_op in p2p_op_list:
                op = p2p_op.op
                tensor = p2p_op.tensor
                peer = p2p_op.peer
                comm_group = p2p_op.group
                task = op(tensor, peer, comm_group)
                if task is not None:
                    tasks.append(task)
        return tasks
    else:
        raise RuntimeError("Don't support static graph mode currently.")
python/paddle/distributed/communication/broadcast.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream


def broadcast(tensor, src, group=None, sync_op=True):
    """
    Broadcast a tensor from the source to all others.
    As shown below, one process is started with a GPU and GPU0 owns data 0. Through broadcast operator,
    data 0 will be sent to all GPUs from GPU0.
    .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/broadcast.png
        :width: 800
        :alt: broadcast
        :align: center
    Args:
        tensor (Tensor): The tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        src (int): The source rank in global view.
        group (Group, optional): The group instance return by new_group or None for global default group.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
            else:
                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
            dist.broadcast(data, src=1)
            print(data)
            # [[1, 2, 3], [1, 2, 3]] (2 GPUs)
    """
    if not framework._in_legacy_dygraph():
        return stream.broadcast(
            tensor,
            src,
            group=group,
            sync_op=sync_op,
            use_calc_stream=False,
        )

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    use_calc_stream = sync_op
    ring_id = 0 if group is None else group.id
    gsrc = src if group is None else group.get_group_rank(src)
    assert gsrc >= 0, "src rank out of group, need global rank"
    return paddle._legacy_C_ops.c_broadcast(
        tensor,
        tensor,
        'root',
        gsrc,
        'use_calc_stream',
        use_calc_stream,
        'ring_id',
        ring_id,
    )
python/paddle/distributed/communication/group.py

@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import paddle.distributed as dist


class Group:
    """
...

@@ -50,6 +53,10 @@ class Group:
    def world_size(self):
        return self._world_size

    @property
    def backend(self):
        return self._pg.name()

    @property
    def id(self):
        return self._id
...

@@ -94,3 +101,129 @@ def _add_new_group(group):
            "The group with id {} already exist.".format(group.id)
        )
    _GroupManager.group_map_by_id[group.id] = group


def _is_global_group(group):
    return group.id == _GroupManager.global_group_id


def _warn_cur_rank_not_in_group(group):
    global_rank = dist.get_rank()
    if group and not group.is_member():
        warnings.warn(
            "Current global rank {} is not in group {}".format(global_rank, group.name)
        )
        return True
    return False


def _get_or_throw_group_rank(global_rank, group):
    group_rank = group.get_group_rank(global_rank)
    assert (
        group_rank >= 0
    ), "The input rank {} can not be found inside the group {}".format(
        global_rank, group.name
    )
    return group_rank


def is_initialized():
    """
    Check whether the distributed environment has been initialized
    Returns:
        `True` if distributed environment has been initialized, otherwise `False`.
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            print(paddle.distributed.is_initialized())
            # False
            paddle.distributed.init_parallel_env()
            print(paddle.distributed.is_initialized())
            # True
    """
    return _GroupManager.global_group_id in _GroupManager.group_map_by_id


def destroy_process_group(group=None):
    """
    Destroy a given group for communication
    Args:
        group (Group, optional): The group to be destroyed. All of process groups, including
                                 the default group, will be destroyed and the distributed
                                 environment will be deinitialized.
    Returns : None
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            group = dist.new_group([0, 1])
            dist.destroy_process_group(group)
            print(dist.is_initialized())
            # True
            dist.destroy_process_group()
            print(dist.is_initialized())
            # False
    """
    group = _get_global_group() if group is None else group
    assert (
        group.id in _GroupManager.group_map_by_id
    ), "Destroy group with id {} is invalid.".format(group.id)
    if _is_global_group(group):
        _GroupManager.group_map_by_id.clear()
    else:
        del _GroupManager.group_map_by_id[group.id]


def get_group(id=0):
    """
    Get group instance by group id.
    Args:
        id (int): the group id. Default value is 0.
    Returns:
        Group: the group instance.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            gid = paddle.distributed.new_group([2,4,6])
            paddle.distributed.get_group(gid.id)
    """
    if id in _GroupManager.group_map_by_id:
        return _GroupManager.group_map_by_id[id]
    warnings.warn("Group {} is not initialized.".format(id))
    return None
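The two private helpers added here are what the stream-level wrappers rely on for validation. A hedged sketch of the intended call pattern follows; `hypothetical_p2p_check` is an illustrative name and not part of this commit.

# Hedged sketch: how a communication wrapper is expected to use the new group helpers.
import paddle.distributed as dist
from paddle.distributed.communication.group import (
    _get_global_group,
    _get_or_throw_group_rank,
    _warn_cur_rank_not_in_group,
)


def hypothetical_p2p_check(dst, group=None):
    # Bail out (with a warning) when the calling rank is not a member of the group.
    if _warn_cur_rank_not_in_group(group):
        return None
    group = _get_global_group() if group is None else group
    # Map the global destination rank to its rank inside the group, or assert.
    return _get_or_throw_group_rank(dst, group)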
python/paddle/distributed/communication/recv.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream


def recv(tensor, src=0, group=None, sync_op=True):
    """
    Receive a tensor to the sender.
    Args:
        tensor (Tensor): The tensor to receive. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        src (int): The source rank id.
        group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([7, 8, 9])
                dist.send(data, dst=1)
            else:
                data = paddle.to_tensor([1, 2, 3])
                dist.recv(data, src=0)
            print(data)
            # [7, 8, 9] (2 GPUs)
    """
    if not framework._in_legacy_dygraph():
        return stream.recv(
            tensor, src=src, group=group, sync_op=sync_op, use_calc_stream=False
        )

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    use_calc_stream = sync_op
    gsrc = src if group is None else group.get_group_rank(src)
    ring_id = 0 if group is None else group.id
    return paddle._legacy_C_ops.recv_v2(
        tensor,
        'use_calc_stream',
        use_calc_stream,
        'ring_id',
        ring_id,
        'peer',
        src,
        'dtype',
        tensor.dtype,
        'out_shape',
        tensor.shape,
    )


def irecv(tensor, src=None, group=None):
    """
    Receive a tensor to the sender.
    Args:
        tensor (Tensor): The Tensor to receive. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        src (int): The source rank id.
        group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
    Returns:
        Return a task object.
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([7, 8, 9])
                task = dist.isend(data, dst=1)
            else:
                data = paddle.to_tensor([1, 2, 3])
                task = dist.irecv(data, src=0)
            task.wait()
            print(data)
            # [7, 8, 9] (2 GPUs)
    """
    return recv(tensor, src, group, sync_op=False)
python/paddle/distributed/communication/reduce.py

@@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.fluid.core as core
import paddle.distributed.communication.stream as stream


class ReduceOp:
...

@@ -66,12 +68,121 @@ def _get_reduce_op(reduce_op, func_name):
            return core.ReduceOp.PRODUCT
    else:
        if reduce_op == ReduceOp.SUM:
            return 'c_allreduce_sum'
            return 'c_{}_sum'.format(func_name)
        elif reduce_op == ReduceOp.MAX:
            return 'c_allreduce_max'
            return 'c_{}_max'.format(func_name)
        elif reduce_op == ReduceOp.MIN:
            return 'c_allreduce_min'
            return 'c_{}_min'.format(func_name)
        elif reduce_op == ReduceOp.PROD:
            return 'c_allreduce_prod'
            return 'c_{}_prod'.format(func_name)
        else:
            return 'c_{}'.format(func_name)

    raise ValueError("Unknown reduce_op type for {}.".format(func_name))


def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
    """
    Reduce a tensor to the destination from all others. As shown below, one process is started with a GPU and the data of this process is represented
    by its group rank. The destination of the reduce operator is GPU0 and the process is sum. Through reduce operator,
    the GPU0 will owns the sum of all data from all GPUs.
    .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/reduce.png
        :width: 800
        :alt: reduce
        :align: center
    Args:
        tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        dst (int): The destination rank id.
        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM.
        group (Group, optional): The group instance return by new_group or None for global default group.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
            else:
                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
            dist.reduce(data, dst=0)
            print(data)
            # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
            # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
    """
    if not framework._in_legacy_dygraph():
        return stream.reduce(
            tensor,
            dst=dst,
            op=op,
            group=group,
            sync_op=sync_op,
            use_calc_stream=False,
        )

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    use_calc_stream = sync_op
    ring_id = 0 if group is None else group.id
    gdst = dst if group is None else group.get_group_rank(dst)
    assert gdst >= 0, "dst rank out of group, need global rank"

    if op == ReduceOp.SUM:
        return paddle._legacy_C_ops.c_reduce_sum(
            tensor, tensor, 'use_calc_stream', use_calc_stream,
            'ring_id', ring_id, 'root_id', gdst,
        )
    elif op == ReduceOp.MAX:
        return paddle._legacy_C_ops.c_reduce_max(
            tensor, tensor, 'use_calc_stream', use_calc_stream,
            'ring_id', ring_id, 'root_id', gdst,
        )
    elif op == ReduceOp.MIN:
        return paddle._legacy_C_ops.c_reduce_min(
            tensor, tensor, 'use_calc_stream', use_calc_stream,
            'ring_id', ring_id, 'root_id', gdst,
        )
    elif op == ReduceOp.PROD:
        return paddle._legacy_C_ops.c_reduce_prod(
            tensor, tensor, 'use_calc_stream', use_calc_stream,
            'ring_id', ring_id, 'root_id', gdst,
        )
    else:
        raise ValueError("Unknown parameter: {}.".format(op))
python/paddle/distributed/communication/reduce_scatter.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.reduce import ReduceOp
from paddle.distributed.communication.stream.reduce_scatter import (
    _reduce_scatter_base as _reduce_scatter_base_stream,
)


def reduce_scatter(tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True):
    """
    Reduces, then scatters a list of tensors to all processes in a group
    Args:
        tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support
            float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type.
        tensor_list (List[Tensor]]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default.
        group (Group, optional): Communicate in which group. If none is given, use the global group as default.
        sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
    Returns:
        Return a task object.
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data1 = paddle.to_tensor([0, 1])
                data2 = paddle.to_tensor([2, 3])
            else:
                data1 = paddle.to_tensor([4, 5])
                data2 = paddle.to_tensor([6, 7])
            dist.reduce_scatter(data1, [data1, data2])
            print(data1)
            # [4, 6] (2 GPUs, out for rank 0)
            # [8, 10] (2 GPUs, out for rank 1)
    """
    if not framework._in_legacy_dygraph():
        return stream.reduce_scatter(
            tensor,
            tensor_list,
            op=op,
            group=group,
            sync_op=sync_op,
            use_calc_stream=False,
        )


def _reduce_scatter_base(output, input, op=ReduceOp.SUM, group=None, sync_op=True):
    """
    Reduces, then scatters a flattened tensor to all processes in a group.
    Args:
        output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
        group (ProcessGroup, optional): The process group to work on. If None,
            the default process group will be used.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Async task handle, if sync_op is set to False.
        None, if sync_op or if not part of the group.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            rank = dist.get_rank()
            data = paddle.arange(4) + rank
            # [0, 1, 2, 3] (2 GPUs, for rank 0)
            # [1, 2, 3, 4] (2 GPUs, for rank 1)
            output = paddle.empty(shape=[2], dtype=data.dtype)
            dist.collective._reduce_scatter_base(output, data)
            print(output)
            # [1, 3] (2 GPUs, out for rank 0)
            # [5, 7] (2 GPUs, out for rank 1)
    """
    if not framework._in_legacy_dygraph():
        return _reduce_scatter_base_stream(
            output,
            input,
            op=op,
            group=group,
            sync_op=sync_op,
            use_calc_stream=False,
        )
python/paddle/distributed/communication/scatter.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream
from paddle.distributed.communication.group import _get_global_group


def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
    """
    Scatter a tensor to all participators. As shown below, one process is started with a GPU and the source of the scatter
    is GPU0. Through scatter operator, the data in GPU0 will be sent to all GPUs averagely.
    .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/scatter.png
        :width: 800
        :alt: scatter
        :align: center
    Args:
        tensor (Tensor): The output Tensor. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None.
        src (int): The source rank id. Default value is 0.
        group (Group, optional): The group instance return by new_group or None for global default group.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        None.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data1 = paddle.to_tensor([7, 8, 9])
                data2 = paddle.to_tensor([10, 11, 12])
                dist.scatter(data1, src=1)
            else:
                data1 = paddle.to_tensor([1, 2, 3])
                data2 = paddle.to_tensor([4, 5, 6])
                dist.scatter(data1, tensor_list=[data1, data2], src=1)
            print(data1, data2)
            # [1, 2, 3] [10, 11, 12] (2 GPUs, out for rank 0)
            # [4, 5, 6] [4, 5, 6] (2 GPUs, out for rank 1)
    """
    if not framework._in_legacy_dygraph():
        return stream.scatter(tensor, tensor_list, src, group, sync_op)

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    ring_id = 0 if group is None else group.id
    gsrc = src if group is None else group.get_group_rank(src)
    rank = _get_global_group().rank if group is None else group.rank
    nranks = _get_global_group().nranks if group is None else group.nranks
    assert gsrc >= 0, "src rank out of group, need global rank"

    if rank != gsrc:
        tensor_list = []
        for _ in range(nranks):
            tensor_list.append(tensor)
    temp = paddle.concat(tensor_list, axis=0)
    use_calc_stream = sync_op
    return framework._legacy_C_ops.c_scatter(
        temp,
        tensor,
        'use_calc_stream',
        use_calc_stream,
        'ring_id',
        ring_id,
        'nranks',
        nranks,
        'root',
        gsrc,
    )
python/paddle/distributed/communication/send.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid.framework as framework
import paddle.distributed.communication.stream as stream


def send(tensor, dst=0, group=None, sync_op=True):
    """
    Send a tensor to the receiver.
    Args:
        tensor (Tensor): The Tensor to send. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        dst (int): The destination rank id.
        group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
        sync_op (bool, optional): Whether this op is a sync op. The default value is True.
    Returns:
        Return a task object.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([7, 8, 9])
                dist.send(data, dst=1)
            else:
                data = paddle.to_tensor([1, 2, 3])
                dist.recv(data, src=0)
            print(data)
            # [7, 8, 9] (2 GPUs)
    """
    if not framework._in_legacy_dygraph():
        return stream.send(
            tensor, dst=dst, group=group, sync_op=sync_op, use_calc_stream=False
        )

    # code below will be removed after we remove the old dygraph
    if group is not None and not group.is_member():
        return
    use_calc_stream = sync_op
    gdst = dst if group is None else group.get_group_rank(dst)
    assert gdst >= 0, "dst rank out of group, need global rank"
    ring_id = 0 if group is None else group.id
    return paddle._legacy_C_ops.send_v2(
        tensor,
        'use_calc_stream',
        use_calc_stream,
        'ring_id',
        ring_id,
        'peer',
        gdst,
    )


def isend(tensor, dst, group=None):
    """
    Send tensor asynchronously
    Args:
        tensor (Tensor): The Tensor to send. Its data type
            should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
        dst (int): The destination rank.
        group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
    Returns:
        Return a task object.
    Warning:
        This API only supports the dygraph mode.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            if dist.get_rank() == 0:
                data = paddle.to_tensor([7, 8, 9])
                task = dist.isend(data, dst=1)
            else:
                data = paddle.to_tensor([1, 2, 3])
                task = dist.irecv(data, src=0)
            task.wait()
            print(data)
            # [7, 8, 9] (2 GPUs)
    """
    return send(tensor, dst, group, sync_op=False)
python/paddle/distributed/communication/stream/__init__.py

@@ -14,8 +14,7 @@
from .all_gather import all_gather
from .all_reduce import all_reduce
from .alltoall import alltoall
from .alltoall_single import alltoall_single
from .all_to_all import alltoall, alltoall_single
from .broadcast import broadcast
from .reduce import reduce
from .reduce_scatter import reduce_scatter
...
python/paddle/distributed/communication/stream/all_reduce.py

@@ -16,11 +16,14 @@ import paddle.fluid.framework as framework
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp
from paddle.distributed.communication.group import _get_global_group
from paddle.distributed.communication.group import (
    _get_global_group,
    _warn_cur_rank_not_in_group,
)


def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream):
    op_type = _get_reduce_op(op, "all_reduce")
    op_type = _get_reduce_op(op, "allreduce")
    group = _get_global_group() if group is None else group

    if use_calc_stream:
...

@@ -50,7 +53,7 @@ def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream):
        'all_reduce',
    )

    op_type = _get_reduce_op(op, "all_reduce")
    op_type = _get_reduce_op(op, "allreduce")
    ring_id = 0 if group is None else group.id

    if not isinstance(ring_id, int):
...

@@ -107,10 +110,8 @@ def all_reduce(
            out = data.numpy()
            # [[5, 7, 9], [5, 7, 9]]
    """
    if group is not None and not group.is_member():
        raise RuntimeError(
            "The group should not be None and all ranks which invoke this operation should be the member of this group."
        )
    if _warn_cur_rank_not_in_group(group):
        return

    if not sync_op and use_calc_stream:
        raise RuntimeError(
...

@@ -122,6 +123,7 @@ def all_reduce(
            tensor, op, group, sync_op, use_calc_stream
        )
    else:
        assert group is None, "Group can not be used in static mode for now."
        return _all_reduce_in_static_mode(
            tensor, op, group, sync_op, use_calc_stream
        )
python/paddle/distributed/communication/stream/alltoall.py → python/paddle/distributed/communication/stream/all_to_all.py

@@ -14,7 +14,13 @@
import paddle
import paddle.fluid.framework as framework
from paddle.distributed import collective
import paddle.distributed as dist
import paddle.fluid.data_feeder as data_feeder
import paddle.fluid.layer_helper as layer_helper
from paddle.distributed.communication.group import (
    _get_global_group,
    _warn_cur_rank_not_in_group,
)


def _check_tensor_shape(tensor, shape, nranks=1):
...

@@ -34,10 +40,9 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1):
    )


def _alltoall_tensor_in_dygraph(
def _all_to_all_tensor_in_dygraph(
    out_tensor, in_tensor, group, sync_op, use_calc_stream
):
    group = collective._get_default_group() if group is None else group

    _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks)
...

@@ -53,11 +58,9 @@ def _alltoall_tensor_in_dygraph(
    return task


def _alltoall_in_dygraph(
def _all_to_all_in_dygraph(
    out_tensor_list, in_tensor_list, group, sync_op, use_calc_stream
):
    group = collective._get_default_group() if group is None else group

    if len(in_tensor_list) == 0:
        raise RuntimeError("The input tensor_list should not be empty.")
...

@@ -84,6 +87,59 @@ def _alltoall_in_dygraph(
    return task


def _all_to_all_in_static_mode(
    out_tensor_or_tensor_list,
    in_tensor_or_tensor_list,
    group,
    sync_op,
    use_calc_stream,
):
    op_type = 'alltoall'
    ring_id = 0 if group is None else group.id
    nranks = dist.get_world_size()
    helper = layer_helper.LayerHelper(op_type, **locals())

    in_tensor = in_tensor_or_tensor_list
    if isinstance(in_tensor_or_tensor_list, list):
        if len(in_tensor_or_tensor_list) == 0:
            raise RuntimeError("The input tensor_list should not be empty.")
        in_tensor = paddle.concat(in_tensor_or_tensor_list, axis=0)
    out_tensor = out_tensor_or_tensor_list
    if isinstance(out_tensor_or_tensor_list, list):
        if len(out_tensor_or_tensor_list) != 0:
            raise ValueError(
                "The 'out_tensor_list' for all_to_all " "must be an empty list."
            )
        out_tensor = helper.create_variable_for_type_inference(
            dtype=in_tensor.dtype
        )

    data_feeder.check_variable_and_dtype(
        in_tensor,
        'in_tensor',
        ['float16', 'float32', 'float64', 'int32', 'int64'],
        'all_to_all',
    )
    helper.append_op(
        type=op_type,
        inputs={'X': [in_tensor]},
        outputs={'Out': [out_tensor]},
        attrs={
            'ring_id': ring_id,
            'use_calc_stream': sync_op,
        },
    )
    # NOTE(liyurui): If the argument `out_tensor_or_tensor_list` is a tensor_list,
    # we need to split the result. So we should wait the result of all_to_all
    # before split if the communication is not on calc stream.
    if isinstance(out_tensor_or_tensor_list, list):
        if not sync_op:
            dist.wait(out_tensor, use_calc_stream=False)
        out_tensor_or_tensor_list.extend(paddle.split(out_tensor, nranks, 0))

    return None


def alltoall(
    out_tensor_or_tensor_list,
    in_tensor_or_tensor_list,
...

@@ -109,9 +165,6 @@ def alltoall(
    Returns:
        Return a task object.

    Warning:
        This API only supports the dygraph mode now.

    Examples:
        .. code-block:: python
...

@@ -133,10 +186,8 @@ def alltoall(
            # [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
            # [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
    """
    if group is not None and not group.is_member():
        raise RuntimeError(
            "The group should not be None and all ranks which invoke this operation should be the member of this group."
        )
    if _warn_cur_rank_not_in_group(group):
        return

    if not sync_op and use_calc_stream:
        raise RuntimeError(
...

@@ -149,10 +200,11 @@ def alltoall(
        raise RuntimeError("The input should be specified.")

    if framework.in_dygraph_mode():
        group = _get_global_group() if group is None else group
        out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list)
        in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list)
        if out_is_tensor and in_is_tensor:
            return _alltoall_tensor_in_dygraph(
            return _all_to_all_tensor_in_dygraph(
                out_tensor_or_tensor_list,
                in_tensor_or_tensor_list,
                group,
...

@@ -160,7 +212,7 @@ def alltoall(
                use_calc_stream,
            )
        elif not out_is_tensor and not in_is_tensor:
            return _alltoall_in_dygraph(
            return _all_to_all_in_dygraph(
                out_tensor_or_tensor_list,
                in_tensor_or_tensor_list,
                group,
...

@@ -171,7 +223,137 @@ def alltoall(
        raise RuntimeError(
            "The output and input should be both tensor or tensor list."
        )
    else:
        assert group is None, "Group can not be used in static mode for now."
        return _all_to_all_in_static_mode(
            out_tensor_or_tensor_list,
            in_tensor_or_tensor_list,
            group,
            sync_op,
            use_calc_stream,
        )


def _alltoall_single_in_dygraph(
    out_tensor,
    in_tensor,
    out_split_sizes,
    in_split_sizes,
    group,
    sync_op,
    use_calc_stream,
):
    if out_split_sizes is None:
        out_split_sizes = []
    if in_split_sizes is None:
        in_split_sizes = []

    if use_calc_stream:
        return group.process_group.alltoall_single_on_calc_stream(
            in_tensor, out_tensor, in_split_sizes, out_split_sizes
        )

    task = group.process_group.alltoall_single(
        in_tensor, out_tensor, in_split_sizes, out_split_sizes, sync_op
    )
    if sync_op:
        task.wait()

    return task


def alltoall_single(
    out_tensor,
    in_tensor,
    out_split_sizes=None,
    in_split_sizes=None,
    group=None,
    sync_op=True,
    use_calc_stream=False,
):
    """
    Split and Scatter the splitted input tensor to the out tensor across devices.
    Args:
        out_tensor(Tensor): The output tensor. Its data type should be the same as the input.
        in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
        out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible
            by group size and out_tensor will be gathered averagely from all participators. If none is given, use a empty list as default.
        in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible
            by group size and in_tensor will be scattered averagely to all participators. If none is given, use a empty list as default.
        group (Group, optional): Communicate in which group. If none is given, use the global group as default.
        sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
        use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This
            option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning.
    Returns:
        Return a task object.
    Warning:
        This API only supports the dygraph mode now.
    Examples:
        .. code-block:: python
            # required: distributed
            import paddle
            import paddle.distributed as dist
            dist.init_parallel_env()
            local_rank = dist.get_rank()
            # case 1
            output = paddle.empty([2], dtype="int64")
            if local_rank == 0:
                data = paddle.to_tensor([0, 1])
            else:
                data = paddle.to_tensor([2, 3])
            task = dist.stream.alltoall_single(output, data, sync_op=False)
            task.wait()
            out = output.numpy()
            # [0, 2] (2 GPUs, out for rank 0)
            # [1, 3] (2 GPUs, out for rank 1)
            # case 2
            size = dist.get_world_size()
            output = paddle.empty([(local_rank + 1) * size, size], dtype='float32')
            if local_rank == 0:
                data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]])
            else:
                data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]])
            out_split_sizes = [local_rank + 1 for i in range(size)]
            in_split_sizes = [i + 1 for i in range(size)]
            task = dist.stream.alltoall_single(output,
                                               data,
                                               out_split_sizes,
                                               in_split_sizes,
                                               sync_op=False)
            task.wait()
            out = output.numpy()
            # [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0)
            # [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1)
    """
    if _warn_cur_rank_not_in_group(group):
        return

    if not sync_op and use_calc_stream:
        raise RuntimeError("use_calc_stream can only be true in sync op behavior.")

    if framework.in_dygraph_mode():
        group = _get_global_group() if group is None else group
        return _alltoall_single_in_dygraph(
            out_tensor,
            in_tensor,
            out_split_sizes,
            in_split_sizes,
            group,
            sync_op,
            use_calc_stream,
        )

    raise RuntimeError(
        "paddle.distributed.stream.alltoall is only supported in dygraph mode now."
        "paddle.distributed.stream.alltoall_single is only supported in dygraph mode now."
    )
python/paddle/distributed/communication/stream/alltoall_single.py
已删除
100644 → 0
浏览文件 @
ef67c8a8
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.framework as framework

from paddle.distributed import collective


def _alltoall_single_in_dygraph(
    out_tensor,
    in_tensor,
    out_split_sizes,
    in_split_sizes,
    group,
    sync_op,
    use_calc_stream,
):
    group = collective._get_default_group() if group is None else group

    if out_split_sizes is None:
        out_split_sizes = []
    if in_split_sizes is None:
        in_split_sizes = []

    if use_calc_stream:
        return group.process_group.alltoall_single_on_calc_stream(
            in_tensor, out_tensor, in_split_sizes, out_split_sizes
        )

    task = group.process_group.alltoall_single(
        in_tensor, out_tensor, in_split_sizes, out_split_sizes, sync_op
    )
    if sync_op:
        task.wait()

    return task


def alltoall_single(
    out_tensor,
    in_tensor,
    out_split_sizes=None,
    in_split_sizes=None,
    group=None,
    sync_op=True,
    use_calc_stream=False,
):
    """

    Split the input tensor and scatter the pieces to the out tensor across devices.

    Args:
        out_tensor (Tensor): The output tensor. Its data type should be the same as the input.
        in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
        out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible
            by group size and out_tensor will be gathered evenly from all participators. If none is given, use an empty list as default.
        in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible
            by group size and in_tensor will be scattered evenly to all participators. If none is given, use an empty list as default.
        group (Group, optional): Communicate in which group. If none is given, use the global group as default.
        sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
        use_calc_stream (bool, optional): Indicate whether the communication is done on the calculation stream. If none is given, use false as default. This
            option is designed for high-performance demands; be careful about turning it on unless you clearly understand what it does.

    Returns:
        Return a task object.

    Warning:
        This API only supports the dygraph mode now.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            import paddle.distributed as dist

            dist.init_parallel_env()
            local_rank = dist.get_rank()

            # case 1
            output = paddle.empty([2], dtype="int64")
            if local_rank == 0:
                data = paddle.to_tensor([0, 1])
            else:
                data = paddle.to_tensor([2, 3])
            task = dist.stream.alltoall_single(output, data, sync_op=False)
            task.wait()
            out = output.numpy()
            # [0, 2] (2 GPUs, out for rank 0)
            # [1, 3] (2 GPUs, out for rank 1)

            # case 2
            size = dist.get_world_size()
            output = paddle.empty([(local_rank + 1) * size, size], dtype='float32')
            if local_rank == 0:
                data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]])
            else:
                data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]])
            out_split_sizes = [local_rank + 1 for i in range(size)]
            in_split_sizes = [i + 1 for i in range(size)]
            task = dist.stream.alltoall_single(output,
                                               data,
                                               out_split_sizes,
                                               in_split_sizes,
                                               sync_op=False)
            task.wait()
            out = output.numpy()
            # [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0)
            # [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1)
    """
    if group is not None and not group.is_member():
        raise RuntimeError(
            "The group should not be None and all ranks which invoke this operation should be the member of this group."
        )

    if not sync_op and use_calc_stream:
        raise RuntimeError(
            "use_calc_stream can only be true in sync op behavior."
        )

    if framework.in_dygraph_mode():
        return _alltoall_single_in_dygraph(
            out_tensor,
            in_tensor,
            out_split_sizes,
            in_split_sizes,
            group,
            sync_op,
            use_calc_stream,
        )

    raise RuntimeError(
        "paddle.distributed.stream.alltoall_single is only supported in dygraph mode now."
    )
python/paddle/distributed/communication/stream/broadcast.py
View file @ 99504cbb

...
@@ -13,29 +13,74 @@
 # limitations under the License.

 import paddle.fluid.framework as framework
-from paddle.distributed import collective
-
-
-def _broadcast_in_dygraph(tensor, src, group, sync_op, use_calc_stream):
-    group = collective._get_default_group() if group is None else group
+import paddle.fluid.data_feeder as data_feeder
+import paddle.fluid.layer_helper as layer_helper
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+    _get_or_throw_group_rank,
+)
+
+
+def _broadcast_in_dygraph(
+    tensor, src_rank_in_group, group, sync_op, use_calc_stream
+):
     if use_calc_stream:
-        return group.process_group.broadcast_on_calc_stream(tensor, src)
+        return group.process_group.broadcast_on_calc_stream(
+            tensor, src_rank_in_group
+        )

-    task = group.process_group.broadcast(tensor, src, sync_op)
+    task = group.process_group.broadcast(tensor, src_rank_in_group, sync_op)
     if sync_op:
         task.wait()

     return task


-def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
+def _broadcast_in_static_mode(
+    tensor, src_rank_in_group, group, sync_op, use_calc_stream
+):
+    data_feeder.check_variable_and_dtype(
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', 'bool'],
+        'broadcast',
+    )
+
+    op_type = 'c_broadcast'
+    helper = layer_helper.LayerHelper(op_type, **locals())
+    ring_id = 0 if group is None else group.id
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'root': src_rank_in_group,
+            'use_calc_stream': sync_op,
+            'ring_id': ring_id,
+        },
+    )
+    return None
+
+
+def broadcast(tensor, src, group=None, sync_op=True, use_calc_stream=False):
     """

     Broadcast a tensor to all devices.

     Args:
         tensor (Tensor): The tensor to broadcast. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type.
-        src (int, optional): Rank of the source device. If none is given, use `0` as default.
+        src (int, optional): Rank of the source device.
         group (Group, optional): Communicate in which group. If none is given, use the global group as default.
         sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default.
         use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This
...
@@ -65,10 +110,8 @@ def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
             out = data.numpy()
             # [[1, 2, 3], [1, 2, 3]] (2 GPUs)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
@@ -76,10 +119,14 @@ def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
         )

     if framework.in_dygraph_mode():
+        group = _get_global_group() if group is None else group
+        src_rank_in_group = _get_or_throw_group_rank(src, group)
         return _broadcast_in_dygraph(
             tensor, src_rank_in_group, group, sync_op, use_calc_stream
         )
+    else:
+        assert group is None, "Group can not be used in static mode for now."
+        return _broadcast_in_static_mode(
+            tensor, src, group, sync_op, use_calc_stream
+        )
-
-    raise RuntimeError(
-        "paddle.distributed.stream.broadcast is only supported in dygraph mode now."
-    )
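One behavioral detail worth noting in the reworked broadcast: src is still given as a global rank, and the dygraph path now translates it into a rank inside group via _get_or_throw_group_rank before calling the process group. A rough sketch of that translation follows; the helper name below is illustrative, only Group.get_group_rank and the error behavior are taken from code already present in this diff (the removed scatter implementation).

    # Rough sketch of the global-rank -> group-rank translation the new code relies on.
    def to_group_rank(global_rank, group):
        group_rank = group.get_group_rank(global_rank)
        if group_rank == -1:
            # The old scatter implementation raised in exactly this situation.
            raise RuntimeError("Src rank out of group.")
        return group_rank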
python/paddle/distributed/communication/stream/recv.py
View file @ 99504cbb

...
@@ -13,21 +13,56 @@
 # limitations under the License.

 import paddle.fluid.framework as framework
-from paddle.distributed import collective
-
-
-def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream):
-    group = collective._get_default_group() if group is None else group
+import paddle.fluid.data_feeder as data_feeder
+import paddle.fluid.layer_helper as layer_helper
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+    _get_or_throw_group_rank,
+)
+
+
+def _recv_in_dygraph(
+    tensor, src_rank_in_group, group, sync_op, use_calc_stream
+):
     if use_calc_stream:
-        return group.process_group.recv_on_calc_stream(tensor, src)
+        return group.process_group.recv_on_calc_stream(
+            tensor, src_rank_in_group
+        )

-    task = group.process_group.recv(tensor, src, sync_op)
+    task = group.process_group.recv(tensor, src_rank_in_group, sync_op)
     if sync_op:
         task.wait()

     return task


+def _recv_in_static_mode(
+    tensor, src_rank_in_group, group, sync_op, use_calc_stream
+):
+    op_type = 'recv_v2'
+    data_feeder.check_variable_and_dtype(
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'recv',
+    )
+    ring_id = 0 if group is None else group.id
+
+    helper = layer_helper.LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'peer': src_rank_in_group,
+            'out_shape': tensor.shape,
+            'dtype': tensor.dtype,
+            'use_calc_stream': sync_op,
+        },
+    )
+    return None
+
+
 def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
     """
...
@@ -44,9 +79,6 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
     Returns:
         Return a task object.

-    Warning:
-        This API only supports the dygraph mode now.
-
     Examples:
         .. code-block:: python
...
@@ -66,10 +98,8 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
             out = data.numpy()
             # [[4, 5, 6], [4, 5, 6]] (2 GPUs)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
@@ -77,8 +107,14 @@ def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
         )

     if framework.in_dygraph_mode():
-        return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream)
-
-    raise RuntimeError(
-        "paddle.distributed.stream.recv is only supported in dygraph mode now."
-    )
+        group = _get_global_group() if group is None else group
+        src_rank_in_group = _get_or_throw_group_rank(src, group)
+        return _recv_in_dygraph(
+            tensor, src_rank_in_group, group, sync_op, use_calc_stream
+        )
+    else:
+        assert group is None, "Group can not be used in static mode for now."
+        return _recv_in_static_mode(
+            tensor, src, group, sync_op, use_calc_stream
+        )
python/paddle/distributed/communication/stream/reduce.py
View file @ 99504cbb

...
@@ -13,23 +13,70 @@
 # limitations under the License.

 import paddle.fluid.framework as framework
-from paddle.distributed.communication.group import _get_global_group
+import paddle.fluid.data_feeder as data_feeder
+import paddle.fluid.layer_helper as layer_helper
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+    _get_or_throw_group_rank,
+)
 from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp


-def _reduce_in_dygraph(tensor, dst, op, group, sync_op, use_calc_stream):
+def _reduce_in_dygraph(
+    tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
+):
     op_type = _get_reduce_op(op, "reduce")
-    group = _get_global_group() if group is None else group
     if use_calc_stream:
-        return group.process_group.reduce_on_calc_stream(tensor, dst, op_type)
+        return group.process_group.reduce_on_calc_stream(
+            tensor, dst_rank_in_group, op_type
+        )

-    task = group.process_group.reduce(tensor, dst, op_type, sync_op)
+    task = group.process_group.reduce(
+        tensor, dst_rank_in_group, op_type, sync_op
+    )
     if sync_op:
         task.wait()

     return task


+def _reduce_in_static_mode(
+    tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
+):
+    data_feeder.check_variable_and_dtype(
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', 'bool'],
+        'reduce',
+    )
+
+    op_type = _get_reduce_op(op, "reduce")
+    ring_id = 0 if group is None else group.id
+
+    helper = layer_helper.LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'use_calc_stream': sync_op,
+            'root_id': dst_rank_in_group,
+        },
+    )
+    return None
+
+
 def reduce(
     tensor,
     dst=0,
...
@@ -77,10 +124,8 @@ def reduce(
             # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
             # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
@@ -88,10 +133,13 @@ def reduce(
         )

     if framework.in_dygraph_mode():
+        group = _get_global_group() if group is None else group
+        dst_rank_in_group = _get_or_throw_group_rank(dst, group)
         return _reduce_in_dygraph(
             tensor, dst_rank_in_group, op, group, sync_op, use_calc_stream
         )
+    else:
+        assert group is None, "Group can not be used in static mode for now."
+        return _reduce_in_static_mode(
+            tensor, dst, op, group, sync_op, use_calc_stream
+        )
-
-    raise RuntimeError(
-        "paddle.distributed.stream.reduce is only supported in dygraph mode now."
-    )
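Because dst is likewise a global rank translated into a group-local rank, calling reduce on a sub-group still takes global rank numbers. The following is a hedged sketch only; the 4-GPU setup and tensor values are made up for illustration and assume the post-commit dist.stream.reduce API shown above.

    # Hypothetical 4-GPU example: ranks 2 and 3 form a sub-group and reduce to
    # global rank 3, which the new code maps to rank 1 inside the group.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    group = dist.new_group([2, 3])
    if dist.get_rank() in [2, 3]:
        data = paddle.to_tensor([1, 2, 3])
        task = dist.stream.reduce(data, dst=3, group=group, sync_op=False)
        task.wait()
        # On global rank 3, data now holds the element-wise sum over ranks 2 and 3.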
python/paddle/distributed/communication/stream/reduce_scatter.py
View file @ 99504cbb

...
@@ -14,7 +14,10 @@

 import paddle
 import paddle.fluid.framework as framework
-from paddle.distributed.communication.group import _get_global_group
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+)
 from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp
...
@@ -104,7 +107,7 @@ def reduce_scatter(
     Args:
         tensor (Tensor): The output tensor on each rank. The result will overwrite this tensor after communication. Support
             float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type.
-        tensor_list (List[Tensor]): The input to scatter.
+        tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter.
             If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors.
         op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default.
         group (Group, optional): Communicate in which group. If none is given, use the global group as default.
...
@@ -137,10 +140,8 @@ def reduce_scatter(
             # [4, 6] (2 GPUs, out for rank 0)
             # [8, 10] (2 GPUs, out for rank 1)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
@@ -220,10 +221,8 @@ def _reduce_scatter_base(
             # [1, 2, 3] (2 GPUs, out for rank 0)
             # [4, 5, 6] (2 GPUs, out for rank 1)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
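The renamed tensor_or_tensor_list parameter accepts either form. Below is a hedged sketch of the two equivalent calls on 2 GPUs; the values follow the docstring example quoted above, and the second call simply overwrites the first result to show the equivalence.

    # Two equivalent ways to feed reduce_scatter on 2 GPUs; the expected result
    # is [4, 6] on rank 0 and [8, 10] on rank 1, as in the docstring example.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        t0, t1 = paddle.to_tensor([0, 1]), paddle.to_tensor([2, 3])
    else:
        t0, t1 = paddle.to_tensor([4, 5]), paddle.to_tensor([6, 7])
    output = paddle.empty([2], dtype=t0.dtype)

    # List form: one correctly-sized tensor per rank.
    dist.stream.reduce_scatter(output, [t0, t1])
    # Tensor form: a single tensor whose dim[0] equals nranks * output.shape[0].
    dist.stream.reduce_scatter(output, paddle.concat([t0, t1]))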
python/paddle/distributed/communication/stream/scatter.py
View file @ 99504cbb

...
@@ -12,10 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import warnings

 import paddle
 import paddle.distributed as dist
 import paddle.fluid.framework as framework
-from paddle.distributed import collective
+import paddle.fluid.data_feeder as data_feeder
+import paddle.fluid.layer_helper as layer_helper
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+    _get_or_throw_group_rank,
+)


 def _check_tensor_shape(tensor, shape, nranks=1):
...
@@ -38,26 +45,19 @@ def _check_tensor_list_shape(tensor_list, shape, nranks=1):

 def _scatter_tensor_in_dygraph(
-    out_tensor, in_tensor, src, group, sync_op, use_calc_stream
+    out_tensor, in_tensor, src_rank_in_group, group, sync_op, use_calc_stream
 ):
-    group = collective._get_default_group() if group is None else group
-
-    src_rank = group.get_group_rank(src)
-    if src_rank == -1:
-        raise RuntimeError("Src rank out of group.")
-
     nranks = group.nranks
-    rank = dist.get_rank()
-    if rank == src_rank:
+    if group.rank == src_rank_in_group:
         _check_tensor_shape(out_tensor, in_tensor.shape, nranks)

     if use_calc_stream:
         return group.process_group.scatter_tensor_on_calc_stream(
-            in_tensor, out_tensor, src
+            in_tensor, out_tensor, src_rank_in_group
         )

     task = group.process_group.scatter_tensor(
-        in_tensor, out_tensor, src, sync_op
+        in_tensor, out_tensor, src_rank_in_group, sync_op
     )
     if sync_op:
         task.wait()
...
@@ -66,17 +66,10 @@ def _scatter_tensor_in_dygraph(

 def _scatter_in_dygraph(
-    tensor, tensor_list, src, group, sync_op, use_calc_stream
+    tensor, tensor_list, src_rank_in_group, group, sync_op, use_calc_stream
 ):
-    group = collective._get_default_group() if group is None else group
-
-    src_rank = group.get_group_rank(src)
-    if src_rank == -1:
-        raise RuntimeError("Src rank out of group.")
-
     nranks = group.nranks
-    rank = dist.get_rank()
-    if rank == src_rank:
+    if group.rank == src_rank_in_group:
         if len(tensor_list) == 0:
             raise RuntimeError(
                 "The tensor_list should not be empty on src rank."
...
@@ -87,16 +80,76 @@ def _scatter_in_dygraph(
     if use_calc_stream:
         return group.process_group.scatter_on_calc_stream(
-            tensor_list, tensor, src
+            tensor_list, tensor, src_rank_in_group
         )

-    task = group.process_group.scatter(tensor_list, tensor, src, sync_op)
+    task = group.process_group.scatter(
+        tensor_list, tensor, src_rank_in_group, sync_op
+    )
     if sync_op:
         task.wait()

     return task


+def _scatter_in_static_mode(
+    tensor,
+    tensor_or_tensor_list,
+    src_rank_in_group,
+    group,
+    sync_op,
+    use_calc_stream,
+):
+    nranks = dist.get_world_size() if group is None else group.nranks
+    rank = dist.get_rank()
+
+    input_tensor = tensor_or_tensor_list
+    if isinstance(tensor_or_tensor_list, list):
+        tensor_list = tensor_or_tensor_list
+        if rank == src_rank_in_group:
+            if len(tensor_list) == 0:
+                raise RuntimeError(
+                    "The tensor_list should not be empty on src rank."
+                )
+        else:
+            tensor_list = [tensor for _ in range(nranks)]
+        input_tensor = paddle.concat(tensor_list, axis=0)
+
+    ring_id = 0 if group is None else group.id
+
+    data_feeder.check_variable_and_dtype(
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', 'bool'],
+        'scatter',
+    )
+
+    op_type = 'c_scatter'
+    helper = layer_helper.LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [input_tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'root': src_rank_in_group,
+            'use_calc_stream': sync_op,
+            'nranks': nranks,
+        },
+    )
+    return None
+
+
 def scatter(
     tensor,
     tensor_or_tensor_list=None,
...
@@ -146,25 +199,34 @@ def scatter(
             # [1, 2, 3] (2 GPUs, out for rank 0)
             # [4, 5, 6] (2 GPUs, out for rank 1)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
             "use_calc_stream can only be true in sync op behavior."
         )

     if tensor_or_tensor_list is None:
         raise RuntimeError("The input should be specified.")

     # NOTE(liyurui): Only the source rank needs to specify the tensor_or_tensor_list argument.
     # Other ranks which pass this argument in will be ignored with a warning.
+    # If a tensor_list is passed in, we need to concat it to a tensor before invoking the C++ API.
+    # If a tensor is passed in, concat is not needed.
+    # The passed-in type for a non-src rank is meaningless, for it will be ignored.
     if src != dist.get_rank():
         if tensor_or_tensor_list is not None:
             warnings.warn(
                 "Specific `tensor_or_tensor_list` is meaningless for rank which is not src."
             )
         tensor_or_tensor_list = []

     if framework.in_dygraph_mode():
+        group = _get_global_group() if group is None else group
+        src_rank_in_group = _get_or_throw_group_rank(src, group)
         if paddle.is_tensor(tensor_or_tensor_list):
             return _scatter_tensor_in_dygraph(
                 tensor,
                 tensor_or_tensor_list,
-                src,
+                src_rank_in_group,
                 group,
                 sync_op,
                 use_calc_stream,
...
@@ -173,12 +235,19 @@ def scatter(
             return _scatter_in_dygraph(
                 tensor,
                 tensor_or_tensor_list,
-                src,
+                src_rank_in_group,
                 group,
                 sync_op,
                 use_calc_stream,
             )
-
-    raise RuntimeError(
-        "paddle.distributed.stream.scatter is only supported in dygraph mode now."
-    )
+    else:
+        assert group is None, "Group can not be used in static mode for now."
+        return _scatter_in_static_mode(
+            tensor,
+            tensor_or_tensor_list,
+            src,
+            group,
+            sync_op,
+            use_calc_stream,
+        )
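The new static-graph branch feeds the c_scatter op a single concatenated tensor, so on the source rank a list input is first flattened along dim 0. A small sketch of that pre-processing, under the assumptions of the code above (variable names are illustrative and the chunking comment describes the intended scatter semantics):

    import paddle

    nranks = 2
    tensor_list = [paddle.to_tensor([1, 2, 3]), paddle.to_tensor([4, 5, 6])]
    # One correctly-sized chunk per rank, stacked along dim[0]:
    input_tensor = paddle.concat(tensor_list, axis=0)      # shape [6]
    rows_per_rank = input_tensor.shape[0] // nranks        # 3 elements per rank
    # c_scatter then hands rows [i * rows_per_rank, (i + 1) * rows_per_rank) to rank i.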
python/paddle/distributed/communication/stream/send.py
View file @ 99504cbb

...
@@ -13,21 +13,55 @@
 # limitations under the License.

 import paddle.fluid.framework as framework
-from paddle.distributed import collective
-
-
-def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream):
-    group = collective._get_default_group() if group is None else group
+import paddle.fluid.data_feeder as data_feeder
+import paddle.fluid.layer_helper as layer_helper
+from paddle.distributed.communication.group import (
+    _get_global_group,
+    _warn_cur_rank_not_in_group,
+    _get_or_throw_group_rank,
+)
+
+
+def _send_in_dygraph(
+    tensor, dst_rank_in_group, group, sync_op, use_calc_stream
+):
     if use_calc_stream:
-        return group.process_group.send_on_calc_stream(tensor, dst)
+        return group.process_group.send_on_calc_stream(
+            tensor, dst_rank_in_group
+        )

-    task = group.process_group.send(tensor, dst, sync_op)
+    task = group.process_group.send(tensor, dst_rank_in_group, sync_op)
     if sync_op:
         task.wait()

     return task


+def _send_in_static_mode(
+    tensor, dst_rank_in_group, group, sync_op, use_calc_stream
+):
+    op_type = 'send_v2'
+    data_feeder.check_variable_and_dtype(
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'send',
+    )
+
+    ring_id = 0 if group is None else group.id
+    helper = layer_helper.LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'peer': dst_rank_in_group,
+            'use_calc_stream': sync_op,
+        },
+    )
+    return None
+
+
 def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
     """
...
@@ -44,9 +78,6 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
     Returns:
         Return a task object.

-    Warning:
-        This API only supports the dygraph mode now.
-
     Examples:
         .. code-block:: python
...
@@ -66,10 +97,8 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
             out = data.numpy()
             # [[4, 5, 6], [4, 5, 6]] (2 GPUs)
     """
-    if group is not None and not group.is_member():
-        raise RuntimeError(
-            "The group should not be None and all ranks which invoke this operation should be the member of this group."
-        )
+    if _warn_cur_rank_not_in_group(group):
+        return

     if not sync_op and use_calc_stream:
         raise RuntimeError(
...
@@ -77,8 +106,14 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
         )

     if framework.in_dygraph_mode():
-        return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream)
-
-    raise RuntimeError(
-        "paddle.distributed.stream.send is only supported in dygraph mode now."
-    )
+        group = _get_global_group() if group is None else group
+        dst_rank_in_group = _get_or_throw_group_rank(dst, group)
+        return _send_in_dygraph(
+            tensor, dst_rank_in_group, group, sync_op, use_calc_stream
+        )
+    else:
+        assert group is None, "Group can not be used in static mode for now."
+        return _send_in_static_mode(
+            tensor, dst, group, sync_op, use_calc_stream
+        )
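Taken together with recv above, the typical dygraph usage remains a matched pair. A brief hedged sketch on 2 GPUs, mirroring the tensor values from the docstrings:

    # Matched send/recv pair on 2 GPUs; the data values mirror the docstrings.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
        task = dist.stream.send(data, dst=1, sync_op=False)
    else:
        data = paddle.empty([2, 3], dtype="int64")
        task = dist.stream.recv(data, src=0, sync_op=False)
    task.wait()
    # Afterwards data is [[4, 5, 6], [4, 5, 6]] on both ranks.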
python/paddle/distributed/fleet/layers/mpu/mp_ops.py
View file @ 99504cbb

...
@@ -22,7 +22,7 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_variable_and_dtype
 from paddle.fluid.dygraph import layers
 from paddle.distributed import collective
-from ....communication.reduce import ReduceOp
+from ....communication.reduce import ReduceOp, _get_reduce_op
 from paddle.fluid.data_feeder import check_dtype
 import paddle.fluid.dygraph_utils as dygraph_utils
...
@@ -61,7 +61,7 @@ def _c_identity(tensor, group=None):
     @staticmethod
     def backward(ctx, dy):
-        op_type = collective._get_reduce_op(ReduceOp.SUM, "_c_identity")
+        op_type = _get_reduce_op(ReduceOp.SUM, "_c_identity")
         group.process_group.allreduce_on_calc_stream(dy, op_type)
         return dy
...
@@ -254,7 +254,7 @@ def _mp_allreduce(
     ctx.ring_id = group.id

     if use_calc_stream:
-        op_type = collective._get_reduce_op(op, "_mp_allreduce")
+        op_type = _get_reduce_op(op, "_mp_allreduce")
         group.process_group.allreduce_on_calc_stream(tensor, op_type)
...
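For code outside this file that still resolves the reduce-op helper through the collective module, the migration is a one-line import swap; a minimal sketch assuming a Paddle build that already contains this commit:

    # Before this commit (old location, via the collective module):
    #   op_type = collective._get_reduce_op(ReduceOp.SUM, "_c_identity")
    # After this commit the helper lives in the communication package:
    from paddle.distributed.communication.reduce import ReduceOp, _get_reduce_op

    op_type = _get_reduce_op(ReduceOp.SUM, "_c_identity")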
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
View file @ 99504cbb

...
@@ -27,13 +27,13 @@ import numpy as np
 from collections import OrderedDict

 import paddle
+import paddle.distributed as dist
 from paddle.fluid import core
 from paddle.optimizer import Optimizer
 from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.distributed.collective import (
     _get_global_group,
     new_group,
-    broadcast,
     wait,
 )
...
@@ -169,7 +169,7 @@ class ShardingOptimizerStage2(Optimizer):
         """
         for p in self._local_params:
-            broadcast(
+            dist.broadcast(
                 p, src=self._global_root_rank, group=self.group, sync_op=True
             )
...
@@ -456,7 +456,7 @@ class ShardingOptimizerStage2(Optimizer):
         # Exchange all the shards with the other ranks
         for dtype_per_rank in self.param_storages.values():
             for dst_rank, internal_storage in dtype_per_rank.items():
-                broadcast(
+                dist.broadcast(
                     tensor=internal_storage.buffer,
                     src=self.group.ranks[dst_rank],
                     group=self.group,
...
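The same mechanical substitution repeats through the sharding code below: the free function imported from paddle.distributed.collective is replaced by the public paddle.distributed API with identical arguments. A condensed sketch of the pattern (the wrapper function name is illustrative, not from the diff):

    import paddle.distributed as dist

    def sync_param(p, root_rank, group):
        # Old code: from paddle.distributed.collective import broadcast
        #           broadcast(p, src=root_rank, group=group, sync_op=True)
        # New code: the same call goes through the public namespace.
        dist.broadcast(p, src=root_rank, group=group, sync_op=True)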
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
View file @ 99504cbb

...
@@ -28,6 +28,7 @@ import warnings
 from collections import OrderedDict

 import paddle
+import paddle.distributed as dist
 from paddle.fluid import core
 from paddle.optimizer import Optimizer
 from paddle.fluid.clip import ClipGradByGlobalNorm
...
@@ -38,7 +39,6 @@ HybridParallelClipGrad = (
 )
 from paddle.distributed.collective import (
     _get_global_group,
-    broadcast,
     new_group,
 )
...
@@ -206,12 +206,12 @@ class GroupShardedOptimizerStage2(Optimizer):
         """
         for p in self._local_params:
-            broadcast(
+            dist.broadcast(
                 p, src=self._global_root_rank, group=self._group, sync_op=True
             )

             if self._dp_group:
-                broadcast(
+                dist.broadcast(
                     p,
                     src=self._dp_group.ranks[0],
                     group=self._dp_group,
...
@@ -562,7 +562,7 @@ class GroupShardedOptimizerStage2(Optimizer):
         else:
             for dtype_per_rank in self.param_storages.values():
                 for dst_rank, internal_storage in dtype_per_rank.items():
-                    broadcast(
+                    dist.broadcast(
                         tensor=internal_storage.buffer,
                         src=self._group.ranks[dst_rank],
                         group=self._group,
...
@@ -590,7 +590,7 @@ class GroupShardedOptimizerStage2(Optimizer):
             if x.trainable:
                 group = self._broadcast_groups[group_idx]
                 group_idx = (group_idx + 1) % self._number_of_broadcast_groups
-                task = broadcast(
+                task = dist.broadcast(
                     tensor=x,
                     src=group.ranks[self._param2rank[x.name]],
                     group=group,
...
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py
View file @ 99504cbb

...
@@ -27,6 +27,7 @@ from functools import reduce
 from types import MethodType

 import paddle
+import paddle.distributed as dist
 from paddle import nn
 from paddle.distributed import collective
 from paddle.distributed.utils.log_utils import get_logger
...
@@ -324,12 +325,12 @@ class GroupShardedStage2(nn.Layer):
         """
         for buffer in self._layer.buffers(include_sublayers=True):
-            collective.broadcast(
+            dist.broadcast(
                 buffer, self._global_root_rank, self._group, sync_op=True
             )

             if self._dp_group and self._dp_group.nranks > 1:
-                collective.broadcast(
+                dist.broadcast(
                     buffer,
                     self._dp_group.ranks[0],
                     self._dp_group,
...
@@ -402,7 +403,7 @@ class GroupShardedStage2(nn.Layer):
                 # Synchronize the reduce parameter gradient asynchronize
                 self._sharding_optimizers[0]._update_task(
-                    collective.reduce(
+                    dist.reduce(
                         tensor=param.grad,
                         dst=self._group.ranks[dst_rank],
                         group=self._group,
...
@@ -415,7 +416,7 @@ class GroupShardedStage2(nn.Layer):
                         not self._reduce_overlap
                     ), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.'
                     # TODO(wuhuachao): after the new communication lib upgrading, overlapping the comm of dp + stage2.
-                    collective.all_reduce(
+                    dist.all_reduce(
                         tensor=param.grad,
                         group=self._dp_group,
                         sync_op=True,
...
@@ -469,7 +470,7 @@ class GroupShardedStage2(nn.Layer):
                 grad_storage.sent = True
                 # Synchronize the reduce parameter gradient asynchronize
                 self._sharding_optimizers[0]._update_task(
-                    collective.reduce(
+                    dist.reduce(
                         tensor=grad_storage.buffer,
                         dst=self._group.ranks[grad_storage.destination],
                         group=self._group,
...
@@ -482,7 +483,7 @@ class GroupShardedStage2(nn.Layer):
                         not self._reduce_overlap
                     ), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.'
                     # TODO(wuhuachao): after the new communication lib upgrading, overlapping the comm of dp + stage2.
-                    collective.all_reduce(
+                    dist.all_reduce(
                         tensor=grad_storage.buffer,
                         group=self._dp_group,
                         sync_op=True,
...
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
View file @ 99504cbb

...
@@ -18,6 +18,7 @@ from types import MethodType
 from collections import OrderedDict

 import paddle
+import paddle.distributed as dist
 from paddle import nn
 from paddle.autograd import PyLayer
 import paddle.fluid.core as core
...
@@ -196,7 +197,7 @@ class GroupShardedStage3(nn.Layer):
         """
         for p in self._layer.parameters():
-            collective.broadcast(
+            dist.broadcast(
                 p, src=self._global_root_rank, group=self._group, sync_op=True
             )
...
@@ -493,7 +494,7 @@ class GroupShardedStage3(nn.Layer):
         """
         for buffer in self._layer.buffers(include_sublayers=True):
-            collective.broadcast(
+            dist.broadcast(
                 buffer, self._global_root_rank, self._group, sync_op=True
             )
...
@@ -536,7 +537,7 @@ class GroupShardedStage3(nn.Layer):
         # 2.Handle unslice param
         for grad_storage in self._grad_storages.values():
             grad_storage.buffer.scale_(scale=self._world_size_scaling)
-            collective.all_reduce(tensor=grad_storage.buffer, group=self._group)
+            dist.all_reduce(tensor=grad_storage.buffer, group=self._group)
             if self._offload:
                 for param in list(self._unslice_params):
                     param._clear_data()
...
@@ -600,7 +601,7 @@ class GroupShardedStage3(nn.Layer):
         if param.name in self._task_flow.full_grad.keys():
             full_grad = self._task_flow.full_grad[param.name]
             # Only support sync allreduce current rank's layer now
-            collective.all_reduce(tensor=full_grad, group=self._group)
+            dist.all_reduce(tensor=full_grad, group=self._group)

             start, end = self._param2buffer[param.name][self._rank]
             if param.bw_storage is None:
...
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py
View file @ 99504cbb

...
@@ -30,8 +30,9 @@ from collections import deque
 from types import MethodType

 import paddle
+import paddle.distributed as dist
 from paddle import nn
-from paddle.distributed import collective as dist
+from paddle.distributed import collective
 from paddle.distributed.collective import _get_global_group

 from ...utils.internal_storage import GradStorage
...
@@ -92,7 +93,7 @@ class ShardingStage2(nn.Layer):
         # Communication related attributes
         self._group = (
-            dist.new_group(_get_global_group().ranks)
+            collective.new_group(_get_global_group().ranks)
             if group is None
             else group
         )
...
@@ -317,7 +318,7 @@ class ShardingStage2(nn.Layer):
                 buffer, self._global_root_rank, self._group, sync_op=True
             )
             # Multi stream operation will be supported later
-            dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
+            collective.wait(tensor=buffer, group=self._group, use_calc_stream=True)

     def __getattr__(self, name):
         """Forward missing attributes to wrapped layer."""
...
@@ -381,7 +382,7 @@ class ShardingStage2(nn.Layer):
                 )
                 # Multi stream operation will be supported later
-                dist.wait(
+                collective.wait(
                     tensor=param.grad,
                     group=self._group,
                     use_calc_stream=True,
...
@@ -447,7 +448,7 @@ class ShardingStage2(nn.Layer):
                 )
                 # Multi stream operation will be supported later
-                dist.wait(
+                collective.wait(
                     tensor=grad_storage.buffer,
                     group=self._group,
                     use_calc_stream=True,
...
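The older sharding_stage2/3 files previously aliased the collective module as dist, which this commit untangles. A short sketch of the resulting convention; the wrapper function is illustrative, while the two calls inside it are taken from the diff:

    # After this commit the two names are kept distinct:
    import paddle.distributed as dist          # public ops: dist.broadcast, dist.all_reduce, ...
    from paddle.distributed import collective  # remaining helpers: collective.new_group, collective.wait

    def example(buffer, group):
        # Public API call keeps the `dist.` prefix.
        dist.all_reduce(tensor=buffer, group=group, sync_op=True)
        # Helpers that stayed in the collective module go through `collective.`.
        collective.wait(tensor=buffer, group=group, use_calc_stream=True)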
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py
View file @ 99504cbb

...
@@ -18,12 +18,13 @@ from types import MethodType
 from collections import OrderedDict

 import paddle
+import paddle.distributed as dist
 from paddle import nn
 from paddle.autograd import PyLayer
 import paddle.fluid.core as core
 from paddle.fluid.framework import ParamBase
 from paddle.fluid.clip import ClipGradByGlobalNorm
-from paddle.distributed import collective as dist
+from paddle.distributed import collective
 from paddle.distributed.collective import _get_global_group

 from .sharding_utils import Type, ShardingClipGrad, device_guard
...
@@ -101,7 +102,7 @@ class ShardingStage3(nn.Layer):
         # Communication group establishment
         self._group = (
-            dist.new_group(_get_global_group().ranks)
+            collective.new_group(_get_global_group().ranks)
             if group is None
             else group
         )
...
@@ -183,7 +184,7 @@ class ShardingStage3(nn.Layer):
             )
             # Multi stream operation will be supported later
-            dist.wait(tensor=p, group=self._group, use_calc_stream=True)
+            collective.wait(tensor=p, group=self._group, use_calc_stream=True)

     def _clear_gradients(self):
         assert len(self._trainable_params.keys()) > 0
...
@@ -484,7 +485,7 @@ class ShardingStage3(nn.Layer):
                 buffer, self._global_root_rank, self._group, sync_op=True
             )
             # Multi stream operation will be supported later
-            dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)
+            collective.wait(tensor=buffer, group=self._group, use_calc_stream=True)

     def __getattr__(self, name):
         """Forward missing attributes to wrapped layer."""
...
@@ -528,7 +529,7 @@ class ShardingStage3(nn.Layer):
             dist.all_reduce(
                 tensor=grad_storage.buffer, group=self._group, sync_op=True
             )
-            dist.wait(
+            collective.wait(
                 tensor=grad_storage.buffer,
                 group=self._group,
                 use_calc_stream=True,
...
@@ -600,7 +601,7 @@ class ShardingStage3(nn.Layer):
             dist.all_reduce(tensor=full_grad, group=self._group, sync_op=True)
-            dist.wait(
+            collective.wait(
                 tensor=full_grad,
                 group=self._group,
                 use_calc_stream=True
             )
...
@@ -945,7 +946,7 @@ def _allgather_buffer(
         # Allgather current layer in the 1st step synchronously
        if sync_wait:
            with paddle.amp.auto_cast(enable=False):
-                dist.wait(
+                collective.wait(
                     tensor=full_param,
                     group=group,
                     use_calc_stream=use_calc_stream,
...
python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py
View file @ 99504cbb

...
@@ -17,6 +17,7 @@ import unittest
 import paddle
 import numpy as np
 import paddle.distributed as dist
+from paddle.distributed.communication.reduce_scatter import _reduce_scatter_base


 class TestCollectiveReduceScatter(unittest.TestCase):
...
@@ -75,9 +76,7 @@ class TestCollectiveReduceScatter(unittest.TestCase):
             # [1, 2, 3, 4]  # Rank-1

             output = paddle.empty(shape=[2], dtype=input.dtype)
-            task = paddle.distributed.collective._reduce_scatter_base(
-                output, input, sync_op=False
-            )
+            task = _reduce_scatter_base(output, input, sync_op=False)

             task.wait()
...
python/paddle/incubate/distributed/models/moe/grad_clip.py
View file @ 99504cbb

...
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import paddle.distributed as dist
 from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid import core, layers
-from paddle.distributed import collective


 class ClipGradForMOEByGlobalNorm(ClipGradBase):
...
@@ -185,9 +185,9 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
             moe_params_grads, sum_dtype
         )
         if global_norm_var_moe is not None:
-            collective.all_reduce(
+            dist.all_reduce(
                 global_norm_var_moe,
-                op=collective.ReduceOp.SUM,
+                op=dist.ReduceOp.SUM,
                 group=self.moe_group,
             )
...
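The MoE gradient-clipping change follows the same pattern, with the ReduceOp enum now also taken from the public namespace. A minimal sketch under that assumption (the wrapper function name is illustrative):

    import paddle.distributed as dist

    def sum_over_group(value, group):
        # Replaces collective.all_reduce(value, op=collective.ReduceOp.SUM, group=group)
        dist.all_reduce(value, op=dist.ReduceOp.SUM, group=group)
        return value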