机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 77c010a0 (unverified)
Authored on Jul 13, 2022 by ShenLiang; committed via GitHub on Jul 13, 2022

fix bug of pp (#44276)

Parent: b1aa693e
Showing 1 changed file with 25 additions and 15 deletions (+25, -15)
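Why the change: paddle.distributed.send/recv address peers by global rank, but this file passed the group-local indices 0 and 1 straight through. That only happens to work when the pipeline p2p group contains global ranks 0 and 1; for any other pair of stages the hardcoded peer names the wrong process, or one outside the group entirely. The fix translates the local index through group.ranks. A minimal sketch of that translation, with a hypothetical FakeGroup standing in for a real communication group:

# Minimal sketch, assuming only that a communication group exposes .ranks,
# the ordered list of its members' global ranks (FakeGroup is hypothetical).
class FakeGroup:
    def __init__(self, ranks):
        self.ranks = ranks  # global ranks of the group's members, in order

# A p2p group between two later pipeline stages, e.g. global ranks 4 and 5:
group = FakeGroup(ranks=[4, 5])

src_rank = group.ranks[0]  # sender's global rank: 4, not the local index 0
dst_rank = group.ranks[1]  # receiver's global rank: 5, not the local index 1
assert (src_rank, dst_rank) == (4, 5)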
python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
@@ -54,25 +54,29 @@ class SendRecvMeta:
     def _recv_shape_dtype(self, group):
         # recv len(shape)
         dims = paddle.to_tensor([0])
-        paddle.distributed.recv(dims, src=0, group=group)
+        src_rank = group.ranks[0]
+
+        paddle.distributed.recv(dims, src=src_rank, group=group)
         dims = dims.item()
 
         # recv shape
         shape = paddle.to_tensor([0] * dims)
-        paddle.distributed.recv(shape, src=0, group=group)
+        paddle.distributed.recv(shape, src=src_rank, group=group)
 
         # recv dtype
         dtype = paddle.to_tensor([0])
-        paddle.distributed.recv(dtype, src=0, group=group)
+        paddle.distributed.recv(dtype, src=src_rank, group=group)
 
         # recv stop_gradient
         stop_grad = paddle.to_tensor([0])
-        paddle.distributed.recv(stop_grad, src=0, group=group)
+        paddle.distributed.recv(stop_grad, src=src_rank, group=group)
         return shape.numpy().tolist(), dtype.item(), stop_grad.item()
 
     def recv_meta(self, group):
         tensor_type = paddle.to_tensor([0])
-        paddle.distributed.recv(tensor_type, src=0, group=group)
+        src_rank = group.ranks[0]
+
+        paddle.distributed.recv(tensor_type, src=src_rank, group=group)
         tensor_type = tensor_type.item()
 
         if tensor_type == 0:
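The metadata exchange above is a fixed four-message handshake: number of dims, then the shape itself, then an integer dtype code, then the stop_gradient flag, and after this commit every receive names the sender's global rank. A minimal simulation of the handshake's ordering, with a plain queue standing in for the distributed channel (all names below are hypothetical stand-ins, not Paddle APIs):

# Minimal sketch of the four-message handshake in _send_dims_shape_dtype /
# _recv_shape_dtype, simulated with a queue instead of paddle.distributed.
from collections import deque

channel = deque()

def send_shape_dtype(shape, dtype_code, stop_gradient):
    channel.append(len(shape))          # 1. len(shape)
    channel.append(list(shape))         # 2. the shape itself
    channel.append(dtype_code)          # 3. integer dtype code
    channel.append(int(stop_gradient))  # 4. trainability flag

def recv_shape_dtype():
    ndims = channel.popleft()
    shape = channel.popleft()
    assert len(shape) == ndims          # shape buffer is sized from message 1
    return shape, channel.popleft(), bool(channel.popleft())

send_shape_dtype((8, 1024), dtype_code=5, stop_gradient=False)
print(recv_shape_dtype())  # ([8, 1024], 5, False)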
@@ -83,7 +87,7 @@ class SendRecvMeta:
         elif tensor_type == 1:
             num = paddle.to_tensor([0])
-            paddle.distributed.recv(num, src=0, group=group)
+            paddle.distributed.recv(num, src=src_rank, group=group)
             num = num.item()
             shapes = []
             dtypes = []
@@ -101,34 +105,38 @@ class SendRecvMeta:
     def _send_dims_shape_dtype(self, tensor, group):
         # send len(shape)
         dims = paddle.to_tensor(len(tensor.shape))
-        paddle.distributed.send(dims, dst=1, group=group)
+        dst_rank = group.ranks[1]
+
+        paddle.distributed.send(dims, dst=dst_rank, group=group)
 
         # send shape
         shape = paddle.to_tensor(tensor.shape)
-        paddle.distributed.send(shape, dst=1, group=group)
+        paddle.distributed.send(shape, dst=dst_rank, group=group)
 
         # send dtype
         dtype = paddle.to_tensor(paddle_2_number(tensor.dtype))
-        paddle.distributed.send(dtype, dst=1, group=group)
+        paddle.distributed.send(dtype, dst=dst_rank, group=group)
 
         # send trainable
         stop_grad = paddle.to_tensor(int(tensor.stop_gradient))
-        paddle.distributed.send(stop_grad, dst=1, group=group)
+        paddle.distributed.send(stop_grad, dst=dst_rank, group=group)
 
     def send_meta(self, tensor, group):
+        dst_rank = group.ranks[1]
+
         if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)):
             tensor_type = paddle.to_tensor([0])
             # send tensor type
-            paddle.distributed.send(tensor_type, dst=1, group=group)
+            paddle.distributed.send(tensor_type, dst=dst_rank, group=group)
 
             self._send_dims_shape_dtype(tensor, group)
         elif isinstance(tensor, tuple):
             tensor_type = paddle.to_tensor([1])
             # send tensor type
-            paddle.distributed.send(tensor_type, dst=1, group=group)
+            paddle.distributed.send(tensor_type, dst=dst_rank, group=group)
 
             nums = paddle.to_tensor(len(tensor))
-            paddle.distributed.send(nums, dst=1, group=group)
+            paddle.distributed.send(nums, dst=dst_rank, group=group)
 
             for d in tensor:
                 assert isinstance(d, (paddle.Tensor, core.eager.Tensor))
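send_meta prefixes the handshake with a type tag (0 for a single tensor, 1 for a tuple; a tuple adds an element count, then one metadata record per element), and recv_meta dispatches on that tag. A minimal sketch of the tag protocol, again with a queue standing in for the channel and dicts standing in for tensors (all hypothetical):

# Minimal sketch of the type-tag dispatch in send_meta / recv_meta.
from collections import deque

channel = deque()

def send_meta(obj):
    if isinstance(obj, dict):        # stand-in for a single paddle.Tensor
        channel.append(0)            # tag 0: one metadata record follows
        channel.append(obj)
    elif isinstance(obj, tuple):
        channel.append(1)            # tag 1: a count, then that many records
        channel.append(len(obj))
        channel.extend(obj)

def recv_meta():
    tag = channel.popleft()
    if tag == 0:
        return channel.popleft()
    num = channel.popleft()
    return tuple(channel.popleft() for _ in range(num))

send_meta(({"shape": [8, 64]}, {"shape": [8, 32]}))
print(recv_meta())  # ({'shape': [8, 64]}, {'shape': [8, 32]})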
@@ -166,6 +174,7 @@ def send_partial(tensor,
                  rank_id=0,
                  group=None,
                  use_calc_stream=True):
+    # dst: local rank in group
     if group is not None and not group.is_member():
         return
     ring_id = 0 if group is None else group.id
@@ -176,7 +185,7 @@ def send_partial(tensor,
                                    dst, 'num', nranks, 'id', rank_id)
     else:
         return paddle.distributed.send(tensor.detach(),
-                                       dst=dst,
+                                       dst=group.ranks[dst],
                                        group=group,
                                        use_calc_stream=use_calc_stream)
@@ -187,6 +196,7 @@ def recv_partial(tensor,
                  rank_id=0,
                  group=None,
                  use_calc_stream=True):
+    # src: local rank in group
     if group is not None and not group.is_member():
         return
     ring_id = 0 if group is None else group.id
@@ -198,7 +208,7 @@ def recv_partial(tensor,
                                 tensor.shape)
     else:
         paddle.distributed.recv(tensor.detach(),
-                                src=src,
+                                src=group.ranks[src],
                                 group=group,
                                 use_calc_stream=use_calc_stream)
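The comments added to send_partial/recv_partial pin down the calling convention: dst and src are group-local ranks. The partial_send/partial_recv op path consumes them as-is, while the paddle.distributed.send/recv fallback needs the same group.ranks translation as the metadata code. A minimal sketch of that rule (FakeGroup and to_global are hypothetical stand-ins):

# Minimal sketch: translate a group-local rank to the global rank that the
# paddle.distributed.send/recv fallback expects; the partial op path keeps
# using the local rank directly.
class FakeGroup:
    def __init__(self, ranks):
        self.ranks = ranks  # members' global ranks, indexed by local rank

def to_global(group, local_rank):
    return group.ranks[local_rank]

group = FakeGroup(ranks=[4, 5])
assert to_global(group, 0) == 4  # src for the receiving stage
assert to_global(group, 1) == 5  # dst for the sending stage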