Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f7784700
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f7784700
编写于
9月 22, 2022
作者:
Y
Yuang Liu
提交者:
GitHub
9月 22, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[interleave pp] sync recv for 1f1b (#46399)
上级
608181a9
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
32 addition
and
18 deletion
+32
-18
python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
...ddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+3
-2
python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
...ributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+29
-16
未找到文件。
python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
浏览文件 @
f7784700
...
...
@@ -526,7 +526,7 @@ class PipelineParallelWithInterleave(PipelineParallel):
self
.
set_virtual_pipeline_rank
(
0
)
self
.
input_tensors
[
0
].
append
(
p2p
.
recv_forward
(
self
.
is_pipeline_first_stage
()))
p2p
.
recv_forward
(
self
.
is_pipeline_first_stage
()
,
sync_recv
=
False
))
# run startup steps
for
micro_step
in
range
(
startup_steps
):
...
...
@@ -647,7 +647,8 @@ class PipelineParallelWithInterleave(PipelineParallel):
if
not
forward_only
:
if
all_startup_steps
:
self
.
output_tensor_grads
[
self
.
num_model_chunks
-
1
].
append
(
p2p
.
recv_backward
(
self
.
is_pipeline_last_stage
()))
p2p
.
recv_backward
(
self
.
is_pipeline_last_stage
(),
sync_recv
=
False
))
for
micro_step
in
range
(
steady_steps
,
num_steps
):
# cooldown loop
...
...
python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
浏览文件 @
f7784700
...
...
@@ -207,6 +207,7 @@ def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks,
rank_id
):
src_rank_in_group
=
src
if
group
is
None
else
group
.
get_group_rank
(
src
)
if
_in_legacy_dygraph
():
assert
use_calc_stream
return
_legacy_C_ops
.
partial_recv
(
tensor
.
detach
(),
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'peer'
,
src_rank_in_group
,
'num'
,
...
...
@@ -216,8 +217,11 @@ def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks,
elif
in_dygraph_mode
():
group
=
paddle
.
distributed
.
collective
.
_get_default_group
(
)
if
group
is
None
else
group
return
group
.
process_group
.
recv_partial
(
tensor
,
src_rank_in_group
,
task
=
group
.
process_group
.
recv_partial
(
tensor
,
src_rank_in_group
,
nranks
,
rank_id
)
if
use_calc_stream
:
task
.
wait
()
return
task
def
recv_partial
(
tensor
,
...
...
@@ -238,7 +242,7 @@ def recv_partial(tensor,
return
_partial_recv_op
(
tensor
,
group
,
use_calc_stream
,
ring_id
,
src_rank
,
nranks
,
rank_id
)
else
:
if
_in_legacy_dygraph
():
if
_in_legacy_dygraph
()
or
use_calc_stream
:
recv_op
=
paddle
.
distributed
.
recv
elif
in_dygraph_mode
():
recv_op
=
paddle
.
distributed
.
irecv
...
...
@@ -275,7 +279,11 @@ def allgather_partial(tensor,
nranks
,
rank_id
)
def
_p2p_helper
(
tensor_send_next
,
tensor_send_prev
,
recv_prev
,
recv_next
):
def
_p2p_helper
(
tensor_send_next
,
tensor_send_prev
,
recv_prev
,
recv_next
,
sync_recv
=
True
):
global
_hcg
tensor_recv_prev
=
None
...
...
@@ -354,7 +362,7 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
nranks
=
mp_degree
,
rank_id
=
mp_rank
,
group
=
_hcg
.
recv_prev_group
,
use_calc_stream
=
True
))
use_calc_stream
=
sync_recv
))
else
:
tasks
.
append
(
recv_partial
(
tensor_recv_prev
,
...
...
@@ -362,7 +370,7 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
nranks
=
mp_degree
,
rank_id
=
mp_rank
,
group
=
_hcg
.
recv_prev_group
,
use_calc_stream
=
True
))
use_calc_stream
=
sync_recv
))
if
tensor_send_next
is
not
None
:
if
isinstance
(
tensor_send_next
,
tuple
):
...
...
@@ -394,7 +402,7 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
nranks
=
mp_degree
,
rank_id
=
mp_rank
,
group
=
_hcg
.
recv_next_group
,
use_calc_stream
=
True
))
use_calc_stream
=
sync_recv
))
else
:
tasks
.
append
(
...
...
@@ -403,10 +411,10 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
nranks
=
mp_degree
,
rank_id
=
mp_rank
,
group
=
_hcg
.
recv_next_group
,
use_calc_stream
=
True
))
use_calc_stream
=
sync_recv
))
if
in_dygraph_mode
():
# wait i
send/i
recv tasks in eager dygraph mode with new comm library
if
not
sync_recv
and
in_dygraph_mode
():
# wait irecv tasks in eager dygraph mode with new comm library
for
task
in
tasks
:
assert
task
is
not
None
task
.
wait
()
...
...
@@ -443,7 +451,7 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next):
return
tensor_recv_prev
,
tensor_recv_next
def
recv_forward
(
pp_first_stage
):
def
recv_forward
(
pp_first_stage
,
sync_recv
=
True
):
if
pp_first_stage
:
input_tensor
=
None
else
:
...
...
@@ -454,18 +462,20 @@ def recv_forward(pp_first_stage):
input_tensor
,
_
=
_p2p_helper
(
tensor_send_next
=
None
,
tensor_send_prev
=
None
,
recv_prev
=
True
,
recv_next
=
False
)
recv_next
=
False
,
sync_recv
=
sync_recv
)
return
input_tensor
def
recv_backward
(
pp_last_stage
):
def
recv_backward
(
pp_last_stage
,
sync_recv
=
True
):
if
pp_last_stage
:
output_tensor_grad
=
None
else
:
_
,
output_tensor_grad
=
_p2p_helper
(
tensor_send_next
=
None
,
tensor_send_prev
=
None
,
recv_prev
=
False
,
recv_next
=
True
)
recv_next
=
True
,
sync_recv
=
sync_recv
)
return
output_tensor_grad
...
...
@@ -527,7 +537,8 @@ def send_forward_backward_recv_forward_backward(output_tensor,
tensor_send_next
=
output_tensor
,
tensor_send_prev
=
input_tensor_grad
,
recv_prev
=
recv_prev
,
recv_next
=
recv_next
)
recv_next
=
recv_next
,
sync_recv
=
False
)
return
input_tensor
,
output_tensor_grad
...
...
@@ -544,7 +555,8 @@ def send_forward_recv_forward(output_tensor, recv_prev):
input_tensor
,
_
=
_p2p_helper
(
tensor_send_next
=
output_tensor
,
tensor_send_prev
=
None
,
recv_prev
=
recv_prev
,
recv_next
=
False
)
recv_next
=
False
,
sync_recv
=
False
)
return
input_tensor
...
...
@@ -553,5 +565,6 @@ def send_backward_recv_backward(input_tensor_grad, recv_next):
_
,
output_tensor_grad
=
_p2p_helper
(
tensor_send_next
=
None
,
tensor_send_prev
=
input_tensor_grad
,
recv_prev
=
False
,
recv_next
=
recv_next
)
recv_next
=
recv_next
,
sync_recv
=
False
)
return
output_tensor_grad
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录