Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
xuexixianjinzhishi
Paddle
提交
a97b9df0
P
Paddle
项目概览
xuexixianjinzhishi
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a97b9df0
编写于
3月 03, 2021
作者:
S
sandyhouse
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update
上级
5294e51c
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
38 addition
and
15 deletion
+38
-15
python/paddle/distributed/fleet/meta_optimizers/common.py
python/paddle/distributed/fleet/meta_optimizers/common.py
+9
-3
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
...e/distributed/fleet/meta_optimizers/sharding_optimizer.py
+28
-12
python/paddle/fluid/device_worker.py
python/paddle/fluid/device_worker.py
+1
-0
未找到文件。
python/paddle/distributed/fleet/meta_optimizers/common.py
浏览文件 @
a97b9df0
...
...
@@ -66,14 +66,20 @@ class CollectiveHelper(object):
self
.
role_maker
.
_worker_index
(),
ring_id
,
self
.
wait_port
)
self
.
_broadcast_params
()
def
_init_communicator
(
self
,
program
,
current_endpoint
,
endpoints
,
rank
,
ring_id
,
wait_port
):
def
_init_communicator
(
self
,
program
,
current_endpoint
,
endpoints
,
rank
,
ring_id
,
wait_port
,
sync
=
True
):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
block
=
program
.
global_block
()
if
core
.
is_compiled_with_cuda
():
if
not
wait_port
:
if
not
wait_port
and
sync
:
temp_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'temp_var'
),
dtype
=
core
.
VarDesc
.
VarType
.
INT32
,
...
...
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
浏览文件 @
a97b9df0
...
...
@@ -96,6 +96,8 @@ class ShardingOptimizer(MetaOptimizerBase):
"use_pipeline"
]
self
.
acc_steps
=
self
.
user_defined_strategy
.
sharding_configs
[
"acc_steps"
]
self
.
schedule_mode
=
self
.
user_defined_strategy
.
sharding_configs
[
"schedule_mode"
]
if
self
.
inner_opt
is
None
:
raise
ValueError
(
...
...
@@ -105,6 +107,7 @@ class ShardingOptimizer(MetaOptimizerBase):
self
.
acc_steps
)
main_program
=
loss
.
block
.
program
main_program
.
_pipeline_opt
=
dict
()
main_program
.
_pipeline_opt
[
'schedule_mode'
]
=
self
.
schedule_mode
pp_rank
=
self
.
role_maker
.
_worker_index
()
//
(
self
.
user_defined_strategy
.
sharding_configs
[
'sharding_group_size'
]
*
self
.
_inner_parallelism_size
)
...
...
@@ -409,20 +412,33 @@ class ShardingOptimizer(MetaOptimizerBase):
print
(
"pp_group_endpoints:"
,
self
.
pp_group_endpoints
)
print
(
"pp_rank:"
,
self
.
pp_rank
)
print
(
"pp_ring_id:"
,
self
.
pp_ring_id
)
for
pair
in
self
.
pipeline_pair
:
if
self
.
pp_rank
not
in
pair
:
continue
pp_group_endpoints
=
[
self
.
pp_group_endpoints
[
pair
[
0
]],
self
.
pp_group_endpoints
[
pair
[
1
]],
]
if
pair
[
0
]
<
pair
[
1
]:
start_ring_id
=
self
.
pp_ring_id
+
pair
[
1
]
-
pair
[
0
]
-
1
else
:
start_ring_id
=
self
.
pp_ring_id
+
2
+
pair
[
0
]
-
pair
[
1
]
-
1
pp_rank
=
0
if
self
.
pp_rank
==
pair
[
0
]
else
1
if
self
.
schedule_mode
==
0
:
# GPipe
self
.
_collective_helper
.
_init_communicator
(
self
.
_startup_program
,
self
.
current_endpoint
,
self
.
pp_group_endpoints
,
self
.
pp_rank
,
self
.
pp_ring_id
,
False
)
self
.
_collective_helper
.
_init_communicator
(
self
.
_startup_program
,
self
.
current_endpoint
,
pp_group_endpoints
,
pp_rank
,
start_ring_id
,
False
)
self
.
pp_group_endpoints
,
self
.
pp_rank
,
self
.
pp_ring_id
+
2
,
False
)
else
:
for
pair
in
self
.
pipeline_pair
:
print
(
"pp pair:{}"
.
format
(
pair
))
if
self
.
pp_rank
not
in
pair
:
continue
pp_group_endpoints
=
[
self
.
pp_group_endpoints
[
pair
[
0
]],
self
.
pp_group_endpoints
[
pair
[
1
]],
]
if
pair
[
0
]
<
pair
[
1
]:
start_ring_id
=
self
.
pp_ring_id
+
pair
[
1
]
-
pair
[
0
]
-
1
else
:
start_ring_id
=
self
.
pp_ring_id
+
2
+
pair
[
0
]
-
pair
[
1
]
-
1
pp_rank
=
0
if
self
.
pp_rank
==
pair
[
0
]
else
1
self
.
_collective_helper
.
_init_communicator
(
self
.
_startup_program
,
self
.
current_endpoint
,
pp_group_endpoints
,
pp_rank
,
start_ring_id
,
False
,
False
)
startup_block
=
self
.
_startup_program
.
global_block
()
startup_block
.
_sync_with_cpp
()
...
...
python/paddle/fluid/device_worker.py
浏览文件 @
a97b9df0
...
...
@@ -415,6 +415,7 @@ class Section(DeviceWorker):
section_param
.
start_cpu_core_id
=
pipeline_opt
[
"start_cpu_core_id"
]
section_param
.
pipeline_stage
=
pipeline_opt
[
"pipeline_stage"
]
section_param
.
num_pipeline_stages
=
pipeline_opt
[
"num_pipeline_stages"
]
section_param
.
schedule_mode
=
pipeline_opt
[
"schedule_mode"
]
cfg
=
section_param
.
section_config
program
=
pipeline_opt
[
"section_program"
]
cfg
.
program_desc
.
ParseFromString
(
program
[
"program"
].
_get_desc
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录