机器未来 / Paddle · forked from PaddlePaddle / Paddle
Commit 479efeeb

Authored Mar 08, 2021 by root; committed by sandyhouse on Mar 22, 2021

update

Parent 9ed5ae61
Showing 2 changed files with 36 additions and 20 deletions (+36 −20)

python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py  +28 −14
python/paddle/fluid/optimizer.py  +8 −6
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py

@@ -206,13 +206,17 @@ class ShardingOptimizer(MetaOptimizerBase):
             # if self._shard.has_param(param_name):
             # param_list.append(param_name)
             #pp_optimizer._clear_gradients(main_block, param_list)
-            accumulated_gradient_names, first_optimize_op_index = pp_optimizer._accumulate_gradients(
+            accumulated_grad_names = pp_optimizer._accumulate_gradients(
                 main_block)
+            accumulated_grad_names = sorted(accumulated_grad_names)
+            print(accumulated_grad_names)
+            first_optimize_op_index = get_first_check_finite_and_unscale_op_idx(
+                main_block)
             insert_reduce_ops(
                 main_block,
                 first_optimize_op_index,
                 self.sharding_ring_id,
-                accumulated_gradient_names,
+                accumulated_grad_names,
                 self._shard,
                 OpRole.Optimize,
                 use_calc_stream=True)
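Note on the change above (an added editorial aside, not part of the commit): sorting accumulated_grad_names before calling insert_reduce_ops makes the order of the emitted reduce ops deterministic, which matters because every rank in the sharding ring must launch the same collective ops in the same order. A minimal, Paddle-independent sketch of that idea (schedule_reduces and do_reduce are hypothetical names, not Paddle API):

# Illustrative sketch only: why each rank sorts the gradient names before
# issuing collective reduce calls. Nothing here is Paddle API.
def schedule_reduces(grad_names, do_reduce):
    # Sorting gives every rank the same launch order for the same set of names.
    for name in sorted(grad_names):
        do_reduce(name)

# Two "ranks" that collected the same gradients in different orders.
rank0, rank1 = [], []
schedule_reduces(["fc_1.w_0@GRAD", "fc_0.w_0@GRAD"], rank0.append)
schedule_reduces(["fc_0.w_0@GRAD", "fc_1.w_0@GRAD"], rank1.append)
assert rank0 == rank1  # identical reduce order on every rank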
@@ -466,15 +470,26 @@ class ShardingOptimizer(MetaOptimizerBase):
                 self._main_program.global_block())

     def _wait(self, ):
         # only the first parallelsm group that init nccl need to be wait.
-        if self._as_outer_parallelism:
-            endpoints = self.global_group_endpoints[:]
-        else:
-            endpoints = self.sharding_group_endpoints[:]
+        endpoints = self.role_maker._get_trainer_endpoints()
         current_endpoint = endpoints[self.role_maker._worker_index()]
-        if self.sharding_rank == 0:
+        if self.role_maker._worker_index() == 0:
             self._collective_helper._wait(current_endpoint, endpoints)

+    # def _wait(self, ):
+    #     # only the first parallelsm group that init nccl need to be wait.
+    #     if self._as_outer_parallelism:
+    #         endpoints = self.role_maker._get_trainer_endpoints()
+    #     else:
+    #         endpoints = self.sharding_group_endpoints[:]
+    #     current_endpoint = endpoints[self.role_maker._worker_index()]
+    #     if self._as_outer_parallelism:
+    #         if self.role_maker._worker_index() == 0:
+    #             self._collective_helper._wait(current_endpoint, endpoints)
+    #     else:
+    #         if self.sharding_rank == 0:
+    #             self._collective_helper._wait(current_endpoint, endpoints)
+
     def _split_program(self, block):
         for op_idx, op in reversed(list(enumerate(block.ops))):
             if int(op.attr('op_role')) != int(OpRole.Optimize):
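Note on _wait above (editorial aside, not part of the commit): after the change, the method always takes the full trainer endpoint list from the role maker and only the worker with global index 0 calls _collective_helper._wait, instead of branching on _as_outer_parallelism and sharding_rank. A small sketch of just the selection logic, using a stand-in role maker rather than Paddle's fleet role maker:

# Stand-in objects that mimic only the two role-maker calls used by _wait.
class StubRoleMaker:
    def __init__(self, endpoints, worker_index):
        self._endpoints = list(endpoints)
        self._index = worker_index

    def _get_trainer_endpoints(self):
        return self._endpoints

    def _worker_index(self):
        return self._index

def wait_decision(role_maker):
    endpoints = role_maker._get_trainer_endpoints()
    current_endpoint = endpoints[role_maker._worker_index()]
    # Only the first worker of the whole job blocks in the wait/barrier.
    return role_maker._worker_index() == 0, current_endpoint

print(wait_decision(StubRoleMaker(["10.0.0.1:6170", "10.0.0.2:6170"], 0)))
# (True, '10.0.0.1:6170')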
@@ -804,10 +819,10 @@ class ShardingOptimizer(MetaOptimizerBase):

     def _init_comm(self):
         # sharding alone mode
-        self.sharding_ring_id = 0
-        self.sharding_rank = self.global_rank
-        self.sharding_group_endpoints = self.endpoints[:]
-        self.sharding_group_size = len(self.endpoints)
+        # self.sharding_ring_id = 0
+        # self.sharding_rank = self.global_rank
+        # self.sharding_group_endpoints = self.endpoints[:]
+        # self.sharding_group_size = len(self.endpoints)

         if self.hybrid_dp:
             assert self._as_outer_parallelism == False, "hybrid dp is conflict when using sharding as outer parallelism"
@@ -828,8 +843,7 @@ class ShardingOptimizer(MetaOptimizerBase):
                 ep for idx, ep in enumerate(self.endpoints)
                 if (idx % self.sharding_group_size) == self.sharding_rank
             ]
-            self.global_group_endpoints = self.role_maker._get_trainer_endpoints(
-            )[:]
+            # self.global_group_endpoints = self.role_maker._get_trainer_endpoints()[:]
             assert self.global_word_size > self.sharding_group_size, \
                 "global_word_size: {} should be larger than sharding_group_size: {}".format(
                 self.global_word_size, self.sharding_group_size)
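Note on the endpoint grouping shown in the context above (editorial aside, not part of the commit): the comprehension keeps the endpoints whose index satisfies idx % sharding_group_size == sharding_rank, i.e. the trainers that occupy the same position in every sharding group. A worked example with made-up addresses:

# Worked example of the modulo-based endpoint selection from the diff context.
endpoints = ["10.0.0.%d:6170" % i for i in range(8)]  # 8 trainers, made-up IPs
sharding_group_size = 4

for sharding_rank in range(sharding_group_size):
    peers = [
        ep for idx, ep in enumerate(endpoints)
        if (idx % sharding_group_size) == sharding_rank
    ]
    print(sharding_rank, peers)
# 0 ['10.0.0.0:6170', '10.0.0.4:6170']
# 1 ['10.0.0.1:6170', '10.0.0.5:6170']
# 2 ['10.0.0.2:6170', '10.0.0.6:6170']
# 3 ['10.0.0.3:6170', '10.0.0.7:6170']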
python/paddle/fluid/optimizer.py

@@ -4843,7 +4843,7 @@ class PipelineOptimizer(object):
         Accumulate the gradients generated in microbatch to the one in mini-batch.
         """
         # the name of real grad vars that should be allreduce
-        accumulated_gradient_names = []
-        first_optimize_op_index = None
+        # accumulated_gradient_names = []
+        accumulated_grad_names = []
@@ -4875,15 +4875,16 @@ class PipelineOptimizer(object):
             for i in range(0, len(op_role_var), 2):
                 offset = 0
                 param_name = op_role_var[i]
-                if not block.has_var(param_name): continue
+                # if not block.has_var(param_name): continue
                 if '@BroadCast' in param_name:
                     param_name = param_name[0:param_name.find('@BroadCast')]
                 # clear gradient
                 param_grad_name = self._append_grad_suffix(param_name)
+                accumulated_grad_names.append(param_grad_name)
                 if not block.has_var(param_grad_name):
-                    self._create_var(block, block.vars[param_name],
-                                     param_grad_name)
+                    self._create_var(block, self.origin_main_block.vars[param_name],
+                                     param_grad_name)
                 assert block.has_var(param_grad_name)
                 param_grad_var = block.var(param_grad_name)
                 param_grad_var.persistable = True
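Note on the gradient-name handling above (editorial aside, not part of the commit): _append_grad_suffix derives the accumulated-gradient variable name from the parameter name, after stripping the '@BroadCast' marker that sharding attaches to broadcast parameters; '@GRAD' is Paddle's usual gradient-variable suffix. A standalone sketch of that naming step with a stand-in helper (not the PipelineOptimizer method):

# Stand-in for the parameter-name -> accumulated-gradient-name derivation.
GRAD_SUFFIX = "@GRAD"  # Paddle's conventional gradient suffix

def grad_name_for(param_name):
    # Strip the sharding broadcast marker, if present, then append the suffix.
    if '@BroadCast' in param_name:
        param_name = param_name[0:param_name.find('@BroadCast')]
    return param_name + GRAD_SUFFIX

accumulated_grad_names = [grad_name_for(p)
                          for p in ["fc_0.w_0", "fc_0.b_0@BroadCast_0"]]
print(sorted(accumulated_grad_names))
# ['fc_0.b_0@GRAD', 'fc_0.w_0@GRAD']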
@@ -4924,7 +4925,7 @@ class PipelineOptimizer(object):
                             #self._op_role_var_key: op_role_var
                         })
                     #offset += 1
-                    accumulated_gradient_names.append(real_grad_var.name)
+                    # accumulated_gradient_names.append(param_grad_var.name)
                 else:
                     grad_name = op_role_var[i + 1]
                     # with _0 suffix
                     grad_var = block.vars[grad_name]
@@ -4961,7 +4962,7 @@ class PipelineOptimizer(object):
                             # self._op_role_var_key: op_role_var
                         })
                     offset += 1
-                    accumulated_gradient_names.append(fp32_grad_var.name)
+                    # accumulated_gradient_names.append(param_grad_var.name)
                 #real_grad_name = grad_name[0:grad_name.find(
                 #    '@GRAD')] + '@GRAD'
                 #real_grad_var = block.vars[
@@ -5150,6 +5151,7 @@ class PipelineOptimizer(object):
                  parameter_list=None,
                  no_grad_set=None):
         main_block = loss.block
+        self.origin_main_block = main_block
         if startup_program is None:
             startup_program = default_startup_program()
         optimize_ops, params_grads = self._optimizer.minimize(
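Note on the minimize change above (editorial aside, not part of the commit): caching self.origin_main_block = main_block lets _accumulate_gradients look up a parameter's original variable (see the _create_var change earlier) even after the program has been split per pipeline stage. A minimal sketch of that caching pattern with toy stand-ins for Block and Variable:

# Toy stand-ins: show why the optimizer keeps a handle to the original block.
class ToyVar:
    def __init__(self, shape, dtype):
        self.shape, self.dtype = shape, dtype

class ToyOptimizer:
    def minimize(self, main_block):
        # Cache the untouched block before any per-stage rewriting happens.
        self.origin_main_block = main_block
        return self

    def create_grad_like(self, param_name):
        # Copy shape/dtype from the original definition of the parameter.
        src = self.origin_main_block[param_name]
        return ToyVar(src.shape, src.dtype)

opt = ToyOptimizer().minimize({"fc_0.w_0": ToyVar((128, 64), "float32")})
grad = opt.create_grad_like("fc_0.w_0")
print(grad.shape, grad.dtype)  # (128, 64) float32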