PaddlePaddle / Paddle, commit bbf31a4e (unverified)
Authored by zhaoyingli on Feb 18, 2022; committed via GitHub on Feb 18, 2022.
bug fix (#39630)
Parent commit: 8c7ee8c2
Showing 10 changed files with 56 additions and 27 deletions.
Changed files:

    python/paddle/distributed/auto_parallel/completion.py  (+26 -4)
    python/paddle/distributed/auto_parallel/cost_model.py  (+2 -1)
    python/paddle/distributed/auto_parallel/dist_op.py  (+1 -1)
    python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py  (+2 -2)
    python/paddle/distributed/auto_parallel/parallelizer.py  (+2 -0)
    python/paddle/distributed/auto_parallel/partitioner.py  (+1 -1)
    python/paddle/distributed/auto_parallel/reshard.py  (+7 -6)
    python/paddle/distributed/passes/auto_parallel_amp.py  (+13 -11)
    python/paddle/distributed/passes/auto_parallel_recompute.py  (+1 -0)
    python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py  (+1 -1)
python/paddle/distributed/auto_parallel/completion.py

@@ -442,7 +442,7 @@ class Completer:
                 assert forward_op is not None
                 if grad_op.type == "concat" and forward_op.type == "split":
-                    forward_op_dist_attr = dist_context.get_op_dist_attr_for_program(
+                    forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program(
                         forward_op)
                     output_var = vars[grad_op.desc.output('Out')[0]]
                     split_input_var_name = forward_op.input("X")[0]
@@ -458,14 +458,14 @@ class Completer:
                     output_var_dist_attr = TensorDistributedAttribute()
                     output_var_dist_attr.dims_mapping = ref_dims_mapping
                     output_var_dist_attr.process_mesh = ref_mesh
-                    dist_context.set_tensor_dist_attr_for_program(
+                    self._dist_context.set_tensor_dist_attr_for_program(
                         output_var, output_var_dist_attr)

                     grad_op_dist_attr.set_output_dims_mapping(output_var.name,
                                                               ref_dims_mapping)
                     grad_op_dist_attr.process_mesh = ref_mesh
-                    dist_context.set_op_dist_attr_for_program(grad_op,
-                                                              grad_op_dist_attr)
+                    self._dist_context.set_op_dist_attr_for_program(
+                        grad_op, grad_op_dist_attr)
                     continue

                 # op dist attr
@@ -579,6 +579,28 @@ class Completer:
             # TODO to add attribute for moment var
             op = ops[idx]
             if int(op.attr('op_role')) == int(OpRole.Optimize):
+                if op.type == "clip_by_norm":
+                    param_grad = vars[op.input("X")[0]]
+                    param_grad_dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
+                        param_grad)
+                    assert param_grad_dist_attr is not None
+                    ref_process_mesh = param_grad_dist_attr.process_mesh
+                    ref_dims_mapping = param_grad_dist_attr.dims_mapping
+
+                    out = vars[op.output("Out")[0]]
+                    out_dist_attr = TensorDistributedAttribute()
+                    out_dist_attr.process_mesh = ref_process_mesh
+                    out_dist_attr.dims_mapping = ref_dims_mapping
+                    self._dist_context.set_tensor_dist_attr_for_program(
+                        out, out_dist_attr)
+
+                    op_dist_attr = OperatorDistributedAttribute()
+                    op_dist_attr.process_mesh = ref_process_mesh
+                    op_dist_attr.set_input_dist_attr(param_grad.name,
+                                                     param_grad_dist_attr)
+                    op_dist_attr.set_output_dist_attr(out.name, out_dist_attr)
+                    self._dist_context.set_op_dist_attr_for_program(op,
+                                                                    op_dist_attr)
+
                 if "Grad" in op.input_names and "Param" in ops[idx].input_names:
                     assert len(op.input(
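A note on the new clip_by_norm branch (my reading of the change, not stated in the commit message): clip_by_norm acts elementwise on a single parameter gradient, so its output tensor and the op itself can inherit the gradient's process mesh and dims mapping; the other two hunks appear to replace a stale local name dist_context with the completer's own self._dist_context. A minimal, self-contained sketch of that "inherit the input annotation" pattern, using stand-in classes rather than Paddle's TensorDistributedAttribute/OperatorDistributedAttribute:

    from dataclasses import dataclass, field

    @dataclass
    class TensorDistAttr:                      # stand-in, not Paddle's class
        process_mesh: list
        dims_mapping: list

    @dataclass
    class OpDistAttr:                          # stand-in, not Paddle's class
        process_mesh: list
        inputs: dict = field(default_factory=dict)
        outputs: dict = field(default_factory=dict)

    def complete_elementwise_op(in_attr):
        # The output keeps the same mesh and the same sharding as its input.
        out_attr = TensorDistAttr(in_attr.process_mesh, list(in_attr.dims_mapping))
        op_attr = OpDistAttr(in_attr.process_mesh,
                             inputs={"X": in_attr}, outputs={"Out": out_attr})
        return out_attr, op_attr

    grad_attr = TensorDistAttr(process_mesh=[0, 1, 2, 3], dims_mapping=[0, -1])
    out_attr, op_attr = complete_elementwise_op(grad_attr)
    assert out_attr.dims_mapping == grad_attr.dims_mapping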
python/paddle/distributed/auto_parallel/cost_model.py

@@ -142,7 +142,8 @@ class TensorCostNode(CostNode):
         elif node.dtype == paddle.uint8:
             self.dtype_factor = 1
         else:
-            raise NotImplementedError("{} not counted".format(node.dtype))
+            self.dtype_factor = 2
+            # raise NotImplementedError("{} not counted".format(node.dtype))
         self.batch_size = None
         if batch_size is not None:
             self.batch_size = batch_size
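dtype_factor looks like a bytes-per-element multiplier for the cost model (the visible branch maps uint8 to 1), and the change makes dtypes the model does not list default to a 2-byte estimate instead of raising NotImplementedError. A small stand-alone sketch of that fallback, using a lookup table of my own rather than Paddle's dtype objects:

    # Illustrative lookup, not Paddle's code: unknown dtypes are now costed
    # with an assumed 2-byte width instead of raising.
    DTYPE_BYTES = {"float32": 4, "int32": 4, "int64": 8, "float16": 2, "uint8": 1}

    def dtype_factor(dtype_name):
        return DTYPE_BYTES.get(dtype_name, 2)   # fallback replaces the old raise

    assert dtype_factor("uint8") == 1
    assert dtype_factor("bfloat16") == 2        # previously: NotImplementedError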
python/paddle/distributed/auto_parallel/dist_op.py

@@ -86,7 +86,7 @@ class DistributedOperator:
                 tensor_dims_mapping)
         for tensor_name in self._serial_op.output_arg_names:
             tensor = self._serial_op.block._var_recursive(tensor_name)
-            if tensor.type == core.VarDesc.VarType.READER:
+            if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES:
                 tensor_shape = []
             else:
                 tensor_shape = tensor.shape
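STEP_SCOPES variables (the scope outputs of control-flow ops such as while) are not tensors and carry no usable shape, so they are now given an empty shape just like READER variables when the operator's dims mappings are validated; that is my reading of the intent. A tiny stand-alone sketch of the guard, with a stand-in enum instead of core.VarDesc.VarType:

    from enum import Enum, auto

    class VarType(Enum):                 # stand-in for core.VarDesc.VarType
        LOD_TENSOR = auto()
        READER = auto()
        STEP_SCOPES = auto()

    def effective_shape(var_type, shape):
        # Non-tensor variables get an empty shape for dims_mapping checks.
        if var_type in (VarType.READER, VarType.STEP_SCOPES):
            return []
        return list(shape)

    assert effective_shape(VarType.STEP_SCOPES, None) == []
    assert effective_shape(VarType.LOD_TENSOR, (8, 16)) == [8, 16]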
python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py

@@ -26,7 +26,7 @@ from ..process_group import new_process_group
 from ..dist_attribute import OperatorDistributedAttribute
 from paddle.distributed.auto_parallel.process_group import get_world_process_group

-global_process_mesh = get_world_process_group().ranks
+world_process_group = get_world_process_group()


 class DistributedCheckFiniteAndUnscale(DistributedOperatorImplContainer):
@@ -119,7 +119,7 @@ class DistributedCheckFiniteAndUnscaleImpl(DistributedOperatorImpl):
         main_block._sync_with_cpp()

         # sync result
-        group = new_process_group(global_process_mesh)
+        group = new_process_group(world_process_group.ranks)

         inf_var = main_block.var(kwargs['FoundInfinite'][0])
         inf_var_int32 = main_block.create_var(
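Hedged interpretation: the module used to freeze get_world_process_group().ranks into a module-level global_process_mesh at import time; keeping the group object and reading .ranks at the call site means the sync group is built from whatever ranks the world group holds when the op is actually partitioned. A self-contained sketch of why the lazy read can differ from the import-time snapshot (stand-in class, not Paddle's ProcessGroup):

    class ProcessGroup:                      # stand-in, not Paddle's class
        def __init__(self, ranks=None):
            self.ranks = list(ranks or [])

    world = ProcessGroup()                   # empty at "import time"

    snapshot = world.ranks                   # old pattern: cache the rank list
    group_handle = world                     # new pattern: keep the group object

    world.ranks = [0, 1, 2, 3]               # membership is filled in later

    assert snapshot == []                    # stale snapshot
    assert group_handle.ranks == [0, 1, 2, 3]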
python/paddle/distributed/auto_parallel/parallelizer.py

@@ -222,6 +222,8 @@ class AutoParallelizer:
         HAS_ALLGATHER.clear()
         _g_process_group_map.clear()
         _g_process_group_map[0] = ProcessGroup(0, [])
+        for process_mesh in dist_context._process_meshes:
+            _g_process_group_map[0].add_ranks(process_mesh.processes)
         return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog, g_process_group_map

     def parallelize(self,
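After the process-group map is cleared, group 0 (the world group) would otherwise be left without members; the added lines rebuild it from the processes of every process mesh in the distributed context, presumably so that later call-time .ranks reads (as in the other files of this commit) see a populated group. A small sketch of the repopulation pattern with stand-in types; the mesh values below are hypothetical:

    class Group:                                  # stand-in, not Paddle's ProcessGroup
        def __init__(self, gid, ranks):
            self.id, self.ranks = gid, list(ranks)
        def add_ranks(self, ranks):
            self.ranks.extend(r for r in ranks if r not in self.ranks)

    process_meshes = [[0, 1], [2, 3]]             # hypothetical mesh process lists

    group_map = {}
    group_map[0] = Group(0, [])
    for processes in process_meshes:
        group_map[0].add_ranks(processes)

    assert group_map[0].ranks == [0, 1, 2, 3]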
python/paddle/distributed/auto_parallel/partitioner.py

@@ -381,7 +381,7 @@ def _get_dist_op_backward_implement(backward_op, dist_context,
         op_dist_attr = dist_context.get_op_dist_attr_for_program(backward_op)
         assert op_dist_attr.impl_idx >= 0
         dist_op_impl = get_distributed_operator_impl_container(
-            backward_op.type).get_impl(op_dist_attr.impl_idx)
+            op_dist_attr.impl_type).get_impl(op_dist_attr.impl_idx)
         return dist_op_impl

     dist_op = get_distributed_operator_impl_container("default")
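The backward implementation container is now looked up by the impl_type recorded on the op's distributed attribute rather than by the raw backward_op.type. A plausible motivation (my reading, not stated in the commit): the key under which distributed implementations are registered need not equal the grad op's type string, so the annotation is the reliable handle. A toy registry sketch with made-up keys:

    # Toy registry: impls are registered under an impl key, so lookup should
    # go through the impl_type stored on the op's dist attr.
    REGISTRY = {
        "matmul_v2": ["column_parallel", "row_parallel"],
        "default": ["replicate"],
    }

    def get_impl(impl_type, impl_idx):
        return REGISTRY[impl_type][impl_idx]

    # A grad op whose own type is "matmul_v2_grad" still resolves through the
    # key kept in its dist attr.
    op_dist_attr = {"impl_type": "matmul_v2", "impl_idx": 1}
    assert get_impl(op_dist_attr["impl_type"], op_dist_attr["impl_idx"]) == "row_parallel"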
python/paddle/distributed/auto_parallel/reshard.py

@@ -1013,18 +1013,18 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id,
     assert isinstance(dist_context, DistributedContext), "The type of dist_context should be DistributedContext, " \
         "but got {}.".format(type(dist_context))

+    def _is_special_op(op):
+        global _g_special_ops
+        if op.type in _g_special_ops:
+            return True
+        return False
+
     block = auto_parallel_main_prog.global_block()
     idx = 0
     while idx < len(block.ops):
         pre_op_count = len(block.ops)
         op = block.ops[idx]

-        def _is_special_op(op):
-            global _g_special_ops
-            if op.type in _g_special_ops:
-                return True
-            return False
-
         if _is_special_op(op):
             idx += 1
             continue
@@ -1053,6 +1053,7 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id,
     # insert send and recv op if output process mesh is different from tensor process mesh
     idx = 0
     skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"]
+    skip_ops += _g_special_ops
     while idx < len(block.ops):
         pre_op_count = len(block.ops)
         op = block.ops[idx]
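Two small changes here: _is_special_op is defined once before the first loop instead of being re-created on every iteration, and the special op types are appended to skip_ops so that the send/recv insertion loop ignores them as well. A short sketch of the resulting skip logic; the op names below are placeholders, not the real contents of _g_special_ops:

    _g_special_ops = ["special_op_a", "special_op_b"]     # placeholder names

    def _is_special_op(op_type):                          # defined once, before any loop
        return op_type in _g_special_ops

    ops = ["read", "matmul_v2", "special_op_a", "relu"]   # hypothetical op sequence

    # First pass: special ops are skipped outright.
    first_pass = [op for op in ops if not _is_special_op(op)]

    # Second pass (send/recv insertion): reader ops and special ops are both skipped.
    skip_ops = ["create_py_reader", "create_double_buffer_reader", "read"]
    skip_ops += _g_special_ops
    second_pass = [op for op in ops if op not in skip_ops]

    assert first_pass == ["read", "matmul_v2", "relu"]
    assert second_pass == ["matmul_v2", "relu"]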
python/paddle/distributed/passes/auto_parallel_amp.py

@@ -26,7 +26,7 @@ from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_fp32_input, _k
 from paddle.fluid.contrib.mixed_precision.fp16_utils import _valid_types, find_true_post_op, find_true_prev_op
 from paddle.fluid.contrib.mixed_precision.fp16_utils import _is_in_black_varnames, _dtype_to_str, _rename_arg
 from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute
-global_process_mesh = get_world_process_group().ranks
+world_process_group = get_world_process_group()


 class AMPState(object):
@@ -445,7 +445,7 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context):
         type=core.VarDesc.VarType.LOD_TENSOR,
         persistable=False,
         stop_gradient=False)
-    set_var_dist_attr(dist_context, found_inf, [-1], global_process_mesh)
+    set_var_dist_attr(dist_context, found_inf, [-1], world_process_group.ranks)

     inputs = {'X': grads, 'Scale': loss_scaling}
     outputs = {'Out': grads, 'FoundInfinite': found_inf}
@@ -457,9 +457,10 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context):
         attrs=attrs)

     new_op_dist_attr = OperatorDistributedAttribute()
-    new_op_dist_attr.process_mesh = global_process_mesh
-    if len(global_process_mesh) > 1:
-        new_op_dist_attr.impl_idx = 0
+    new_op_dist_attr.process_mesh = world_process_group.ranks
+    new_op_dist_attr.impl_idx = 0
+    if len(world_process_group.ranks) > 1:
+        new_op_dist_attr.impl_type = "check_finite_and_unscale"
     for g in grads:
         g_dist_attr = dist_context.get_tensor_dist_attr_for_program(g)
         assert g_dist_attr is not None
@@ -550,7 +551,7 @@ class AMPPass(PassBase):
             dtype='float32',
             persistable=True)
         set_var_dist_attr(self.dist_context, self._loss_scaling, [-1],
-                          global_process_mesh)
+                          world_process_group.ranks)

         if self.get_attr("use_dynamic_loss_scaling"):
             self._num_good_steps = paddle.static.create_global_var(
@@ -560,7 +561,7 @@ class AMPPass(PassBase):
                 dtype='int32',
                 persistable=True)
             set_var_dist_attr(self.dist_context, self._num_good_steps, [-1],
-                              global_process_mesh)
+                              world_process_group.ranks)

             self._num_bad_steps = paddle.static.create_global_var(
                 name=unique_name.generate("num_bad_steps"),
@@ -569,7 +570,7 @@ class AMPPass(PassBase):
                 dtype='int32',
                 persistable=True)
             set_var_dist_attr(self.dist_context, self._num_bad_steps, [-1],
-                              global_process_mesh)
+                              world_process_group.ranks)

     def _scale_loss(self):
@@ -700,9 +701,10 @@ class AMPPass(PassBase):
             attrs=attrs)

         new_op_dist_attr = OperatorDistributedAttribute()
-        new_op_dist_attr.process_mesh = global_process_mesh
-        if len(global_process_mesh) > 1:
-            new_op_dist_attr.impl_idx = 0
+        new_op_dist_attr.process_mesh = world_process_group.ranks
+        new_op_dist_attr.impl_idx = 0
+        if len(world_process_group.ranks) > 1:
+            new_op_dist_attr.impl_type = "update_loss_scaling"
         for g in grads:
             g_dist_attr = self.dist_context.get_tensor_dist_attr_for_program(g)
             assert g_dist_attr is not None
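Besides switching from the import-time global_process_mesh snapshot to world_process_group.ranks at each call site, the pass now always sets impl_idx = 0 on the inserted AMP ops and records an impl_type ("check_finite_and_unscale" or "update_loss_scaling") only when more than one rank is involved. A compact sketch of that annotation rule with a stand-in attribute class:

    class OpDistAttr:                            # stand-in, not Paddle's class
        def __init__(self):
            self.process_mesh = None
            self.impl_idx = None
            self.impl_type = None

    def annotate(ranks, impl_type):
        attr = OpDistAttr()
        attr.process_mesh = ranks
        attr.impl_idx = 0                        # now set unconditionally
        if len(ranks) > 1:
            attr.impl_type = impl_type           # e.g. "update_loss_scaling"
        return attr

    single = annotate([0], "update_loss_scaling")
    multi = annotate([0, 1, 2, 3], "update_loss_scaling")
    assert single.impl_type is None and multi.impl_type == "update_loss_scaling"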
python/paddle/distributed/passes/auto_parallel_recompute.py

@@ -382,6 +382,7 @@ class RecomputePass(PassBase):
             new_dist_attr = OperatorDistributedAttribute()
             new_dist_attr.is_recompute = True
             new_dist_attr.impl_idx = old_dist_attr.impl_idx
+            new_dist_attr.impl_type = old_dist_attr.impl_type
             new_dist_attr.process_mesh = old_dist_attr.process_mesh
             for input in old_dist_attr.inputs_dist_attrs.keys():
                 if input in var_name_dict.keys():
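The cloned dist attr for a recomputed op now carries impl_type over from the original op as well, which fits the partitioner change earlier in this commit that resolves implementations through impl_type. A one-screen sketch of the copy, using a plain dict as a stand-in for the attribute object:

    old_dist_attr = {"impl_idx": 1, "impl_type": "matmul_v2",   # hypothetical values
                     "process_mesh": [0, 1]}

    new_dist_attr = {
        "is_recompute": True,
        "impl_idx": old_dist_attr["impl_idx"],
        "impl_type": old_dist_attr["impl_type"],                # newly propagated field
        "process_mesh": old_dist_attr["process_mesh"],
    }
    assert new_dist_attr["impl_type"] == old_dist_attr["impl_type"]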
python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py

@@ -40,7 +40,7 @@ class TestRecomputePass(AutoPallelPassTestBase):
     def apply_passes(self):
         dist_strategy = fleet.DistributedStrategy()
         dist_strategy.recompute = True
-        dist_strategy.recompute_configs = {"checkpoints": ["tmp3", "tmp6"]}
+        dist_strategy.recompute_configs = {"checkpoints": ["tmp_3", "tmp_6"]}
         dist_strategy.semi_auto = True
         fleet.init(is_collective=True, strategy=dist_strategy)