BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle, in sync with the fork source)
Commit 382e9a06 (unverified)
Authored on Jan 30, 2023 by wanghuancoder; committed via GitHub on Jan 30, 2023.
refine amp scaler found_inf (#49864)
* refine _found_inf
Parent commit: 320958eb
Showing 10 changed files with 142 additions and 97 deletions (+142 −97).
python/paddle/amp/grad_scaler.py  +25 −17
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py  +3 −5
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py  +11 −6
python/paddle/distributed/fleet/scaler.py  +10 −5
python/paddle/fluid/optimizer.py  +12 −5
python/paddle/optimizer/adam.py  +23 −18
python/paddle/optimizer/adamw.py  +1 −2
python/paddle/optimizer/lamb.py  +2 −2
python/paddle/optimizer/momentum.py  +24 −13
python/paddle/optimizer/optimizer.py  +31 −24
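All ten files touch the same dynamic loss scaling pipeline: the scaler scales the loss, unscales the gradients, records whether an inf/nan was found, and the optimizer decides whether to apply or skip the update. For orientation, here is a minimal dygraph sketch of the public API behind these files; the toy model, random data, and hyperparameters are illustrative and not part of this commit:

# Minimal AMP loop around the GradScaler/optimizer code changed below.
# The toy network and random batches stand in for a real model and dataset.
import paddle

model = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.Adam(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

for _ in range(3):
    data = paddle.rand([4, 10])
    label = paddle.rand([4, 1])
    with paddle.amp.auto_cast():
        loss = paddle.nn.functional.mse_loss(model(data), label)
    scaler.scale(loss).backward()  # scale the loss before backward
    scaler.step(opt)               # unscale grads; skip the step if inf/nan was found
    scaler.update()                # adjust the loss scaling for the next step
    opt.clear_grad()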
python/paddle/amp/grad_scaler.py

@@ -18,7 +18,7 @@ from enum import Enum
 import numpy as np

-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
 from paddle.fluid import core, in_dygraph_mode
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph import to_variable
@@ -228,11 +228,9 @@ class AmpScaler:
         optimize_ops, params_grads = (None, None)

-        if self._found_inf:
-            self._cache_founf_inf = True
-        else:
-            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-            self._cache_founf_inf = False
+        optimizer._set_auxiliary_var('found_inf', self._found_inf)
+        optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+        self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

         if self._use_dynamic_loss_scaling:
             # uopdate the scale
@@ -330,6 +328,9 @@ class AmpScaler:
                     param_grads_fp16,
                     self._temp_found_inf_fp16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_fp16
+                )
             if len(param_grads_bf16):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_bf16,
@@ -338,6 +339,9 @@ class AmpScaler:
                     param_grads_bf16,
                     self._temp_found_inf_bf16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_bf16
+                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
@@ -346,6 +350,9 @@ class AmpScaler:
                     param_grads_fp32,
                     self._temp_found_inf_fp32,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_fp32
+                )
         else:
             if len(param_grads_fp16):
                 _legacy_C_ops.check_finite_and_unscale(
@@ -354,6 +361,9 @@ class AmpScaler:
                     param_grads_fp16,
                     self._temp_found_inf_fp16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_fp16
+                )
             if len(param_grads_bf16):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_bf16,
@@ -361,6 +371,9 @@ class AmpScaler:
                     param_grads_bf16,
                     self._temp_found_inf_bf16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_bf16
+                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
@@ -368,12 +381,9 @@ class AmpScaler:
                     param_grads_fp32,
                     self._temp_found_inf_fp32,
                 )
-            self._found_inf = (
-                self._temp_found_inf_fp16
-                or self._temp_found_inf_bf16
-                or self._temp_found_inf_fp32
-            )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, self._temp_found_inf_fp32
+                )

         optimizer_state["state"] = OptimizerState.UNSCALED
@@ -761,11 +771,9 @@ class GradScaler(AmpScaler):
         if optimizer_state["state"] is OptimizerState.INIT:
             self._unscale(optimizer)

-        if self._found_inf:
-            self._cache_founf_inf = True
-        else:
-            optimizer.step()
-            self._cache_founf_inf = False
+        optimizer._set_auxiliary_var('found_inf', self._found_inf)
+        optimizer.step()
+        self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

         optimizer_state["state"] = OptimizerState.STEPPED
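Two things change throughout grad_scaler.py. First, the scaler no longer branches in Python on `self._found_inf`: it hands the flag to the optimizer as the auxiliary variable 'found_inf', always calls `minimize()`/`step()`, and reads the final decision back into `_cache_founf_inf`. Second, the per-dtype inf/nan flags are merged on device with `_C_ops.bitwise_or` instead of Python `or`, so the bool tensors do not have to be copied to the host mid-step. The public `paddle.bitwise_or` behaves the same way and can be used to picture the accumulation; the flag values below are invented:

# On-device accumulation of per-dtype found_inf flags, as the diff does with
# _C_ops.bitwise_or; paddle.bitwise_or is the equivalent public op.
import paddle

found_inf = paddle.zeros([1], dtype='bool')
temp_found_inf_fp16 = paddle.to_tensor([False])   # pretend fp16 grads were finite
temp_found_inf_fp32 = paddle.to_tensor([True])    # pretend an fp32 grad overflowed

found_inf = paddle.bitwise_or(found_inf, temp_found_inf_fp16)
found_inf = paddle.bitwise_or(found_inf, temp_found_inf_fp32)

print(found_inf)  # Tensor([True]); still on device, no .numpy() round trip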
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py

@@ -41,11 +41,9 @@ class HybridParallelGradScaler:
         optimize_ops, params_grads = (None, None)

-        if self._found_inf:
-            self._cache_founf_inf = True
-        else:
-            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-            self._cache_founf_inf = False
+        optimizer._set_auxiliary_var('found_inf', self._found_inf)
+        optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+        self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')

         if self._use_dynamic_loss_scaling:
             self._update()
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py

@@ -19,10 +19,10 @@ from types import MethodType
 import numpy as np

 import paddle
-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
 from paddle.common_ops_import import dygraph_only
 from paddle.fluid import core
 from paddle.fluid.dygraph import to_variable
 from paddle.framework import core
 from paddle.nn import clip
@@ -231,6 +231,9 @@ def GroupShardedScaler(scaler):
                     param_grads_fp16,
                     temp_found_inf_fp16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, temp_found_inf_fp16
+                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
@@ -238,15 +241,17 @@ def GroupShardedScaler(scaler):
                     param_grads_fp32,
                     temp_found_inf_fp32,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, temp_found_inf_fp32
+                )

-        self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
-        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
+        self._found_inf = self._found_inf.cast("int32")

         paddle.distributed.all_reduce(
-            is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
+            self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
         )
-        self._found_inf = is_found_inf.numpy()[0]
+        self._found_inf = self._found_inf.cast("bool")

     scaler._unscale = MethodType(unscale_method, scaler)
     return scaler
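The sharded scaler keeps `found_inf` as a tensor across ranks as well: instead of collapsing the flags to a Python int, wrapping them in a fresh int32 tensor, and summing, it casts the existing tensor to int32, all-reduces with MAX (any rank that saw an inf marks every rank), and casts back to bool. A standalone sketch of that reduction follows; it assumes a multi-GPU job started with `paddle.distributed.launch`, and the per-rank values are invented:

# Cross-rank found_inf synchronization: cast -> all_reduce(MAX) -> cast back.
# Run with paddle.distributed.launch on 2+ devices; values are illustrative.
import paddle
import paddle.distributed as dist

dist.init_parallel_env()

# Pretend only rank 0 hit an inf/nan this step.
found_inf = paddle.to_tensor([dist.get_rank() == 0])

found_inf = found_inf.cast("int32")
dist.all_reduce(found_inf, op=dist.ReduceOp.MAX)  # 1 if any rank reported inf
found_inf = found_inf.cast("bool")

print(dist.get_rank(), found_inf)  # every rank now agrees: True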
python/paddle/distributed/fleet/scaler.py

@@ -17,7 +17,7 @@ from types import MethodType
 import numpy as np

 import paddle
-from paddle import _legacy_C_ops
+from paddle import _C_ops, _legacy_C_ops
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import to_variable
 from paddle.framework import core
@@ -73,6 +73,9 @@ def distributed_scaler(scaler):
                     param_grads_fp16,
                     temp_found_inf_fp16,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, temp_found_inf_fp16
+                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
@@ -80,17 +83,19 @@ def distributed_scaler(scaler):
                     param_grads_fp32,
                     temp_found_inf_fp32,
                 )
+                self._found_inf = _C_ops.bitwise_or(
+                    self._found_inf, temp_found_inf_fp32
+                )

-        self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
-        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
+        self._found_inf = self._found_inf.cast("int32")

         # TODO(shenliang03) Since dp allreduce in the optimizer is
         # after the gradscaler, check_finite needs to synchronize global
         # information. In the future, we should use check_group to speed.
         paddle.distributed.all_reduce(
-            is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
+            self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
         )
-        self._found_inf = is_found_inf.numpy()[0]
+        self._found_inf = self._found_inf.cast("bool")

     # Only data_parallel doesn't need to modify scaler
     fleet_env = fleet.fleet
python/paddle/fluid/optimizer.py

@@ -893,11 +893,18 @@ class Optimizer:
         self._create_global_learning_rate()

         if in_dygraph_mode():
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                if param_and_grad[0].trainable is True:
-                    self._append_optimize_op(target_block, param_and_grad)
+            found_inf = self._get_auxiliary_var('found_inf')
+            if found_inf:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', True)
+            else:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', False)
+                for param_and_grad in parameters_and_grads:
+                    if param_and_grad[1] is None:
+                        continue
+                    if param_and_grad[0].trainable is True:
+                        self._append_optimize_op(target_block, param_and_grad)
         else:
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:
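The optimizer side consumes the flag where the update ops are appended: if the auxiliary 'found_inf' (typically an eager Tensor written by the scaler) is truthy, the parameter update is skipped and the variable is normalized to the plain bool True so the scaler can read it back without another device sync; otherwise it is normalized to False and the update runs. A minimal sketch of that contract follows, using a dict-backed toy class instead of Paddle's real Optimizer internals (ToyOptimizer and _apply_update are invented names):

# Toy illustration of the scaler/optimizer found_inf contract introduced here.
# The dict store stands in for Optimizer._set_auxiliary_var/_get_auxiliary_var.
import paddle


class ToyOptimizer:
    def __init__(self):
        self._auxiliary_vars = {}

    def _set_auxiliary_var(self, key, val):
        self._auxiliary_vars[key] = val

    def _get_auxiliary_var(self, key):
        return self._auxiliary_vars.get(key, None)

    def step(self):
        found_inf = self._get_auxiliary_var('found_inf')
        if found_inf:
            # overflow somewhere: report a plain bool back, apply nothing
            self._set_auxiliary_var('found_inf', True)
        else:
            self._set_auxiliary_var('found_inf', False)
            self._apply_update()

    def _apply_update(self):
        print("parameters updated")


opt = ToyOptimizer()
opt._set_auxiliary_var('found_inf', paddle.to_tensor([False]))
opt.step()                                   # -> parameters updated
print(opt._get_auxiliary_var('found_inf'))   # -> False, cheap for the scaler to cache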
python/paddle/optimizer/adam.py

@@ -360,8 +360,6 @@ class Adam(Optimizer):
         # create the adam optimize op
         if framework.in_dygraph_mode():
-            found_inf = self._get_auxiliary_var('found_inf')
-
             _beta1 = (
                 self._beta1
                 if not isinstance(self._beta1, Variable)
@@ -382,7 +380,7 @@ class Adam(Optimizer):
                 beta1_pow_acc,
                 beta2_pow_acc,
                 master_weight,
-                found_inf,
+                None,
                 _beta1,
                 _beta2,
                 self._epsilon,
@@ -693,21 +691,28 @@ class Adam(Optimizer):
                 if master_weight is not None
                 else None
             )
-            _, _, _, _, _, _ = _C_ops.merged_adam_(
-                self._param_dict[key][param_group_idx],
-                grad_dict[key],
-                lr_dict[key],
-                self._moment1_dict[key][param_group_idx],
-                self._moment2_dict[key][param_group_idx],
-                self._beta1_pow_acc_dict[key][param_group_idx],
-                self._beta2_pow_acc_dict[key][param_group_idx],
-                master_weight,
-                _beta1,
-                _beta2,
-                self._epsilon,
-                find_master,
-                False,
-            )
+            found_inf = self._get_auxiliary_var('found_inf')
+            if found_inf:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', True)
+            else:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', False)
+                _, _, _, _, _, _ = _C_ops.merged_adam_(
+                    self._param_dict[key][param_group_idx],
+                    grad_dict[key],
+                    lr_dict[key],
+                    self._moment1_dict[key][param_group_idx],
+                    self._moment2_dict[key][param_group_idx],
+                    self._beta1_pow_acc_dict[key][param_group_idx],
+                    self._beta2_pow_acc_dict[key][param_group_idx],
+                    master_weight,
+                    _beta1,
+                    _beta2,
+                    self._epsilon,
+                    find_master,
+                    False,
+                )
         else:
             inputs = {
                 "Param": self._param_dict[key][param_group_idx],
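Adam's dygraph fast path (and likewise AdamW's and Lamb's below) therefore stops feeding `found_inf` into the fused kernels as a skip-update input and passes `None` there; skipping is decided once, around the call, from the auxiliary variable, as in the optimizer changes above. The intended end-to-end behaviour can be pictured with a plain Adam instance; this leans on the private auxiliary-var hooks, so treat it as an illustration rather than a supported API:

# Behavioural sketch: with found_inf set, step() leaves parameters untouched;
# with it cleared, the Adam update is applied. Shapes and data are arbitrary.
import paddle

linear = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.Adam(parameters=linear.parameters())

loss = linear(paddle.rand([2, 4])).mean()
loss.backward()
before = linear.weight.numpy().copy()

opt._set_auxiliary_var('found_inf', paddle.to_tensor([True]))
opt.step()
print((linear.weight.numpy() == before).all())   # expected True: update skipped

opt._set_auxiliary_var('found_inf', paddle.to_tensor([False]))
opt.step()
print((linear.weight.numpy() == before).all())   # expected False: update applied
opt.clear_grad()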
python/paddle/optimizer/adamw.py

@@ -491,7 +491,6 @@ class AdamW(Optimizer):
                 else self._beta2.numpy().item(0)
             )
-            found_inf = self._get_auxiliary_var('found_inf')
             _, _, _, _, _, _ = _C_ops.adamw_(
                 param_and_grad[0],
                 param_and_grad[1],
@@ -501,7 +500,7 @@ class AdamW(Optimizer):
                 beta1_pow_acc,
                 beta2_pow_acc,
                 master_weight,
-                found_inf,
+                None,
                 _beta1,
                 _beta2,
                 self._epsilon,
python/paddle/optimizer/lamb.py

@@ -293,7 +293,6 @@ class Lamb(Optimizer):
             self._used_master_weights[p_name] = master_weight.name
         else:
             master_weight = None
-        found_inf = self._get_auxiliary_var('found_inf')

         if framework.in_dygraph_mode():
             _C_ops.lamb_(
@@ -305,7 +304,7 @@ class Lamb(Optimizer):
                 beta1_pow_acc,
                 beta2_pow_acc,
                 master_weight,
-                found_inf,
+                None,
                 weight_decay,
                 self._beta1,
                 self._beta2,
@@ -343,6 +342,7 @@ class Lamb(Optimizer):
             inputs["MasterParam"] = master_weight
             outputs["MasterParamOut"] = master_weight

+        found_inf = self._get_auxiliary_var('found_inf')
         if found_inf:
             inputs["SkipUpdate"] = found_inf
python/paddle/optimizer/momentum.py

@@ -530,19 +530,30 @@ class Momentum(Optimizer):
             )
             if in_dygraph_mode():
-                _, _, _ = _C_ops.merged_momentum_(
-                    self._param_dict[key][param_group_idx],
-                    grad_dict[key],
-                    self._velocity_dict[key][param_group_idx],
-                    lr_dict[key],
-                    master_weight,
-                    self._momentum,
-                    self._use_nesterov,
-                    self._regularization_method_dict[key][param_group_idx],
-                    self._regularization_coeff_dict[key][param_group_idx],
-                    find_master,
-                    self._rescale_grad,
-                )
+                found_inf = self._get_auxiliary_var('found_inf')
+                if found_inf:
+                    if isinstance(found_inf, core.eager.Tensor):
+                        self._set_auxiliary_var('found_inf', True)
+                else:
+                    if isinstance(found_inf, core.eager.Tensor):
+                        self._set_auxiliary_var('found_inf', False)
+                    _, _, _ = _C_ops.merged_momentum_(
+                        self._param_dict[key][param_group_idx],
+                        grad_dict[key],
+                        self._velocity_dict[key][param_group_idx],
+                        lr_dict[key],
+                        master_weight,
+                        self._momentum,
+                        self._use_nesterov,
+                        self._regularization_method_dict[key][param_group_idx],
+                        self._regularization_coeff_dict[key][param_group_idx],
+                        find_master,
+                        self._rescale_grad,
+                    )
             else:
                 inputs = {
                     "Param": self._param_dict[key][param_group_idx],
python/paddle/optimizer/optimizer.py

@@ -920,31 +920,38 @@ class Optimizer:
                     self._create_accumulators(target_block, params_acc_dict)

         if framework._non_static_mode():
-            if isinstance(parameters_and_grads, list):
-                for param_and_grad in parameters_and_grads:
-                    if param_and_grad[1] is None:
-                        continue
-                    if param_and_grad[0].stop_gradient is False:
-                        self._append_optimize_op(target_block, param_and_grad)
-            else:
-                for param_and_grad in parameters_and_grads['params']:
-                    if param_and_grad[1] is None:
-                        continue
-                    if param_and_grad[0].stop_gradient is False:
-                        param_grad_dict = dict()
-                        param_grad_dict['params'] = param_and_grad
-                        param_grad_dict.update(
-                            {
-                                k: v
-                                for k, v in parameters_and_grads.items()
-                                if k != 'params'
-                            }
-                        )
-                        self._append_optimize_op(target_block, param_grad_dict)
+            found_inf = self._get_auxiliary_var('found_inf')
+            if found_inf:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', True)
+            else:
+                if isinstance(found_inf, core.eager.Tensor):
+                    self._set_auxiliary_var('found_inf', False)
+                if isinstance(parameters_and_grads, list):
+                    for param_and_grad in parameters_and_grads:
+                        if param_and_grad[1] is None:
+                            continue
+                        if param_and_grad[0].stop_gradient is False:
+                            self._append_optimize_op(target_block, param_and_grad)
+                else:
+                    for param_and_grad in parameters_and_grads['params']:
+                        if param_and_grad[1] is None:
+                            continue
+                        if param_and_grad[0].stop_gradient is False:
+                            param_grad_dict = dict()
+                            param_grad_dict['params'] = param_and_grad
+                            param_grad_dict.update(
+                                {
+                                    k: v
+                                    for k, v in parameters_and_grads.items()
+                                    if k != 'params'
+                                }
+                            )
+                            self._append_optimize_op(target_block, param_grad_dict)
         else:
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:
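The list/dict branches that are re-indented here exist because `parameters_and_grads` can come from a flat parameter list or from parameter groups, where each group is a dict with a 'params' key plus per-group options; both paths are now guarded by the same found_inf check. A small, self-contained example of the parameter-group form (toy model and option values are illustrative):

# Parameter groups are what the dict branch above handles: each group is a
# dict carrying 'params' plus optional per-group settings.
import paddle

model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.Linear(10, 1))

opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    parameters=[
        {'params': model[0].parameters()},                          # uses the global lr
        {'params': model[1].parameters(), 'learning_rate': 0.01},   # per-group override
    ],
    weight_decay=0.01,
)

loss = model(paddle.rand([4, 10])).mean()
loss.backward()
opt.step()        # each group is applied with its own options
opt.clear_grad()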