Unverified commit 77e289ae, authored by Zhang Ting, committed by GitHub

[AMP] modify default value for GradScaler (#54653)

Parent: db6f3ee6
@@ -75,8 +75,8 @@ FP16_BLACK_LIST = {
     'margin_cross_entropy',
 }
-# FP16 performance of grad op is worse than that of FP32. Use FP32 by default.
-FP16_EXTRA_BLACK_LIST = {
+# FP16/BF16 performance of grad op is worse than that of FP32. Use FP32 by default.
+EXTRA_BLACK_LIST = {
     'linear_interp_v2',
     'nearest_interp_v2',
     'bilinear_interp_v2',
@@ -112,9 +112,13 @@ def black_list():
     black_list = {
         "float16": {
             "OD": set(),
-            "O1": FP16_BLACK_LIST | FP16_EXTRA_BLACK_LIST,
-            "O2": FP16_EXTRA_BLACK_LIST,
+            "O1": FP16_BLACK_LIST | EXTRA_BLACK_LIST,
+            "O2": EXTRA_BLACK_LIST,
         },
-        "bfloat16": {"OD": set(), "O1": BF16_BLACK_LIST, "O2": set()},
+        "bfloat16": {
+            "OD": set(),
+            "O1": BF16_BLACK_LIST | EXTRA_BLACK_LIST,
+            "O2": EXTRA_BLACK_LIST,
+        },
     }
     return black_list
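After this hunk, the extra black list is shared between float16 and bfloat16: O1 unions it with the dtype-specific list, while O2 keeps only the shared extras. A minimal, self-contained sketch of that assembly logic (the set contents here are placeholders, not Paddle's full op lists):

```python
# Toy stand-ins for the real lists in paddle/amp/amp_lists.py.
FP16_BLACK_LIST = {'exp', 'log'}           # hypothetical contents
BF16_BLACK_LIST = {'cumsum'}               # hypothetical contents
EXTRA_BLACK_LIST = {'linear_interp_v2'}    # shared across both dtypes

def black_list():
    # Mirrors the post-change structure: O1 = dtype list | extras, O2 = extras only.
    return {
        "float16": {
            "OD": set(),
            "O1": FP16_BLACK_LIST | EXTRA_BLACK_LIST,
            "O2": EXTRA_BLACK_LIST,
        },
        "bfloat16": {
            "OD": set(),
            "O1": BF16_BLACK_LIST | EXTRA_BLACK_LIST,
            "O2": EXTRA_BLACK_LIST,
        },
    }

# The behavioral change: bfloat16 O1/O2 now also keep the extra ops in FP32.
assert 'linear_interp_v2' in black_list()["bfloat16"]["O2"]
```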
@@ -591,15 +591,15 @@ class GradScaler(AmpScaler):
     Args:
         enable(bool, optional): Enable loss scaling or not. Default is True.
-        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
+        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 65536.0.
         incr_ratio(float, optional): The multiplier to use when increasing the loss
             scaling. Default is 2.0.
         decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
             the loss scaling. Default is 0.5.
         incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
-            steps with finite gradients. Default is 1000.
+            steps with finite gradients. Default is 2000.
         decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
-            accumulated steps with nan or inf gradients. Default is 2.
+            accumulated steps with nan or inf gradients. Default is 1.
         use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
     Returns:
         A GradScaler object.
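Taken together, the new defaults start the scale higher (65536 instead of 32768), grow it more slowly (every 2000 finite steps instead of 1000), and shrink it more aggressively (halve after every single nan/inf step instead of every 2). A simplified simulation of the documented update rule, written from the docstring above rather than from Paddle's internal code:

```python
def step_scale(scale, good, bad, found_inf,
               incr_ratio=2.0, decr_ratio=0.5,
               incr_every_n_steps=2000, decr_every_n_nan_or_inf=1):
    """One update of the dynamic loss scale (simplified sketch)."""
    if found_inf:
        good, bad = 0, bad + 1
        # Shrink once enough nan/inf steps have accumulated.
        if bad == decr_every_n_nan_or_inf:
            scale, bad = scale * decr_ratio, 0
    else:
        good, bad = good + 1, 0
        # Grow only after n *consecutive* finite-gradient steps.
        if good == incr_every_n_steps:
            scale, good = scale * incr_ratio, 0
    return scale, good, bad

scale, good, bad = 65536.0, 0, 0
scale, good, bad = step_scale(scale, good, bad, found_inf=True)
assert scale == 32768.0  # with the new default, one overflow halves the scale immediately
```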
@@ -628,11 +628,11 @@ class GradScaler(AmpScaler):
     def __init__(
         self,
         enable=True,
-        init_loss_scaling=2.0**15,
+        init_loss_scaling=2.0**16,
         incr_ratio=2.0,
         decr_ratio=0.5,
-        incr_every_n_steps=1000,
-        decr_every_n_nan_or_inf=2,
+        incr_every_n_steps=2000,
+        decr_every_n_nan_or_inf=1,
         use_dynamic_loss_scaling=True,
     ):
         super().__init__(
......
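For reference, a hedged usage sketch of the dynamic-graph scaler with these new defaults (the model, optimizer, and data below are illustrative, and the `auto_cast` region assumes a device with float16 support; constructing `GradScaler()` with no arguments is now equivalent to passing `init_loss_scaling=65536.0, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1` explicitly):

```python
import paddle

model = paddle.nn.Linear(10, 10)
optimizer = paddle.optimizer.SGD(
    learning_rate=1e-3, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler()  # picks up the new defaults from this commit

data = paddle.rand([4, 10])
with paddle.amp.auto_cast():
    loss = model(data).mean()

scaler.scale(loss).backward()  # scale the loss before backward
scaler.step(optimizer)         # unscale grads; skip the step on inf/nan
scaler.update()                # grow or shrink the loss scale
optimizer.clear_grad()
```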
@@ -811,11 +811,11 @@ def decorate(  # noqa: F811
     dtype='float16',
     master_weight=None,
     master_grad=False,
-    init_loss_scaling=2**15,
-    incr_every_n_steps=1000,
-    decr_every_n_nan_or_inf=2,
+    init_loss_scaling=2**16,
+    incr_every_n_steps=2000,
+    decr_every_n_nan_or_inf=1,
     incr_ratio=2.0,
-    decr_ratio=0.8,
+    decr_ratio=0.5,
     use_dynamic_loss_scaling=None,
     use_amp_guard=False,
     use_promote=False,
@@ -841,15 +841,15 @@ def decorate(  # noqa: F811
         during weight updating. If master_grad is False, in O2 level optimizer
         will not use master grad. Default is False.
     init_loss_scaling(float, optional): The initial loss scaling factor.
-        Default is 32768.
+        Default is 65536.
     incr_every_n_steps(int, optional): Increases loss scaling every n
-        consecutive steps with finite gradients. Default is 1000.
+        consecutive steps with finite gradients. Default is 2000.
     decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
-        accumulated steps with nan or inf gradients. Default is 2.
+        accumulated steps with nan or inf gradients. Default is 1.
     incr_ratio(float, optional): The multiplier to use when increasing the
         loss scaling. Default is 2.
     decr_ratio(float, optional): The less-than-one-multiplier to use when
-        decreasing the loss scaling. Default is 0.8.
+        decreasing the loss scaling. Default is 0.5.
     use_dynamic_loss_scaling(bool, None): Whether to use dynamic loss
         scaling. Default is None, which means True for float16, and False
         for bfloat16.
......
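Note that this hunk also brings `decorate`'s `decr_ratio` (previously 0.8) in line with `GradScaler`'s 0.5. A hedged sketch of a call that spells out what the new float16 defaults resolve to, assuming this `decorate` is the static-graph entry point `paddle.static.amp.decorate` (the keyword names are taken from the signature in the hunk above):

```python
import paddle

paddle.enable_static()

opt = paddle.static.amp.decorate(
    optimizer=paddle.optimizer.SGD(learning_rate=1e-3),
    level='O2',
    dtype='float16',
    init_loss_scaling=2**16,     # was 2**15
    incr_every_n_steps=2000,     # was 1000
    decr_every_n_nan_or_inf=1,   # was 2
    incr_ratio=2.0,
    decr_ratio=0.5,              # was 0.8
)
```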
@@ -16,8 +16,8 @@ import copy
 import logging
 from paddle.amp.amp_lists import (
+    EXTRA_BLACK_LIST,
     FP16_BLACK_LIST,
-    FP16_EXTRA_BLACK_LIST,
     FP16_WHITE_LIST,
 )
 from paddle.fluid import core
@@ -28,7 +28,7 @@ _logger = get_logger(
 )
 black_list = FP16_BLACK_LIST
-_extra_black_list = FP16_EXTRA_BLACK_LIST
+_extra_black_list = EXTRA_BLACK_LIST
 white_list = FP16_WHITE_LIST
@@ -138,7 +138,7 @@ def _get_white_list(dtype):
 def _get_black_list():
     _black_list = copy.copy(FP16_BLACK_LIST)
-    _black_list = _black_list | FP16_EXTRA_BLACK_LIST
+    _black_list = _black_list | EXTRA_BLACK_LIST
     return _black_list
......
@@ -182,7 +182,7 @@ def build_conv_model(
     model = SimpleConvNet()
     optimizer = _build_optimizer(use_amp=False, model=model)
     if use_amp and amp_dtype == "float16":
-        scaler = paddle.amp.GradScaler()
+        scaler = paddle.amp.GradScaler(init_loss_scaling=32768.0)
     else:
         scaler = None
     if use_amp and amp_level == "O2":
......