Unverified commit 77e289ae, authored by Zhang Ting, committed by GitHub

[AMP] modify default value for GradScaler (#54653)

Parent: db6f3ee6
@@ -75,8 +75,8 @@ FP16_BLACK_LIST = {
     'margin_cross_entropy',
 }
-# FP16 performance of grad op is worse than that of FP32. Use FP32 by default.
-FP16_EXTRA_BLACK_LIST = {
+# FP16/BF16 performance of grad op is worse than that of FP32. Use FP32 by default.
+EXTRA_BLACK_LIST = {
     'linear_interp_v2',
     'nearest_interp_v2',
     'bilinear_interp_v2',
@@ -112,9 +112,13 @@ def black_list():
     black_list = {
         "float16": {
             "OD": set(),
-            "O1": FP16_BLACK_LIST | FP16_EXTRA_BLACK_LIST,
-            "O2": FP16_EXTRA_BLACK_LIST,
+            "O1": FP16_BLACK_LIST | EXTRA_BLACK_LIST,
+            "O2": EXTRA_BLACK_LIST,
         },
-        "bfloat16": {"OD": set(), "O1": BF16_BLACK_LIST, "O2": set()},
+        "bfloat16": {
+            "OD": set(),
+            "O1": BF16_BLACK_LIST | EXTRA_BLACK_LIST,
+            "O2": EXTRA_BLACK_LIST,
+        },
     }
     return black_list
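After this hunk, the extra black list is shared between float16 and bfloat16: O1 unions it with the dtype-specific list, while O2 keeps only the shared extras. A minimal, self-contained sketch of that assembly logic (the set contents here are placeholders, not Paddle's full op lists):

```python
# Toy stand-ins for the real lists in paddle/amp/amp_lists.py.
FP16_BLACK_LIST = {'exp', 'log'}           # hypothetical contents
BF16_BLACK_LIST = {'cumsum'}               # hypothetical contents
EXTRA_BLACK_LIST = {'linear_interp_v2'}    # shared across both dtypes

def black_list():
    # Mirrors the post-change structure: O1 = dtype list | extras, O2 = extras only.
    return {
        "float16": {
            "OD": set(),
            "O1": FP16_BLACK_LIST | EXTRA_BLACK_LIST,
            "O2": EXTRA_BLACK_LIST,
        },
        "bfloat16": {
            "OD": set(),
            "O1": BF16_BLACK_LIST | EXTRA_BLACK_LIST,
            "O2": EXTRA_BLACK_LIST,
        },
    }

# The behavioral change: bfloat16 O1/O2 now also keep the extra ops in FP32.
assert 'linear_interp_v2' in black_list()["bfloat16"]["O2"]
```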
@@ -591,15 +591,15 @@ class GradScaler(AmpScaler):
     Args:
         enable(bool, optional): Enable loss scaling or not. Default is True.
-        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
+        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 65536.0.
         incr_ratio(float, optional): The multiplier to use when increasing the loss
             scaling. Default is 2.0.
         decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
             the loss scaling. Default is 0.5.
         incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
-            steps with finite gradients. Default is 1000.
+            steps with finite gradients. Default is 2000.
         decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
-            accumulated steps with nan or inf gradients. Default is 2.
+            accumulated steps with nan or inf gradients. Default is 1.
         use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
     Returns:
         A GradScaler object.
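Taken together, the new defaults start the scale higher (65536 instead of 32768), grow it more slowly (every 2000 finite steps instead of 1000), and shrink it more aggressively (halve after every single nan/inf step instead of every 2). A simplified simulation of the documented update rule, written from the docstring above rather than from Paddle's internal code:

```python
def step_scale(scale, good, bad, found_inf,
               incr_ratio=2.0, decr_ratio=0.5,
               incr_every_n_steps=2000, decr_every_n_nan_or_inf=1):
    """One update of the dynamic loss scale (simplified sketch)."""
    if found_inf:
        good, bad = 0, bad + 1
        # Shrink once enough nan/inf steps have accumulated.
        if bad == decr_every_n_nan_or_inf:
            scale, bad = scale * decr_ratio, 0
    else:
        good, bad = good + 1, 0
        # Grow only after n *consecutive* finite-gradient steps.
        if good == incr_every_n_steps:
            scale, good = scale * incr_ratio, 0
    return scale, good, bad

scale, good, bad = 65536.0, 0, 0
scale, good, bad = step_scale(scale, good, bad, found_inf=True)
assert scale == 32768.0  # with the new default, one overflow halves the scale immediately
```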
@@ -628,11 +628,11 @@ class GradScaler(AmpScaler):
     def __init__(
         self,
         enable=True,
-        init_loss_scaling=2.0**15,
+        init_loss_scaling=2.0**16,
         incr_ratio=2.0,
         decr_ratio=0.5,
-        incr_every_n_steps=1000,
-        decr_every_n_nan_or_inf=2,
+        incr_every_n_steps=2000,
+        decr_every_n_nan_or_inf=1,
         use_dynamic_loss_scaling=True,
     ):
         super().__init__(
......
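For reference, a hedged usage sketch of the dynamic-graph scaler with these new defaults (the model, optimizer, and data below are illustrative, and the `auto_cast` region assumes a device with float16 support; constructing `GradScaler()` with no arguments is now equivalent to passing `init_loss_scaling=65536.0, incr_every_n_steps=2000, decr_every_n_nan_or_inf=1` explicitly):

```python
import paddle

model = paddle.nn.Linear(10, 10)
optimizer = paddle.optimizer.SGD(
    learning_rate=1e-3, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler()  # picks up the new defaults from this commit

data = paddle.rand([4, 10])
with paddle.amp.auto_cast():
    loss = model(data).mean()

scaler.scale(loss).backward()  # scale the loss before backward
scaler.step(optimizer)         # unscale grads; skip the step on inf/nan
scaler.update()                # grow or shrink the loss scale
optimizer.clear_grad()
```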
@@ -811,11 +811,11 @@ def decorate(  # noqa: F811
     dtype='float16',
     master_weight=None,
     master_grad=False,
-    init_loss_scaling=2**15,
-    incr_every_n_steps=1000,
-    decr_every_n_nan_or_inf=2,
+    init_loss_scaling=2**16,
+    incr_every_n_steps=2000,
+    decr_every_n_nan_or_inf=1,
     incr_ratio=2.0,
-    decr_ratio=0.8,
+    decr_ratio=0.5,
     use_dynamic_loss_scaling=None,
     use_amp_guard=False,
     use_promote=False,
@@ -841,15 +841,15 @@ def decorate(  # noqa: F811
         during weight updating. If master_grad is False, in O2 level optimizer
         will not use master grad. Default is False.
     init_loss_scaling(float, optional): The initial loss scaling factor.
-        Default is 32768.
+        Default is 65536.
     incr_every_n_steps(int, optional): Increases loss scaling every n
-        consecutive steps with finite gradients. Default is 1000.
+        consecutive steps with finite gradients. Default is 2000.
     decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
-        accumulated steps with nan or inf gradients. Default is 2.
+        accumulated steps with nan or inf gradients. Default is 1.
     incr_ratio(float, optional): The multiplier to use when increasing the
         loss scaling. Default is 2.
     decr_ratio(float, optional): The less-than-one-multiplier to use when
-        decreasing the loss scaling. Default is 0.8.
+        decreasing the loss scaling. Default is 0.5.
     use_dynamic_loss_scaling(bool, None): Whether to use dynamic loss
         scaling. Default is None, which means True for float16, and False
         for bfloat16.
......
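Note that this hunk also brings `decorate`'s `decr_ratio` (previously 0.8) in line with `GradScaler`'s 0.5. A hedged sketch of a call that spells out what the new float16 defaults resolve to, assuming this `decorate` is the static-graph entry point `paddle.static.amp.decorate` (the keyword names are taken from the signature in the hunk above):

```python
import paddle

paddle.enable_static()

opt = paddle.static.amp.decorate(
    optimizer=paddle.optimizer.SGD(learning_rate=1e-3),
    level='O2',
    dtype='float16',
    init_loss_scaling=2**16,     # was 2**15
    incr_every_n_steps=2000,     # was 1000
    decr_every_n_nan_or_inf=1,   # was 2
    incr_ratio=2.0,
    decr_ratio=0.5,              # was 0.8
)
```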
@@ -16,8 +16,8 @@ import copy
 import logging
 from paddle.amp.amp_lists import (
+    EXTRA_BLACK_LIST,
     FP16_BLACK_LIST,
-    FP16_EXTRA_BLACK_LIST,
     FP16_WHITE_LIST,
 )
 from paddle.fluid import core
@@ -28,7 +28,7 @@ _logger = get_logger(
 )
 black_list = FP16_BLACK_LIST
-_extra_black_list = FP16_EXTRA_BLACK_LIST
+_extra_black_list = EXTRA_BLACK_LIST
 white_list = FP16_WHITE_LIST
@@ -138,7 +138,7 @@ def _get_white_list(dtype):
 def _get_black_list():
     _black_list = copy.copy(FP16_BLACK_LIST)
-    _black_list = _black_list | FP16_EXTRA_BLACK_LIST
+    _black_list = _black_list | EXTRA_BLACK_LIST
     return _black_list
......
@@ -182,7 +182,7 @@ def build_conv_model(
     model = SimpleConvNet()
     optimizer = _build_optimizer(use_amp=False, model=model)
     if use_amp and amp_dtype == "float16":
-        scaler = paddle.amp.GradScaler()
+        scaler = paddle.amp.GradScaler(init_loss_scaling=32768.0)
     else:
         scaler = None
     if use_amp and amp_level == "O2":
......