未验证 提交 0411a9f8 编写于 作者: Q Quentin Anthony 提交者: GitHub

Expose Consecutive Hysteresis to Users (#3553)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
上级 d39c311f
......@@ -18,6 +18,7 @@ from .fp16.loss_scaler import (
INITIAL_LOSS_SCALE,
SCALE_WINDOW,
DELAYED_SHIFT,
CONSECUTIVE_HYSTERESIS,
MIN_LOSS_SCALE,
)
from .config_utils import (
......@@ -204,16 +205,20 @@ def get_dynamic_loss_scale_args(param_dict):
FP16_LOSS_SCALE_WINDOW,
FP16_MIN_LOSS_SCALE,
FP16_HYSTERESIS,
FP16_CONSECUTIVE_HYSTERESIS,
]
if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args):
init_scale = get_scalar_param(fp16_dict, FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT)
scale_window = get_scalar_param(fp16_dict, FP16_LOSS_SCALE_WINDOW, FP16_LOSS_SCALE_WINDOW_DEFAULT)
delayed_shift = get_scalar_param(fp16_dict, FP16_HYSTERESIS, FP16_HYSTERESIS_DEFAULT)
consecutive_hysteresis = get_scalar_param(fp16_dict, FP16_CONSECUTIVE_HYSTERESIS,
FP16_CONSECUTIVE_HYSTERESIS_DEFAULT)
min_loss_scale = get_scalar_param(fp16_dict, FP16_MIN_LOSS_SCALE, FP16_MIN_LOSS_SCALE_DEFAULT)
loss_scale_args = {
INITIAL_LOSS_SCALE: 2**init_scale,
SCALE_WINDOW: scale_window,
DELAYED_SHIFT: delayed_shift,
CONSECUTIVE_HYSTERESIS: consecutive_hysteresis,
MIN_LOSS_SCALE: min_loss_scale,
}
......
......@@ -140,6 +140,7 @@ FP16 parameters should be of the format:
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
'''
......@@ -167,6 +168,10 @@ FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000
# FP16 hysteresis: delay shift for dynamic loss scaling — how many overflow
# steps are tolerated before the loss scale is actually decreased.
# (config key + default value)
FP16_HYSTERESIS = "hysteresis"
FP16_HYSTERESIS_DEFAULT = 2
# FP16 consecutive hysteresis: whether to refill (restore) the hysteresis
# counter whenever an iteration completes without overflow.
# (config key + default value)
FP16_CONSECUTIVE_HYSTERESIS = "consecutive_hysteresis"
FP16_CONSECUTIVE_HYSTERESIS_DEFAULT = False
# FP16 min loss scale: lower bound the dynamic loss scale may be reduced to.
# (config key + default value)
FP16_MIN_LOSS_SCALE = "min_loss_scale"
FP16_MIN_LOSS_SCALE_DEFAULT = 1
......
......@@ -28,6 +28,7 @@ from deepspeed.utils import logger
# Keyword-argument keys for the dynamic loss-scale args dict that is built
# from the fp16 config section and consumed by the dynamic loss scaler.
INITIAL_LOSS_SCALE = 'init_scale'
SCALE_WINDOW = 'scale_window'
DELAYED_SHIFT = 'delayed_shift'
CONSECUTIVE_HYSTERESIS = 'consecutive_hysteresis'
MIN_LOSS_SCALE = 'min_scale'
......@@ -111,6 +112,7 @@ class DynamicLossScaler(LossScalerBase):
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
consecutive_hysteresis (bool, optional, default=False): Whether to refill hysteresis if we reach an iteration that doesn't overflow
"""
def __init__(self,
......@@ -190,6 +192,9 @@ class DynamicLossScaler(LossScalerBase):
self.last_overflow_iter = self.cur_iter
else:
if self.consecutive_hysteresis:
if dist.get_rank() == 0:
hysteresis_msg = f"Consecutive hysteresis is enabled. Restoring hysteresis to {self.delayed_shift}"
logger.info(hysteresis_msg)
self.cur_hysteresis = self.delayed_shift
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
if not self.consecutive_hysteresis:
......
......@@ -224,6 +224,7 @@ Example of <i>**scheduler**</i>
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
```
......@@ -264,6 +265,12 @@ Example of <i>**scheduler**</i>
| --------------------------------------------------------------------------------------------------- | ------- |
| <i>**hysteresis**</i> is a **fp16** parameter representing the delay shift in dynamic loss scaling. | `2` |
<i>**fp16:consecutive_hysteresis**</i>: [boolean]

| Description                                                                                                                                            | Default |
| ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------- |
| <i>**consecutive_hysteresis**</i> is a **fp16** parameter representing whether to refill the hysteresis if we reach an iteration that doesn't overflow | `false` |
<i>**fp16:min_loss_scale**</i>: [integer]
| Description | Default |
......
......@@ -201,6 +201,7 @@ Enable 16-bit (FP16) training by in the `deepspeed_config` JSON.
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
```
......
......@@ -37,6 +37,7 @@ Curriculum learning can be used by setting the `curriculum_learning` key in the
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
},
"curriculum_learning": {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册