未验证 提交 0411a9f8 编写于 作者: Q Quentin Anthony 提交者: GitHub

Expose Consecutive Hysteresis to Users (#3553)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
上级 d39c311f
......@@ -18,6 +18,7 @@ from .fp16.loss_scaler import (
INITIAL_LOSS_SCALE,
SCALE_WINDOW,
DELAYED_SHIFT,
CONSECUTIVE_HYSTERESIS,
MIN_LOSS_SCALE,
)
from .config_utils import (
......@@ -204,16 +205,20 @@ def get_dynamic_loss_scale_args(param_dict):
FP16_LOSS_SCALE_WINDOW,
FP16_MIN_LOSS_SCALE,
FP16_HYSTERESIS,
FP16_CONSECUTIVE_HYSTERESIS,
]
if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args):
init_scale = get_scalar_param(fp16_dict, FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT)
scale_window = get_scalar_param(fp16_dict, FP16_LOSS_SCALE_WINDOW, FP16_LOSS_SCALE_WINDOW_DEFAULT)
delayed_shift = get_scalar_param(fp16_dict, FP16_HYSTERESIS, FP16_HYSTERESIS_DEFAULT)
consecutive_hysteresis = get_scalar_param(fp16_dict, FP16_CONSECUTIVE_HYSTERESIS,
FP16_CONSECUTIVE_HYSTERESIS_DEFAULT)
min_loss_scale = get_scalar_param(fp16_dict, FP16_MIN_LOSS_SCALE, FP16_MIN_LOSS_SCALE_DEFAULT)
loss_scale_args = {
INITIAL_LOSS_SCALE: 2**init_scale,
SCALE_WINDOW: scale_window,
DELAYED_SHIFT: delayed_shift,
CONSECUTIVE_HYSTERESIS: consecutive_hysteresis,
MIN_LOSS_SCALE: min_loss_scale,
}
......
......@@ -140,6 +140,7 @@ FP16 parameters should be of the format:
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
'''
......@@ -167,6 +168,10 @@ FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000
# FP16 hysteresis: delay shift for dynamic loss scaling — how many overflow
# steps are tolerated before the loss scale is actually decreased.
# (config key + default value)
FP16_HYSTERESIS = "hysteresis"
FP16_HYSTERESIS_DEFAULT = 2
# FP16 consecutive hysteresis: whether to refill (restore) the hysteresis
# counter whenever an iteration completes without overflow.
# (config key + default value)
FP16_CONSECUTIVE_HYSTERESIS = "consecutive_hysteresis"
FP16_CONSECUTIVE_HYSTERESIS_DEFAULT = False
# FP16 min loss scale: lower bound the dynamic loss scale may be reduced to.
# (config key + default value)
FP16_MIN_LOSS_SCALE = "min_loss_scale"
FP16_MIN_LOSS_SCALE_DEFAULT = 1
......
......@@ -28,6 +28,7 @@ from deepspeed.utils import logger
# Keyword-argument keys for the dynamic loss-scale args dict that is built
# from the fp16 config section and consumed by the dynamic loss scaler.
INITIAL_LOSS_SCALE = 'init_scale'
SCALE_WINDOW = 'scale_window'
DELAYED_SHIFT = 'delayed_shift'
CONSECUTIVE_HYSTERESIS = 'consecutive_hysteresis'
MIN_LOSS_SCALE = 'min_scale'
......@@ -111,6 +112,7 @@ class DynamicLossScaler(LossScalerBase):
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
consecutive_hysteresis (bool, optional, default=False): Whether to refill hysteresis if we reach an iteration that doesn't overflow
"""
def __init__(self,
......@@ -190,6 +192,9 @@ class DynamicLossScaler(LossScalerBase):
self.last_overflow_iter = self.cur_iter
else:
if self.consecutive_hysteresis:
if dist.get_rank() == 0:
hysteresis_msg = f"Consecutive hysteresis is enabled. Restoring hysteresis to {self.delayed_shift}"
logger.info(hysteresis_msg)
self.cur_hysteresis = self.delayed_shift
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
if not self.consecutive_hysteresis:
......
......@@ -224,6 +224,7 @@ Example of <i>**scheduler**</i>
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
```
......@@ -264,6 +265,12 @@ Example of <i>**scheduler**</i>
| --------------------------------------------------------------------------------------------------- | ------- |
| <i>**hysteresis**</i> is a **fp16** parameter representing the delay shift in dynamic loss scaling. | `2` |
<i>**fp16:consecutive_hysteresis**</i>: [boolean]

| Description                                                                                                                                            | Default |
| ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------- |
| <i>**consecutive_hysteresis**</i> is a **fp16** parameter representing whether to refill the hysteresis if we reach an iteration that doesn't overflow | `false` |
<i>**fp16:min_loss_scale**</i>: [integer]
| Description | Default |
......
......@@ -201,6 +201,7 @@ Enable 16-bit (FP16) training by in the `deepspeed_config` JSON.
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
}
```
......
......@@ -37,6 +37,7 @@ Curriculum learning can be used by setting the `curriculum_learning` key in the
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
},
"curriculum_learning": {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册