From 7a132a9f4b37959f951b7c04a05207aba6054965 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 27 Nov 2021 12:08:01 -0800 Subject: [PATCH] port OVERFLOW log to ZeRO-2 (#1593) --- deepspeed/runtime/zero/stage2.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 462e7af3..ebbc1564 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -1626,6 +1626,14 @@ class FP16_DeepSpeedZeroOptimizer(object): prev_scale = self.loss_scale self._update_scale(self.overflow) if self.overflow: + + if dist.get_rank() == 0: + logger.info( + "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " + "reducing to {}".format(dist.get_rank(), + prev_scale, + self.loss_scale)) + see_memory_usage('After overflow before clearing gradients') self.zero_grad() if self.cpu_offload: -- GitLab