Report progress at gradient accumulation boundary (#2553)

* report progress at gradient accumulation boundary
* format
* format

Report progress at gradient accumulation boundary (#2553)
* report progress at gradient accumulation boundary
* format
* format
340fc0cf · ShijieZZZZ · GitHub · 21c28029 · 340fc0cf · 340fc0cf
Hide whitespace changes
Inline | Side-by-side

Showing 2 changed files with 16 additions and 13 deletions

deepspeed/runtime/engine.py deepspeed/runtime/engine.py +3 -1

deepspeed/utils/timer.py deepspeed/utils/timer.py +13 -12

No files found.
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -2038,7 +2038,7 @@ class DeepSpeedEngine(Module):
        assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
            "must provide optimizer during init in order to use step"

-        report_progress = self.global_rank == 0 if self.global_rank else True
+        report_progress = False

        self._step_applied = False  # assume False, will flip to True

@@ -2065,6 +2065,8 @@ class DeepSpeedEngine(Module):
            else:
                self._take_model_step(lr_kwargs)

+            report_progress = self.global_rank == 0 if self.global_rank else True
+
        self.tput_timer.stop(report_progress)

        self._stop_timers(self.engine_timers.step_timers)

--- a/deepspeed/utils/timer.py
+++ b/deepspeed/utils/timer.py
@@ -193,18 +193,19 @@ class ThroughputTimer:

            curr_samples_sec = (self.batch_size * self.num_workers) / duration

-            if self.local_step_count % self.steps_per_output == 0:
-                if report_speed:
-                    self.logging(
-                        "{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
-                        .format(self.epoch_count,
-                                self.local_step_count,
-                                self.avg_samples_per_sec(),
-                                curr_samples_sec,
-                                round(torch.cuda.memory_allocated() / 1024**3,
-                                      2),
-                                round(torch.cuda.max_memory_allocated() / 1024**3,
-                                      2)))
+            if report_speed:
+                self.logging(
+                    "{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
+                    .format(
+                        self.epoch_count,
+                        self.local_step_count,
+                        self.avg_samples_per_sec(),
+                        curr_samples_sec,
+                        round(torch.cuda.memory_allocated() / 1024**3,
+                              2),
+                        round(torch.cuda.max_memory_allocated() / 1024**3,
+                              2),
+                    ))
                if self.monitor_memory:
                    virt_mem = psutil.virtual_memory()
                    swap = psutil.swap_memory()