未验证 提交 340fc0cf 编写于 作者: S ShijieZZZZ 提交者: GitHub

Report progress at gradient accumulation boundary (#2553)

* Report throughput progress only at the gradient accumulation boundary

* format

* format
上级 21c28029
......@@ -2038,7 +2038,7 @@ class DeepSpeedEngine(Module):
assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
"must provide optimizer during init in order to use step"
report_progress = self.global_rank == 0 if self.global_rank else True
report_progress = False
self._step_applied = False # assume False, will flip to True
......@@ -2065,6 +2065,8 @@ class DeepSpeedEngine(Module):
else:
self._take_model_step(lr_kwargs)
report_progress = self.global_rank == 0 if self.global_rank else True
self.tput_timer.stop(report_progress)
self._stop_timers(self.engine_timers.step_timers)
......
......@@ -193,18 +193,19 @@ class ThroughputTimer:
curr_samples_sec = (self.batch_size * self.num_workers) / duration
if self.local_step_count % self.steps_per_output == 0:
if report_speed:
self.logging(
"{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
.format(self.epoch_count,
self.local_step_count,
self.avg_samples_per_sec(),
curr_samples_sec,
round(torch.cuda.memory_allocated() / 1024**3,
2),
round(torch.cuda.max_memory_allocated() / 1024**3,
2)))
if report_speed:
self.logging(
"{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
.format(
self.epoch_count,
self.local_step_count,
self.avg_samples_per_sec(),
curr_samples_sec,
round(torch.cuda.memory_allocated() / 1024**3,
2),
round(torch.cuda.max_memory_allocated() / 1024**3,
2),
))
if self.monitor_memory:
virt_mem = psutil.virtual_memory()
swap = psutil.swap_memory()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册