未验证 提交 99bd592d 作者: Olatunji Ruwase 提交者: GitHub

Synchronize folder creation; Single latest file writer (#1486)

* Synchronize folder creation; Single latest file writer

* Address PR feedback
上级 24dd285f
......
@@ -2271,6 +2271,7 @@ class DeepSpeedEngine(Module):
# Ensure save_dir directory exists
os.makedirs(save_dir, exist_ok=True)
torch.distributed.barrier()
if tag is None:
tag = f"global_step{self.global_steps}"
......
@@ -2294,14 +2295,15 @@ class DeepSpeedEngine(Module):
self._create_zero_checkpoint_files(save_dir, tag)
self._save_zero_checkpoint(save_dir, tag)
if self.zero_optimization_partition_weights():
self.optimizer.save_checkpoint_epilogue()
# Save latest checkpoint tag
if save_latest:
torch.distributed.barrier()
if save_latest and self.global_rank == 0:
with open(os.path.join(save_dir, 'latest'), 'w') as fd:
fd.write(tag)
if self.zero_optimization_partition_weights():
self.optimizer.save_checkpoint_epilogue()
return True
def _get_moe_state_dict(self, full_state_dict, num_local_experts, expp_rank):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册