Unverified commit c8641032, authored by Olatunji Ruwase, committed by GitHub

Offload all gradients to nvme (#2282)

Parent commit: b146aa35
@@ -1239,6 +1239,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
     def __partition_grads(self,
                           params_to_release: List[Parameter],
                           grad_partitions: List[Tensor]) -> None:
+        offload_fp32_gradients = {}
+        offload_fp32_offsets = {}
         for param, grad_partition in zip(params_to_release, grad_partitions):
             if param.partition_numel() * dist.get_rank(
                     self.dp_process_group) > param.ds_numel:
@@ -1280,8 +1282,6 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
             # offload the gradient partition if applicable
             if self.offload_optimizer:
                 i, dest_offset, _ = self.grad_position[self.get_param_id(param)]
-                offload_fp32_gradients = {}
-                offload_fp32_offsets = {}
                 if self.is_gradient_accumulation_boundary:
                     self.norm_for_param_grads[self.get_param_id(
                         ...
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册