diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 2777fd728a8114277dce9e22fa4432c9d31db400..580b72ee119f5f72456d8e6876883ed4aa5cee7c 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1239,6 +1239,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def __partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: + offload_fp32_gradients = {} + offload_fp32_offsets = {} for param, grad_partition in zip(params_to_release, grad_partitions): if param.partition_numel() * dist.get_rank( self.dp_process_group) > param.ds_numel: @@ -1280,8 +1282,6 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # offload the gradient partition if applicable if self.offload_optimizer: i, dest_offset, _ = self.grad_position[self.get_param_id(param)] - offload_fp32_gradients = {} - offload_fp32_offsets = {} if self.is_gradient_accumulation_boundary: self.norm_for_param_grads[self.get_param_id(