Unverified commit 34a11688, authored by loadams, committed by GitHub

Change zero_grad() argument to match pytorch (#2741)

Parent 867da307
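This commit renames the set_grads_to_None argument of the DeepSpeed optimizers' zero_grad() methods to set_to_none, matching the keyword (and, at the time, the default of False) used by torch.optim.Optimizer.zero_grad. Internal call sites that relied on the old True default now pass set_to_none=True explicitly. As a minimal sketch of what the flag controls, here is the equivalent call on a stock PyTorch optimizer; the model and optimizer below are illustrative placeholders, not part of this diff:

    import torch

    # Illustrative placeholders; any module/optimizer pair behaves the same.
    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    loss = model(torch.randn(8, 4)).sum()
    loss.backward()

    # set_to_none=True drops each .grad tensor, freeing its memory until the
    # next backward() allocates fresh gradients; set_to_none=False zero-fills
    # the existing gradient tensors in place instead.
    optimizer.zero_grad(set_to_none=True)
    assert all(p.grad is None for p in model.parameters())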
@@ -130,14 +130,14 @@ class FP16_Optimizer(DeepSpeedOptimizer):
         return
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None
                 else:
                     if p.grad is not None:
...
@@ -110,7 +110,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
         self.initialize_optimizer_states()
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -118,7 +118,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None
                 else:
                     if p.grad is not None:
...
@@ -1509,7 +1509,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         return params_in_partition, params_not_in_partition, first_offset
 
     @instrument_w_nvtx
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -1519,7 +1519,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     if p.grad is not None and p.grad.is_cuda:
                         p.grad.record_stream(torch.cuda.current_stream())
                     p.grad = None
@@ -1708,7 +1708,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
             self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition
 
         # release all the gradient since we have already created a necessary copy in dp_grad_partition
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
 
         for grad in filter(lambda g: g.is_cuda, self.averaged_gradients[sub_group_id]):
             grad.record_stream(torch.cuda.current_stream())
...@@ -1816,7 +1816,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): ...@@ -1816,7 +1816,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
def _overflow_clean_up(self, prev_scale): def _overflow_clean_up(self, prev_scale):
see_memory_usage('After overflow before clearing gradients', force=False) see_memory_usage('After overflow before clearing gradients', force=False)
self.zero_grad() self.zero_grad(set_to_none=True)
if self.offload_optimizer: if self.offload_optimizer:
self.reset_cpu_buffers() self.reset_cpu_buffers()
......
@@ -753,7 +753,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         # No need to keep the gradients anymore.
         # All gradients required by the step
         # are in self.averaged_gradients
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
         see_memory_usage(f"End ipg_epilogue")
 
         # resets all partition to no reduced
@@ -1526,7 +1526,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         return params_in_partition, params_not_in_partition, first_offset
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -1534,7 +1534,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         # For speed, set model fp16 grad to None by default
        for group in self.bit16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None  # epilogue and in step
                 else:
                     if p.grad is not None:
@@ -1766,7 +1766,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
                                 self.loss_scale))
         see_memory_usage('After overflow before clearing gradients')
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
 
         if self.cpu_offload:
             self.reset_cpu_buffers()
         else:
...