Unverified · Commit 34a11688 authored by loadams, committed by GitHub

Change zero_grad() argument to match pytorch (#2741)

Parent 867da307
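This commit renames the `set_grads_to_None` keyword of the DeepSpeed optimizers' `zero_grad()` to `set_to_none`, matching `torch.optim.Optimizer.zero_grad(set_to_none=...)`, and flips the default from `True` to `False` to mirror PyTorch's signature. Call sites that relied on the old drop-the-gradients default now pass `set_to_none=True` explicitly. A minimal sketch of the two behaviors (generic PyTorch pattern, not DeepSpeed code):

```python
import torch

model = torch.nn.Linear(4, 2)
model(torch.randn(1, 4)).sum().backward()

# set_to_none=True: drop the gradient tensors entirely; frees memory,
# and the next backward() allocates fresh .grad tensors.
for p in model.parameters():
    p.grad = None

model(torch.randn(1, 4)).sum().backward()

# set_to_none=False: keep the tensors and zero them in place.
for p in model.parameters():
    if p.grad is not None:
        p.grad.detach_()
        p.grad.zero_()
```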
@@ -130,14 +130,14 @@ class FP16_Optimizer(DeepSpeedOptimizer):
         return
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None
                 else:
                     if p.grad is not None:
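The hunk is truncated at the `else` branch. In the DeepSpeed source the non-None path zeroes the existing tensor in place, roughly this standard pattern (a sketch, not an exact copy of the file):

```python
# sketch of the truncated else branch: reuse the tensor, zero it in place
if p.grad is not None:
    p.grad.detach_()   # drop any autograd history on the grad tensor
    p.grad.zero_()     # zero the buffer without reallocating
```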
@@ -110,7 +110,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
         self.initialize_optimizer_states()
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -118,7 +118,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None
                 else:
                     if p.grad is not None:
@@ -1509,7 +1509,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         return params_in_partition, params_not_in_partition, first_offset
 
     @instrument_w_nvtx
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -1519,7 +1519,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     if p.grad is not None and p.grad.is_cuda:
                         p.grad.record_stream(torch.cuda.current_stream())
                     p.grad = None
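Stage 3 is the only variant that guards the drop with `record_stream`: a gradient tensor may still be consumed by work queued on another CUDA stream, and `Tensor.record_stream` tells the caching allocator not to recycle its memory until that stream's pending work completes. A small self-contained illustration of the API (hypothetical tensors and streams, not DeepSpeed's):

```python
import torch

if torch.cuda.is_available():
    side = torch.cuda.Stream()
    grad = torch.ones(1 << 20, device="cuda")

    with torch.cuda.stream(side):
        out = grad * 2  # kernel on the side stream still reads `grad`

    # Mark `grad` as in use on `side` so that freeing it from the
    # default stream does not let the allocator hand the memory to a
    # new tensor before the side-stream kernel has finished.
    grad.record_stream(side)
    grad = None  # safe: the allocator defers reuse until `side` is done
```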
@@ -1708,7 +1708,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
         self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition
 
         # release all the gradient since we have already created a necessary copy in dp_grad_partition
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
 
         for grad in filter(lambda g: g.is_cuda, self.averaged_gradients[sub_group_id]):
             grad.record_stream(torch.cuda.current_stream())
@@ -1816,7 +1816,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer):
     def _overflow_clean_up(self, prev_scale):
         see_memory_usage('After overflow before clearing gradients', force=False)
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
 
         if self.offload_optimizer:
             self.reset_cpu_buffers()
@@ -753,7 +753,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         # No need to keep the gradients anymore.
         # All gradients required by the step
         # are in self.averaged_gradients
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
         see_memory_usage(f"End ipg_epilogue")
 
         # resets all partition to no reduced
@@ -1526,7 +1526,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         return params_in_partition, params_not_in_partition, first_offset
 
-    def zero_grad(self, set_grads_to_None=True):
+    def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
@@ -1534,7 +1534,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
         # For speed, set model fp16 grad to None by default
         for group in self.bit16_groups:
             for p in group:
-                if set_grads_to_None:
+                if set_to_none:
                     p.grad = None  # epilogue and in step
                 else:
                     if p.grad is not None:
@@ -1766,7 +1766,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
                                 self.loss_scale))
         see_memory_usage('After overflow before clearing gradients')
-        self.zero_grad()
+        self.zero_grad(set_to_none=True)
 
         if self.cpu_offload:
             self.reset_cpu_buffers()
         else:
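Across all four optimizers the pattern is the same: the rename makes every internal caller state the behavior it needs instead of leaning on the old default. Assuming an `optimizer` built from one of the classes above (hypothetical caller):

```python
# Before this commit, optimizer.zero_grad() dropped gradients because
# set_grads_to_None defaulted to True. The default is now False, matching
# PyTorch, so the memory-saving behavior is an explicit opt-in:
optimizer.zero_grad(set_to_none=True)  # drop .grad tensors (old default)
optimizer.zero_grad()                  # zero in place, as PyTorch does
```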