From 01d17492c360753670662ae6cac65e0c35e70734 Mon Sep 17 00:00:00 2001
From: hablb <93904671+hablb@users.noreply.github.com>
Date: Wed, 26 Apr 2023 18:16:25 +0300
Subject: [PATCH] Fix memory leak in zero2 contiguous gradients (#3306)

extra_large_param_to_reduce is never used when contiguous_gradients is False,
but the assignment keeps a reference to the param for the lifetime of the
application.

Co-authored-by: Olatunji Ruwase
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/zero/stage_1_and_2.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index d84c9b98..4fe4fedc 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -839,14 +839,14 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
             Gradient computed twice for this partition. \
             Multiple gradient reduction is currently not supported"
 
-        if param.numel() > self.reduce_bucket_size:
-            self.extra_large_param_to_reduce = param
-
-        elif self.contiguous_gradients:
-            # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening
-            new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(0, self.elements_in_ipg_bucket, param.numel())
-            new_grad_tensor.copy_(param.grad.view(-1))
-            param.grad.data = new_grad_tensor.data.view_as(param.grad)
+        if self.contiguous_gradients:
+            if param.numel() > self.reduce_bucket_size:
+                self.extra_large_param_to_reduce = param
+            else:
+                # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening
+                new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(0, self.elements_in_ipg_bucket, param.numel())
+                new_grad_tensor.copy_(param.grad.view(-1))
+                param.grad.data = new_grad_tensor.data.view_as(param.grad)
 
         self.elements_in_ipg_bucket += param.numel()
--
GitLab
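
Editor's note: the sketch below is not DeepSpeed code. It is a minimal, standalone illustration of the leak pattern the patch fixes, assuming a hypothetical BucketedReducer class and add_grad method; only the attribute and flag names that appear in the diff (reduce_bucket_size, contiguous_gradients, extra_large_param_to_reduce) come from the source. The point is that when contiguous_gradients is False nothing ever reads or clears extra_large_param_to_reduce, so the old code's unconditional assignment kept the param (and its gradient) reachable for the lifetime of the optimizer object; the patch stores the reference only on the contiguous path, which later consumes it.

# Minimal sketch of the leak pattern and the fix (hypothetical class/method names).
import torch


class BucketedReducer:

    def __init__(self, reduce_bucket_size, contiguous_gradients):
        self.reduce_bucket_size = reduce_bucket_size
        self.contiguous_gradients = contiguous_gradients
        # Consumed and cleared only by the contiguous-gradient reduction path.
        self.extra_large_param_to_reduce = None

    def add_grad(self, param):
        # Pre-patch logic (leaky): the size check ran regardless of
        # contiguous_gradients, so with the flag off the stored reference
        # was never released:
        #
        #   if param.numel() > self.reduce_bucket_size:
        #       self.extra_large_param_to_reduce = param
        #   elif self.contiguous_gradients:
        #       ...copy the grad into the contiguous buffer...
        #
        # Post-patch logic: only track extra-large params when the contiguous
        # path will actually consume the reference later.
        if self.contiguous_gradients:
            if param.numel() > self.reduce_bucket_size:
                self.extra_large_param_to_reduce = param
            else:
                pass  # copy param.grad into the contiguous IPG buffer here


# With contiguous_gradients=False the reducer now holds no reference to the
# param, so the tensor can be freed once the caller drops it.
reducer = BucketedReducer(reduce_bucket_size=500_000, contiguous_gradients=False)
param = torch.nn.Parameter(torch.zeros(1_000_000))
param.grad = torch.zeros_like(param)
reducer.add_grad(param)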