未验证 提交 d0dbc95a 编写于 作者: 郭叶军 提交者: GitHub

call empty_cache to really free up GPU memory as described in comment (#2620)

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
上级 b08cf416
......@@ -814,6 +814,10 @@ def get_ma_status():
return torch.cuda.memory_allocated()
def empty_cache():
    """Release all memory the CUDA caching allocator holds but is not using
    back to the device, so external tools report the true free GPU memory.

    No-op when CUDA is unavailable or has not been initialized.
    """
    release = torch.cuda.empty_cache
    release()
def see_memory_usage(message, force=False):
if not force:
return
......
......@@ -13,6 +13,7 @@ from deepspeed.runtime import ZeROOptimizer
from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler
from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank,
get_global_norm,
empty_cache,
see_memory_usage,
is_model_parallel_parameter,
align_dense_tensors,
......@@ -295,6 +296,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
see_memory_usage(f"Before moving param group {i} to CPU")
# move all the parameters to cpu to free up GPU space for creating flat buffer
move_to_cpu(self.bit16_groups[i])
empty_cache()
see_memory_usage(f"After moving param group {i} to CPU", force=False)
# Reorder group parameters for load balancing of gradient partitioning during backward among ranks.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册