From 7567c76c05626c5acd8b5700bedfc412c55d5354 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase
Date: Sat, 13 Nov 2021 09:38:51 -0800
Subject: [PATCH] Update offload parameter names (#1536)

Co-authored-by: Jeff Rasley
---
 deepspeed/runtime/zero/stage2.py |  4 +++-
 deepspeed/runtime/zero/stage3.py |  7 +++++--
 docs/code-docs/source/memory.rst | 20 ++++++++++----------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index fcc2580b..9147df3c 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -15,6 +15,7 @@ import collections
 from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler
 from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank, get_global_norm, see_memory_usage, is_model_parallel_parameter
 from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS
+from deepspeed.runtime.zero.offload_constants import OFFLOAD_CPU_DEVICE, OFFLOAD_OPTIMIZER, OFFLOAD_OPTIMIZER_DEVICE
 from deepspeed.ops.adam import DeepSpeedCPUAdam
 from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.utils import logger
@@ -2242,7 +2243,8 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params,
     """
     def format_options(cpu_offload):
         enabled = []
-        enabled.append(f"cpu_offload={1 if cpu_offload else 0}")
+        device = f'{OFFLOAD_CPU_DEVICE:4}' if cpu_offload else "none"
+        enabled.append(f"{OFFLOAD_OPTIMIZER}={device}")
         return ", ".join(enabled)
 
     nodes_str = "nodes" if num_nodes > 1 else "node"
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 852b619a..c306114d 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -3431,8 +3431,11 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params,
     """
     def format_options(cpu_offload, cpu_offload_params, zero_init):
         enabled = []
-        enabled.append(f"cpu_offload={1 if cpu_offload else 0}")
-        enabled.append(f"cpu_offload_params={1 if cpu_offload_params else 0}")
+        padded_cpu_str = f'{OFFLOAD_CPU_DEVICE:4}'
+        param_device = padded_cpu_str if cpu_offload_params else "none"
+        enabled.append(f"{OFFLOAD_PARAM}={param_device}")
+        optimizer_device = padded_cpu_str if cpu_offload else "none"
+        enabled.append(f"{OFFLOAD_OPTIMIZER}={optimizer_device}")
         enabled.append(f"zero_init={1 if zero_init else 0}")
         return ", ".join(enabled)
 
diff --git a/docs/code-docs/source/memory.rst b/docs/code-docs/source/memory.rst
index 892ef8f1..5c92dc19 100644
--- a/docs/code-docs/source/memory.rst
+++ b/docs/code-docs/source/memory.rst
@@ -128,19 +128,19 @@ The big question is how big of a model you can fit on the hardware you have? Or
 
 * ZeRO-2:
 
-  - ``"cpu_offload": true``: 2 * params
+  - ``"offload_optimizer": {"device": "cpu"}``: 2 * params
 
   Example: a 40GB GPU can fit ~11B param model (regardless of how many GPUs are used). Here the model is loaded in ``fp16`` so just the model weights take about 22GB and the remaining 18GB are used by other components. You can barely fit a very small batch size in this scenario.
 
-  - ``"cpu_offload": false``: 4 * params + 16 * params/ (total number of gpus)
+  - ``"offload_optimizer": {"device": "none"}``: 4 * params + 16 * params/ (total number of gpus)
 
 * ZeRO-3:
 
   ``largest_layer_memory = 4*largest_layer_params`` - GPU memory needed to gather the largest layer on a single GPU. 2 bytes fp16 params are gathered and 2 bytes fp16 grads are computed (total 4x). The optimizer states and fp32 parameters are updated in partitioned form and copied to fp16 params in partitioned form. This happens during the optimizer step. After that the fp16 params are sufficient.
 
-  - case 1: ``"cpu_offload": false, "cpu_offload_params": false`` - largest_layer_memory + 18 * params / total number of gpus across all nodes
-  - case 2: ``"cpu_offload": true, "cpu_offload_params": true``- largest_layer_memory. The main limit here is general RAM.
-  - case 3: ``"cpu_offload": true, "cpu_offload_params": false``- largest_layer_memory + 2 * params / total number of gpus across all nodes
+  - case 1: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "none"}`` - largest_layer_memory + 18 * params / total number of gpus across all nodes
+  - case 2: ``"offload_param": {"device": "cpu"}, "offload_optimizer": {"device": "cpu"}``- largest_layer_memory. The main limit here is general RAM.
+  - case 3: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "cpu"}``- largest_layer_memory + 2 * params / total number of gpus across all nodes
 
   Example:
 
@@ -194,11 +194,11 @@ In the following calculations we will use:
 
 * ZeRO-2:
 
-  - ``"cpu_offload": false``:
+  - ``"offload_optimizer": {"device": "none"}``:
 
     params * 4 * n_gpus * additional_buffer_factor - this is the memory needed only at the beginning to initialize the model on CPU memory
 
-  - ``"cpu_offload": true``:
+  - ``"offload_optimizer": {"device": "cpu"}``:
 
     params * max(4 * n_gpus, 16) * additional_buffer_factor
 
@@ -208,7 +208,7 @@ In the following calculations we will use:
 
   gpus_factor = n_gpus / total_gpus
 
-  - case 1: ``"cpu_offload": false``:
+  - case 1: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "none"}``:
 
     Without ``zero.Init``:
 
@@ -222,7 +222,7 @@ In the following calculations we will use:
 
     assuming Pytorch is deallocating the memory once the tensors are moved to the GPU by ZeRO.Init
 
-  - case 2: ``"cpu_offload": true, cpu_offload_params true``:
+  - case 2: ``"offload_param": {"device": "cpu"}, "offload_optimizer": {"device": "cpu"}``:
 
     Without ``zero.Init``:
 
@@ -232,7 +232,7 @@ In the following calculations we will use:
 
       params * 18 * gpus_factor * additional_buffer_factor
 
-  - case 3: ``"cpu_offload": true, cpu_offload_params false``:
+  - case 3: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "cpu"}``:
 
     Without ``zero.Init``:
 
--
GitLab
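
For reference, the renamed settings documented above live under ``zero_optimization`` in the DeepSpeed config. Below is a minimal sketch (not part of the patch) of a ZeRO-3 config that uses the new ``offload_optimizer``/``offload_param`` blocks in place of the old ``cpu_offload``/``cpu_offload_params`` booleans; the keys outside ``zero_optimization`` are assumed values included only so the dict is self-contained::

    # Sketch only: shows how the old boolean flags map onto the renamed
    # offload blocks that this patch documents. Values such as the batch
    # size are assumptions for illustration, not taken from the patch.
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,  # assumed value
        "fp16": {"enabled": True},
        "zero_optimization": {
            "stage": 3,
            # old: "cpu_offload": true        -> offload optimizer state to CPU
            "offload_optimizer": {"device": "cpu"},
            # old: "cpu_offload_params": true -> offload fp16 params to CPU
            "offload_param": {"device": "cpu"},
        },
    }

With ZeRO-2 only ``offload_optimizer`` applies; ``offload_param`` requires stage 3, which is why the ZeRO-2 sections in ``memory.rst`` list only the optimizer-offload variants.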