diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index fcc2580b2cc3affb60c75bd569dc76b8f7a28442..9147df3c9618931efe4642f2b1cf2d57d250af2f 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -15,6 +15,7 @@ import collections
 from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler
 from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank, get_global_norm, see_memory_usage, is_model_parallel_parameter
 from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS
+from deepspeed.runtime.zero.offload_constants import OFFLOAD_CPU_DEVICE, OFFLOAD_OPTIMIZER, OFFLOAD_OPTIMIZER_DEVICE
 from deepspeed.ops.adam import DeepSpeedCPUAdam
 from deepspeed.ops.op_builder import UtilsBuilder
 from deepspeed.utils import logger
@@ -2242,7 +2243,8 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params,
     """
     def format_options(cpu_offload):
         enabled = []
-        enabled.append(f"cpu_offload={1 if cpu_offload else 0}")
+        device = f'{OFFLOAD_CPU_DEVICE:4}' if cpu_offload else "none"
+        enabled.append(f"{OFFLOAD_OPTIMIZER}={device}")
         return ", ".join(enabled)

     nodes_str = "nodes" if num_nodes > 1 else "node"
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 852b619ab3d6345ddbf75a955eb24c3806911d3f..c306114d927b4133bd0fbd37ed08e203ec37f95c 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -3431,8 +3431,11 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params,
     """
     def format_options(cpu_offload, cpu_offload_params, zero_init):
         enabled = []
-        enabled.append(f"cpu_offload={1 if cpu_offload else 0}")
-        enabled.append(f"cpu_offload_params={1 if cpu_offload_params else 0}")
+        padded_cpu_str = f'{OFFLOAD_CPU_DEVICE:4}'
+        param_device = padded_cpu_str if cpu_offload_params else "none"
+        enabled.append(f"{OFFLOAD_PARAM}={param_device}")
+        optimizer_device = padded_cpu_str if cpu_offload else "none"
+        enabled.append(f"{OFFLOAD_OPTIMIZER}={optimizer_device}")
         enabled.append(f"zero_init={1 if zero_init else 0}")
         return ", ".join(enabled)

diff --git a/docs/code-docs/source/memory.rst b/docs/code-docs/source/memory.rst
index 892ef8f1cac37d5219a16fc9a4feffd673e29a7e..5c92dc199aa4de6002adb6f45a78c01881673ea7 100644
--- a/docs/code-docs/source/memory.rst
+++ b/docs/code-docs/source/memory.rst
@@ -128,19 +128,19 @@ The big question is how big of a model you can fit on the hardware you have? Or

 * ZeRO-2:

-  - ``"cpu_offload": true``: 2 * params
+  - ``"offload_optimizer": {"device": "cpu"}``: 2 * params

   Example: a 40GB GPU can fit ~11B param model (regardless of how many GPUs are used). Here the model is loaded in ``fp16`` so just the model weights take about 22GB and the remaining 18GB are used by other components. You can barely fit a very small batch size in this scenario.

-  - ``"cpu_offload": false``: 4 params + 16 params/ (total number of gpus)
+  - ``"offload_optimizer": {"device": "none"}``: 4 * params + 16 * params/ (total number of gpus)

 * ZeRO-3:

   ``largest_layer_memory = 4*largest_layer_params`` - GPU memory needed to gather the largest layer on a single GPU. 2 bytes fp16 params are gathered and 2 bytes fp16 grads are computed (total 4x). The optimizer states and fp32 parameters are updated in partitioned form and copied to fp16 params in partitioned form. This happens during the optimizer step. After that the fp16 params are sufficient.

-  - case 1: ``"cpu_offload": false, "cpu_offload_params": false`` - largest_layer_memory + 18 * params / total number of gpus across all nodes
-  - case 2: ``"cpu_offload": true, "cpu_offload_params": true``- largest_layer_memory. The main limit here is general RAM.
-  - case 3: ``"cpu_offload": true, "cpu_offload_params": false``- largest_layer_memory + 2 * params / total number of gpus across all nodes
+  - case 1: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "none"}`` - largest_layer_memory + 18 * params / total number of gpus across all nodes
+  - case 2: ``"offload_param": {"device": "cpu"}, "offload_optimizer": {"device": "cpu"}``- largest_layer_memory. The main limit here is general RAM.
+  - case 3: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "cpu"}``- largest_layer_memory + 2 * params / total number of gpus across all nodes

 Example:

@@ -194,11 +194,11 @@ In the following calculations we will use:

 * ZeRO-2:

-  - ``"cpu_offload": false``:
+  - ``"offload_optimizer": {"device": "none"}``:

     params * 4 * n_gpus * additional_buffer_factor - this is the memory needed only at the beginning to initialize the model on CPU memory

-  - ``"cpu_offload": true``:
+  - ``"offload_optimizer": {"device": "cpu"}``:

     params * max(4 * n_gpus, 16) * additional_buffer_factor

@@ -208,7 +208,7 @@ In the following calculations we will use:

     gpus_factor = n_gpus / total_gpus

-  - case 1: ``"cpu_offload": false``:
+  - case 1: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "none"}``:

     Without ``zero.Init``:

@@ -222,7 +222,7 @@ In the following calculations we will use:

     assuming Pytorch is deallocating the memory once the tensors are moved to the GPU by ZeRO.Init

-  - case 2: ``"cpu_offload": true, cpu_offload_params true``:
+  - case 2: ``"offload_param": {"device": "cpu"}, "offload_optimizer": {"device": "cpu"}``:

     Without ``zero.Init``:

@@ -232,7 +232,7 @@ In the following calculations we will use:

     params * 18 * gpus_factor * additional_buffer_factor

-  - case 3: ``"cpu_offload": true, cpu_offload_params false``:
+  - case 3: ``"offload_param": {"device": "none"}, "offload_optimizer": {"device": "cpu"}``:

     Without ``zero.Init``:
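For reviewers, the per-GPU arithmetic that the updated `memory.rst` text describes can be sketched as a small standalone helper. This is only an illustration of the documented formulas under the new `offload_optimizer` / `offload_param` naming; the function names below are invented for the example and are not DeepSpeed APIs.

```python
# Sketch of the per-GPU memory formulas described in memory.rst (fp16 weights,
# fp16 grads, fp32 optimizer states). Function names are illustrative only.

def zero2_gpu_mem_bytes(total_params: int, total_gpus: int,
                        offload_optimizer: bool) -> float:
    """ZeRO-2: 2 * params with "offload_optimizer": {"device": "cpu"},
    otherwise 4 * params + 16 * params / total_gpus."""
    if offload_optimizer:
        return 2 * total_params
    return 4 * total_params + 16 * total_params / total_gpus


def zero3_gpu_mem_bytes(total_params: int, largest_layer_params: int,
                        total_gpus: int, offload_optimizer: bool,
                        offload_param: bool) -> float:
    """ZeRO-3: largest_layer_memory = 4 * largest_layer_params, plus whatever
    partitioned model states remain on GPU for the chosen offload case."""
    largest_layer_memory = 4 * largest_layer_params
    if offload_param and offload_optimizer:   # case 2: params + optimizer on CPU
        return largest_layer_memory
    if offload_optimizer:                     # case 3: only optimizer on CPU
        return largest_layer_memory + 2 * total_params / total_gpus
    return largest_layer_memory + 18 * total_params / total_gpus  # case 1


if __name__ == "__main__":
    GB = 2**30
    params, largest_layer = 3_000_000_000, 500_000_000  # hypothetical 3B model
    print(f"ZeRO-2, optimizer offloaded: {zero2_gpu_mem_bytes(params, 8, True) / GB:.1f}GB/GPU")
    print(f"ZeRO-3, no offload, 8 GPUs: {zero3_gpu_mem_bytes(params, largest_layer, 8, False, False) / GB:.1f}GB/GPU")
```

These are the same quantities reported by the `estimate_zero2_model_states_mem_needs_all_cold` / `estimate_zero3_model_states_mem_needs_all_cold` helpers touched by this patch, whose option labels change here from `cpu_offload=0/1` to the `offload_optimizer` / `offload_param` device strings.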