Unverified commit 648f7bfa authored by Rana Ali Amjad, committed by GitHub

Bfloat16 zero2 (#1398)

* Changes for bfloat16 Zero2

* Cleaned up additional comments and debugging code

* Adapted fp16_master_weights_and_grads option to cover BF16

* Reverted fp16_master_weights_and_gradients extension to BFloat16 and minor cleanup

* Fixed formatting and variable naming errors recognized in testing

* Added relevant unit tests for bfloat16 with ZeRO-2

* Updates conditions for skipping BFloat16 unit tests

* Added check for NCCL inconsistent version naming convention

* Update skip message for Bfloat16 tests to mention additional checks
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Parent 2c5bba6d
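Before the diff itself, here is a minimal, hedged usage sketch of what this commit enables: bfloat16 training under ZeRO stage 2, selected purely through the DeepSpeed config. The config keys mirror the new BFLOAT16 constants and the unit-test configs further down this page; SimpleModel is the test helper used by those tests, and passing the dict directly via config_params is an assumption made here for brevity (the tests instead serialize the dict with args_from_dict and pass args).

import deepspeed
from simple_model import SimpleModel  # test helper from the unit tests below

# fp16 must stay disabled: DeepSpeedConfig asserts that fp16 and bfloat16
# cannot be enabled simultaneously, and bfloat16 currently requires ZeRO stage 2.
ds_config = {
    "train_batch_size": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 0.00015}},
    "fp16": {"enabled": False},
    "bfloat16": {"enabled": True},
    "zero_optimization": {"stage": 2},
}

model = SimpleModel(hidden_dim=10)
# No loss scaler is involved: with bfloat16 the loss scale is fixed at 1.0.
model, _, _, _ = deepspeed.initialize(model=model,
                                      model_parameters=model.parameters(),
                                      config_params=ds_config)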
......@@ -26,6 +26,11 @@
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
......@@ -46,6 +51,11 @@
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
......
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import sys
import types
from typing import Optional, Union
......@@ -124,7 +125,6 @@ def initialize(args=None,
__git_hash__,
__git_branch__),
ranks=[0])
assert model is not None, "deepspeed.initialize requires a model"
if not isinstance(model, PipelineModule):
......
......@@ -116,6 +116,15 @@ def get_fp16_enabled(param_dict):
return False
def get_bfloat16_enabled(param_dict):
if BFLOAT16 in param_dict.keys():
return get_scalar_param(param_dict[BFLOAT16],
BFLOAT16_ENABLED,
BFLOAT16_ENABLED_DEFAULT)
else:
return False
def get_fp16_master_weights_and_grads_enabled(param_dict):
if get_fp16_enabled(param_dict):
return get_scalar_param(param_dict[FP16],
......@@ -130,6 +139,8 @@ def get_loss_scale(param_dict):
return get_scalar_param(param_dict[FP16],
FP16_LOSS_SCALE,
FP16_LOSS_SCALE_DEFAULT)
elif get_bfloat16_enabled(param_dict):
return 1.0
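# bfloat16 shares fp32's exponent range, so dynamic loss scaling is unnecessary; a fixed scale of 1.0 is used.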
else:
return FP16_LOSS_SCALE_DEFAULT
......@@ -139,6 +150,8 @@ def get_initial_dynamic_scale(param_dict):
initial_scale_power = get_scalar_param(param_dict[FP16],
FP16_INITIAL_SCALE_POWER,
FP16_INITIAL_SCALE_POWER_DEFAULT)
elif get_bfloat16_enabled(param_dict):
initial_scale_power = 0
else:
initial_scale_power = FP16_INITIAL_SCALE_POWER_DEFAULT
......@@ -791,6 +804,9 @@ class DeepSpeedConfig(object):
self.gradient_clipping = get_gradient_clipping(param_dict)
self.fp16_enabled = get_fp16_enabled(param_dict)
self.bfloat16_enabled = get_bfloat16_enabled(param_dict)
assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled'
assert not (self.bfloat16_enabled and (self.zero_optimization_stage != 2)), 'bfloat16 mode is currently only supported with ZeRO stage 2'
self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(
param_dict)
self.amp_enabled = get_amp_enabled(param_dict)
......@@ -966,7 +982,7 @@ class DeepSpeedConfig(object):
assert self.zero_enabled and self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now."
def _do_warning_check(self):
fp16_enabled = self.fp16_enabled or self.zero_enabled
fp16_enabled = self.fp16_enabled
vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
......
......@@ -107,6 +107,22 @@ GRADIENT_ACCUMULATION_STEPS_DEFAULT = None
SPARSE_GRADIENTS = "sparse_gradients"
SPARSE_GRADIENTS_DEFAULT = False
#########################################
# BFLOAT16 support
#########################################
# BFLOAT16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bfloat16": {
"enabled": true
}
'''
BFLOAT16 = "bfloat16"
BFLOAT16_ENABLED = "enabled"
BFLOAT16_ENABLED_DEFAULT = False
#########################################
# FP16 support
#########################################
......
......@@ -83,6 +83,7 @@ def split_half_float_double_sparse(tensors):
"torch.cuda.HalfTensor",
"torch.cuda.FloatTensor",
"torch.cuda.DoubleTensor",
"torch.cuda.BFloat16Tensor",
SparseTensor.type()
]
......@@ -195,7 +196,6 @@ class DeepSpeedEngine(Module):
# Configure wall clock timer
self.timers = SynchronizedWallClockTimer()
# Throughput timer
self.tput_timer = ThroughputTimer(
batch_size=self.train_micro_batch_size_per_gpu(),
......@@ -530,6 +530,9 @@ class DeepSpeedEngine(Module):
def fp16_enabled(self):
return self._config.fp16_enabled
def bfloat16_enabled(self):
return self._config.bfloat16_enabled
def fp16_master_weights_and_gradients(self):
return self._config.fp16_master_weights_and_gradients
......@@ -762,6 +765,8 @@ class DeepSpeedEngine(Module):
f"fp16 is enabled but the following parameters have dtype that is not fp16: {', '.join(names)}"
)
self.module.half()
elif self.bfloat16_enabled():
self.module.bfloat16()
else:
if not all(
[param.dtype == torch.float for param in self.module.parameters()]):
......@@ -899,7 +904,7 @@ class DeepSpeedEngine(Module):
)
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
elif self.amp_enabled():
assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode"
assert not (self.fp16_enabled() or self.bfloat16_enabled()), "Cannot enable both amp with (legacy) fp16 or bfloat16 mode"
amp_params = self.amp_params()
if self.global_rank == 0:
logger.info(f"Initializing AMP with these params: {amp_params}")
......@@ -1537,9 +1542,13 @@ class DeepSpeedEngine(Module):
# Quantize the updated parameter if there is no overflow
if self.quantizer:
if self.fp16_enabled():
tensor_to_quantize = self.optimizer.bit16_groups if self.zero_optimization_stage(
) == 2 else self.optimizer.fp16_groups
else:
tensor_to_quantize = self.optimizer.param_groups
self.quantizer.quantize(
(self.optimizer.fp16_groups
if self.fp16_enabled() else self.optimizer.param_groups),
tensor_to_quantize,
(self.optimizer.overflow if self.fp16_enabled() else False),
self.eigenvalue_enabled(),
block_eigenvalue)
......@@ -2261,7 +2270,6 @@ class DeepSpeedEngine(Module):
method will hang waiting to synchronize with other processes if it's called just for the
process with rank 0.
"""
if self.zero_optimization_partition_weights():
# Prepare for state_dict() by ensuring all parameters are partitioned
self.optimizer.save_checkpoint_prologue()
......@@ -2501,22 +2509,23 @@ class DeepSpeedEngine(Module):
will be missing and others unsaved and then it'd be impossible to reconstruct state_dict
from the flattened weights.
optimizer.fp16_groups seems to be the easiest to use as it's in all zeroX versions.
optimizer.bit16_groups seems to be the easiest to use as it's in all zeroX versions.
"""
param_group_shapes = []
cnt = 0
numel = 0
# zero2 started using a round_robin_fp16_groups which is a shuffled version of fp16_groups -
# zero2 started using a round_robin_bit16_groups which is a shuffled version of bit16_groups -
# if we don't use it, we get parameters ordered incorrectly
if hasattr(self.optimizer, "round_robin_fp16_groups"):
fp16_groups = self.optimizer.round_robin_fp16_groups
if hasattr(self.optimizer, "round_robin_bit16_groups"):
bit16_groups = self.optimizer.round_robin_bit16_groups
else:
fp16_groups = self.optimizer.fp16_groups
bit16_groups = self.optimizer.bit16_groups if self.zero_optimization_stage(
) == 2 else self.optimizer.fp16_groups
for fp16_group in fp16_groups:
for bit16_group in bit16_groups:
param_shapes = OrderedDict()
for param in fp16_group:
for param in bit16_group:
cnt += 1
numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel()
shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape
......
This diff is collapsed.
import math
import torch
import deepspeed
import pytest
from deepspeed.ops.adam import FusedAdam
from common import distributed_test
from deepspeed.ops.op_builder import CPUAdamBuilder
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
from util import bf16_required_version_check
@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)])
def test_adam_bf16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"scheduler": {
"type": "OneCycle",
"params": {
"cycle_first_step_size": 16000,
"cycle_first_stair_count": 8000,
"decay_step_size": 16000,
"cycle_min_lr": 1e-06,
"cycle_max_lr": 3e-05,
"decay_lr_rate": 1e-07,
"cycle_min_mom": 0.85,
"cycle_max_mom": 0.99,
"decay_mom_rate": 0.0
}
},
"fp16": {
"enabled": False
},
"bfloat16": {
"enabled": True
},
"zero_optimization": {
"stage": zero_stage,
"cpu_offload": use_cpu_offload
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
@distributed_test(world_size=[1])
def _test_adam_bf16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
model = SimpleModel(hidden_dim)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device,
dtype=torch.bfloat16)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_adam_bf16_zero_onecycle_compatibility(args=args,
zero_stage=zero_stage,
hidden_dim=hidden_dim)
@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
config_dict = {
"train_batch_size": 4,
"steps_per_print": 1,
"fp16": {
"enabled": False,
},
"bfloat16": {
"enabled": True
},
"zero_optimization": {
"stage": zero_stage,
"cpu_offload": use_cpu_offload
},
"zero_allow_untested_optimizer": False
}
args = args_from_dict(tmpdir, config_dict)
@distributed_test(world_size=[1])
def _test_zero_allow_untested_optimizer(args, zero_stage):
hidden_dim = 10
model = SimpleModel(hidden_dim)
optimizer = SimpleOptimizer(model.parameters())
with pytest.raises(AssertionError):
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())
_test_zero_allow_untested_optimizer(args, zero_stage)
@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
if zero_stage == 3:
pytest.skip("skip for now")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 1,
"fp16": {
"enabled": False
},
"bfloat16": {
"enabled": True
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"zero_optimization": {
"stage": zero_stage,
"cpu_offload": use_cpu_offload,
"reduce_bucket_size": 100,
"allgather_bucket_size": 100
}
}
args = args_from_dict(tmpdir, config_dict)
@distributed_test(world_size=[3])
def _test_zero_empty_partition(args, zero_stage):
hidden_dim = 1
model = SimpleModel(hidden_dim)
# Ensure model has 2 parameters, to cause empty partition with DP=3
assert len(list(model.parameters())) == 2
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
# Now make sure things work..
data_loader = random_dataloader(model=model,
total_samples=1,
hidden_dim=hidden_dim,
device=model.device,
dtype=torch.bfloat16)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_zero_empty_partition(args=args, zero_stage=zero_stage)
@pytest.mark.parametrize('zero_stage, optimizer_constructor',
[(2,
torch.optim.Adam),
(2,
FusedAdam)])
def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
"fp16": {
"enabled": False
},
"bfloat16": {
"enabled": True
},
"zero_optimization": {
"stage": zero_stage
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
@distributed_test(world_size=[1])
def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
model = SimpleModel(hidden_dim)
client_optimizer = optimizer_constructor(params=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=client_optimizer)
_test_zero_supported_client_optimizer(args=args,
zero_stage=zero_stage,
optimizer_constructor=optimizer_constructor)
def test_zero2_reduce_scatter_off(tmpdir):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 2,
"contiguous_gradients": True,
"allgather_bucket_size": 2000000000,
"reduce_bucket_size": 200000000,
"overlap_comm": False,
"reduce_scatter": False
},
"fp16": {
"enabled": False
},
"bfloat16": {
"enabled": True
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim)
@distributed_test(world_size=[2])
def _helper(args, model, hidden_dim):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device,
dtype=torch.bfloat16)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_helper(args=args, model=model, hidden_dim=hidden_dim)
@pytest.mark.parametrize('stage', [2])
def test_zero_empty_grad(tmpdir, stage):
if not bf16_required_version_check():
pytest.skip(
"DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA >= 11.0 and HW support for BFloat16 to run correctly"
)
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": False
},
"bfloat16": {
"enabled": True
},
"zero_optimization": {
"stage": stage
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim)
@distributed_test(world_size=[1])
def _go(args, model, hidden_dim):
optimizer = torch.optim.Adam(model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device,
dtype=torch.bfloat16)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_go(args=args, model=model, hidden_dim=hidden_dim)
......@@ -427,7 +427,8 @@ def test_partition_nccl_alignment(tmpdir, zero_stage, world_size):
# get nccl all-gather send buffers alignment factor
nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor
for data_parallel_partitions in model.optimizer.parallel_partitioned_fp16_groups:
parallel_partitioned_bit16_groups = model.optimizer.parallel_partitioned_bit16_groups if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups
for data_parallel_partitions in parallel_partitioned_bit16_groups:
for partition_id, partitioned_data in enumerate(data_parallel_partitions):
# verify that data partition start locations are 4-byte aligned
assert (partitioned_data.data_ptr() %
......
import torch
from deepspeed.git_version_info import torch_info
def required_torch_version():
......@@ -9,3 +10,23 @@ def required_torch_version():
return True
else:
return False
def bf16_required_version_check():
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
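# Per the commit note about inconsistent NCCL version naming: some builds do not
# report the version as a (major, minor, ...) tuple; treat those as unsupported.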
if type(torch.cuda.nccl.version()) != tuple:
return False
else:
NCCL_MAJOR = torch.cuda.nccl.version()[0]
NCCL_MINOR = torch.cuda.nccl.version()[1]
CUDA_MAJOR = int(torch_info['cuda_version'].split('.')[0])
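# BF16 support requires torch >= 1.10, NCCL >= 2.10, CUDA 11+, and hardware BF16
# support (the same conditions listed in the test skip messages).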
if (TORCH_MAJOR > 1 or
(TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and (
NCCL_MAJOR > 2 or
(NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and torch.cuda.is_bf16_supported():
return True
else:
return False