diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index 34f9907c9aac7c77263346647f22e37d78779e61..c8e1192d1d560e23bae5f4637866c197ea65231c 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -23,7 +23,7 @@ jobs: - name: Install pytorch run: | - pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 8e8566d79669cc19af0c9122d38c13ba60cab08b..6528a01bc60fd1c1b97354b2d2e437a589026403 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index b8f19b5a11853b7112857ccf524dde3ab1d19162..c5c60751fb76a697411d477f227c6c11d2a839ad 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 983598e3f6d384b12e0c3a696787f65c46bf215c..0aca9d3d716a6cfb24fcd474fca7e15c49ba5fe2 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml index c88c87cf913940f34180878214210fb5003dcca9..ef2ef269be15bc14303180f86038bfe1bf7f644d 100644 --- a/.github/workflows/nv-megatron.yml +++ b/.github/workflows/nv-megatron.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index f8bf58c32bc838b2d96a89a32908e4552965fd16..b4ba8cef274d451a1c626cf1c504e554bcb600ac 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip3 install -U --cache-dir /blob/torch_cache torch python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -54,4 +54,4 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./ + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./ diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index d7adb107e46153136699bacd7efeefdd53472b5c..309514e7eb4cc89db00c6cb0e13b76798672f326 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 56e6b48d766ea4c2f3a1b699ff3fb1dec52d1331..91493a7866edd8c83b9025f7812ceb54b076a46a 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch19-p40.yml b/.github/workflows/nv-torch19-p40.yml index 064e77742f49bec53e21ac9210d9dc9379872162..6ae41651e9d2e0e03263601bfc899e04dc1435c3 100644 --- a/.github/workflows/nv-torch19-p40.yml +++ b/.github/workflows/nv-torch19-p40.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch19-v100.yml b/.github/workflows/nv-torch19-v100.yml index 615685c1546b4a81fbfb34e408ed9243f5293e00..fa4d58d13f7ed936e4bbc2ec35607f885ee20e7a 100644 --- a/.github/workflows/nv-torch19-v100.yml +++ b/.github/workflows/nv-torch19-v100.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index e433e44bdefd711530da63e755c3993a668b1ba9..834928288393316ea8ba5163276534fd9d23b576 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install --no-cache-dir torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/tests/unit/runtime/zero/test_qgzero.py b/tests/unit/runtime/zero/test_qgzero.py deleted file mode 100644 index ccd0f166d305f557a5415fc17728f68455eb42e7..0000000000000000000000000000000000000000 --- a/tests/unit/runtime/zero/test_qgzero.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import pytest -import deepspeed.comm as dist -from unit.common import DistributedTest -from unit.simple_model import random_dataloader - -import deepspeed -import torch.nn as nn - - -class NNModel(nn.Module): - - def __init__(self, h_dim=1024, n_layers=2): - super(NNModel, self).__init__() - self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)]) - self.cross_entropy_loss = nn.CrossEntropyLoss() - - def forward(self, x, y): - for layer in self.layers: - x = layer(x) - return self.cross_entropy_loss(x, y) - - -#Large sweep along hidden dim, num_layers of different sizes for qgZeRO. -@pytest.mark.parametrize("h_dim", [1024, 2000]) -@pytest.mark.parametrize("n_layers", [8, 20]) -class TesthpZeroConfigSweep(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "reduce_scatter": True, - "zero_quantized_gradients": True - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() diff --git a/tests/unit/runtime/zero/test_qwzero.py b/tests/unit/runtime/zero/test_qwzero.py deleted file mode 100644 index 71a0914e1a567e1f5477ca293d872bf1909d22bb..0000000000000000000000000000000000000000 --- a/tests/unit/runtime/zero/test_qwzero.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import pytest -import deepspeed.comm as dist -from unit.common import DistributedTest -from unit.simple_model import random_dataloader - -import deepspeed -import torch.nn as nn - - -class NNModel(nn.Module): - - def __init__(self, h_dim=1024, n_layers=2): - super(NNModel, self).__init__() - self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)]) - self.cross_entropy_loss = nn.CrossEntropyLoss() - - def forward(self, x, y): - for layer in self.layers: - x = layer(x) - return self.cross_entropy_loss(x, y) - - -#Large sweep along hidden dim, num_layers of different sizes for qwZeRO. -@pytest.mark.parametrize("h_dim", [1024, 2048]) -@pytest.mark.parametrize("n_layers", [8, 20]) -class TesthpZeroConfigSweep(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "reduce_scatter": True, - "zero_quantized_weights": True - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() diff --git a/tests/unit/runtime/zero/test_hpzero.py b/tests/unit/runtime/zero/test_zeropp.py similarity index 63% rename from tests/unit/runtime/zero/test_hpzero.py rename to tests/unit/runtime/zero/test_zeropp.py index 1d61d3c50a104adfcbc7d97b19fe42dda620a7eb..27ec7269afc6939366ed5eb16f610a1de8b3e96e 100644 --- a/tests/unit/runtime/zero/test_hpzero.py +++ b/tests/unit/runtime/zero/test_zeropp.py @@ -40,12 +40,19 @@ def _assert_no_secondary_tensor_group(model: Module) -> None: assert param.ds_zero_param_process_group is None +def _assert_secondary_tensor_size(model: Module) -> None: + for _, param in model.named_parameters(): + assert param.ds_secondary_tensor is not None + assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0 + + #Large sweep along hidden dim, num_layers, and zpg of different sizes #Assert when zpg=1 that secondary group and tensors are invalid -@pytest.mark.parametrize("h_dim", [1024, 2000]) -@pytest.mark.parametrize("n_layers", [8, 20]) +@pytest.mark.sequential +@pytest.mark.parametrize("h_dim", [1024]) +@pytest.mark.parametrize("n_layers", [4, 9]) @pytest.mark.parametrize("zpg", [1, 2, 4]) -class TesthpZeroConfigSweep(DistributedTest): +class TestZeroPPConfigSweep(DistributedTest): world_size = 4 def test(self, h_dim: int, n_layers: int, zpg: int) -> None: @@ -55,6 +62,8 @@ class TesthpZeroConfigSweep(DistributedTest): "stage": 3, "stage3_max_reuse_distance": 0, "zero_hpz_partition_size": zpg, + "zero_quantized_weights": True, + "zero_quantized_gradients": True, "contiguous_gradients": True, "overlap_comm": True, }, @@ -78,53 +87,8 @@ class TesthpZeroConfigSweep(DistributedTest): _assert_no_secondary_tensor_group(model) for n, batch in enumerate(data_loader): + if n == 0 and zpg != 1: + _assert_secondary_tensor_size(model) loss = model(batch[0], batch[1]) model.backward(loss) model.step() - - -def _assert_secondary_tensor_size(model: Module) -> None: - for _, param in model.named_parameters(): - assert param.ds_secondary_tensor is not None - assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0 - - -#Tests that secondary tensors are available and are of right sizes -@pytest.mark.parametrize("h_dim", [1024, 4000]) -@pytest.mark.parametrize("n_layers", [8, 20]) -@pytest.mark.parametrize("zpg", [2, 4]) -class TestSecondaryTensorSize(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int, zpg: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "zero_hpz_partition_size": zpg, - "contiguous_gradients": True, - "overlap_comm": True, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=4, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - _assert_secondary_tensor_size(model) - if n == 0: break