From fd1d2c64472c1a3061a05eb3b56a3f882199cfba Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 29 Jun 2023 13:54:49 -0700 Subject: [PATCH] Reduce Unit Test Time (Part 2) (#3838) * utilize shorter tests for MII * use cached torch download * rework zero++ unit tests * formatting --------- Co-authored-by: HeyangQin --- .github/workflows/amd-mi200.yml | 2 +- .github/workflows/nv-accelerate-v100.yml | 2 +- .github/workflows/nv-inference.yml | 2 +- .github/workflows/nv-lightning-v100.yml | 2 +- .github/workflows/nv-megatron.yml | 2 +- .github/workflows/nv-mii.yml | 4 +- .github/workflows/nv-nightly.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch19-p40.yml | 2 +- .github/workflows/nv-torch19-v100.yml | 2 +- .github/workflows/nv-transformers-v100.yml | 2 +- tests/unit/runtime/zero/test_qgzero.py | 61 ------------------ tests/unit/runtime/zero/test_qwzero.py | 61 ------------------ .../zero/{test_hpzero.py => test_zeropp.py} | 64 ++++--------------- 14 files changed, 26 insertions(+), 184 deletions(-) delete mode 100644 tests/unit/runtime/zero/test_qgzero.py delete mode 100644 tests/unit/runtime/zero/test_qwzero.py rename tests/unit/runtime/zero/{test_hpzero.py => test_zeropp.py} (63%) diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index 34f9907c..c8e1192d 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -23,7 +23,7 @@ jobs: - name: Install pytorch run: | - pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 8e8566d7..6528a01b 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index b8f19b5a..c5c60751 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 983598e3..0aca9d3d 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml index c88c87cf..ef2ef269 100644 --- a/.github/workflows/nv-megatron.yml +++ b/.github/workflows/nv-megatron.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index f8bf58c3..b4ba8cef 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip3 install -U --cache-dir /blob/torch_cache torch python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -54,4 +54,4 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi cd tests - TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./ + TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./ diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index d7adb107..309514e7 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 56e6b48d..91493a78 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -24,7 +24,7 @@ jobs: - name: Install pytorch run: | - pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch19-p40.yml b/.github/workflows/nv-torch19-p40.yml index 064e7774..6ae41651 100644 --- a/.github/workflows/nv-torch19-p40.yml +++ b/.github/workflows/nv-torch19-p40.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-torch19-v100.yml b/.github/workflows/nv-torch19-v100.yml index 615685c1..fa4d58d1 100644 --- a/.github/workflows/nv-torch19-v100.yml +++ b/.github/workflows/nv-torch19-v100.yml @@ -20,7 +20,7 @@ jobs: - name: Install pytorch run: | - pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index e433e44b..83492828 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install --no-cache-dir torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html + pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/tests/unit/runtime/zero/test_qgzero.py b/tests/unit/runtime/zero/test_qgzero.py deleted file mode 100644 index ccd0f166..00000000 --- a/tests/unit/runtime/zero/test_qgzero.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import pytest -import deepspeed.comm as dist -from unit.common import DistributedTest -from unit.simple_model import random_dataloader - -import deepspeed -import torch.nn as nn - - -class NNModel(nn.Module): - - def __init__(self, h_dim=1024, n_layers=2): - super(NNModel, self).__init__() - self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)]) - self.cross_entropy_loss = nn.CrossEntropyLoss() - - def forward(self, x, y): - for layer in self.layers: - x = layer(x) - return self.cross_entropy_loss(x, y) - - -#Large sweep along hidden dim, num_layers of different sizes for qgZeRO. -@pytest.mark.parametrize("h_dim", [1024, 2000]) -@pytest.mark.parametrize("n_layers", [8, 20]) -class TesthpZeroConfigSweep(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "reduce_scatter": True, - "zero_quantized_gradients": True - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() diff --git a/tests/unit/runtime/zero/test_qwzero.py b/tests/unit/runtime/zero/test_qwzero.py deleted file mode 100644 index 71a0914e..00000000 --- a/tests/unit/runtime/zero/test_qwzero.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import pytest -import deepspeed.comm as dist -from unit.common import DistributedTest -from unit.simple_model import random_dataloader - -import deepspeed -import torch.nn as nn - - -class NNModel(nn.Module): - - def __init__(self, h_dim=1024, n_layers=2): - super(NNModel, self).__init__() - self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)]) - self.cross_entropy_loss = nn.CrossEntropyLoss() - - def forward(self, x, y): - for layer in self.layers: - x = layer(x) - return self.cross_entropy_loss(x, y) - - -#Large sweep along hidden dim, num_layers of different sizes for qwZeRO. -@pytest.mark.parametrize("h_dim", [1024, 2048]) -@pytest.mark.parametrize("n_layers", [8, 20]) -class TesthpZeroConfigSweep(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "reduce_scatter": True, - "zero_quantized_weights": True - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() diff --git a/tests/unit/runtime/zero/test_hpzero.py b/tests/unit/runtime/zero/test_zeropp.py similarity index 63% rename from tests/unit/runtime/zero/test_hpzero.py rename to tests/unit/runtime/zero/test_zeropp.py index 1d61d3c5..27ec7269 100644 --- a/tests/unit/runtime/zero/test_hpzero.py +++ b/tests/unit/runtime/zero/test_zeropp.py @@ -40,12 +40,19 @@ def _assert_no_secondary_tensor_group(model: Module) -> None: assert param.ds_zero_param_process_group is None +def _assert_secondary_tensor_size(model: Module) -> None: + for _, param in model.named_parameters(): + assert param.ds_secondary_tensor is not None + assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0 + + #Large sweep along hidden dim, num_layers, and zpg of different sizes #Assert when zpg=1 that secondary group and tensors are invalid -@pytest.mark.parametrize("h_dim", [1024, 2000]) -@pytest.mark.parametrize("n_layers", [8, 20]) +@pytest.mark.sequential +@pytest.mark.parametrize("h_dim", [1024]) +@pytest.mark.parametrize("n_layers", [4, 9]) @pytest.mark.parametrize("zpg", [1, 2, 4]) -class TesthpZeroConfigSweep(DistributedTest): +class TestZeroPPConfigSweep(DistributedTest): world_size = 4 def test(self, h_dim: int, n_layers: int, zpg: int) -> None: @@ -55,6 +62,8 @@ class TesthpZeroConfigSweep(DistributedTest): "stage": 3, "stage3_max_reuse_distance": 0, "zero_hpz_partition_size": zpg, + "zero_quantized_weights": True, + "zero_quantized_gradients": True, "contiguous_gradients": True, "overlap_comm": True, }, @@ -78,53 +87,8 @@ class TesthpZeroConfigSweep(DistributedTest): _assert_no_secondary_tensor_group(model) for n, batch in enumerate(data_loader): + if n == 0 and zpg != 1: + _assert_secondary_tensor_size(model) loss = model(batch[0], batch[1]) model.backward(loss) model.step() - - -def _assert_secondary_tensor_size(model: Module) -> None: - for _, param in model.named_parameters(): - assert param.ds_secondary_tensor is not None - assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0 - - -#Tests that secondary tensors are available and are of right sizes -@pytest.mark.parametrize("h_dim", [1024, 4000]) -@pytest.mark.parametrize("n_layers", [8, 20]) -@pytest.mark.parametrize("zpg", [2, 4]) -class TestSecondaryTensorSize(DistributedTest): - world_size = 4 - - def test(self, h_dim: int, n_layers: int, zpg: int) -> None: - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "zero_hpz_partition_size": zpg, - "contiguous_gradients": True, - "overlap_comm": True, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - model = NNModel(h_dim, n_layers) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) - data_loader = random_dataloader(model=model, total_samples=4, hidden_dim=h_dim, device=model.device) - dist.barrier() - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - _assert_secondary_tensor_size(model) - if n == 0: break -- GitLab