Unverified commit fd1d2c64, authored by Michael Wyatt, committed by GitHub

Reduce Unit Test Time (Part 2) (#3838)

* utilize shorter tests for MII

* use cached torch download

* rework zero++ unit tests

* formatting

---------
Co-authored-by: HeyangQin <heyangqin@microsoft.com>
Parent c973e157
@@ -23,7 +23,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip3 install -U --cache-dir /blob/torch_cache torch
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -54,4 +54,4 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
+          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./
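For context, `pytest -m "deepspeed"` collects only the tests carrying that marker, which is how this MII run is shortened to a subset of the suite. A minimal illustration of marker-based selection; the module and test names below are hypothetical, not MII's actual tests:

import pytest

# Selected by `pytest -m "deepspeed"`. Register the marker in pytest.ini
# or setup.cfg to silence PytestUnknownMarkWarning.
@pytest.mark.deepspeed
def test_deepspeed_smoke():
    assert True

# Deselected by the `-m "deepspeed"` filter above.
def test_long_running_sweep():
    assert True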
@@ -20,7 +20,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -24,7 +24,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -20,7 +20,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -20,7 +20,7 @@ jobs:
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -25,7 +25,7 @@ jobs:
       - name: Install pytorch
         run: |
           # use the same pytorch version as transformers CI
-          pip install --no-cache-dir torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import deepspeed.comm as dist
from unit.common import DistributedTest
from unit.simple_model import random_dataloader
import deepspeed

import torch.nn as nn


class NNModel(nn.Module):

    def __init__(self, h_dim=1024, n_layers=2):
        super(NNModel, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, x, y):
        for layer in self.layers:
            x = layer(x)
        return self.cross_entropy_loss(x, y)


# Large sweep along hidden dim, num_layers of different sizes for qgZeRO.
@pytest.mark.parametrize("h_dim", [1024, 2000])
@pytest.mark.parametrize("n_layers", [8, 20])
class TesthpZeroConfigSweep(DistributedTest):
    world_size = 4

    def test(self, h_dim: int, n_layers: int) -> None:
        config_dict = {
            "train_micro_batch_size_per_gpu": 1,
            "zero_optimization": {
                "stage": 3,
                "reduce_scatter": True,
                "zero_quantized_gradients": True
            },
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 1.
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 1.,
            }
        }

        model = NNModel(h_dim, n_layers)
        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
        dist.barrier()

        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
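The key switch in the config above is `zero_quantized_gradients`, which enables ZeRO++'s quantized gradient communication (qgZ): gradients are compressed to low precision before being reduce-scattered across ranks, cutting communication volume. A rough standalone sketch of the idea, a symmetric per-tensor int8 round-trip, not DeepSpeed's actual fused kernels:

import torch

def quantize_int8(t: torch.Tensor):
    # Symmetric per-tensor quantization: map [-amax, amax] onto [-127, 127].
    scale = t.abs().max().clamp(min=1e-8) / 127.0
    q = (t / scale).round().clamp(-127, 127).to(torch.int8)
    return q, scale

def dequantize_int8(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return q.float() * scale

grad = torch.randn(1 << 20)
q, scale = quantize_int8(grad)
# The int8 payload is 4x smaller than fp32 (2x smaller than fp16) on the wire,
# at the cost of a bounded round-trip error.
error = (grad - dequantize_int8(q, scale)).abs().max()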
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import deepspeed.comm as dist
from unit.common import DistributedTest
from unit.simple_model import random_dataloader
import deepspeed

import torch.nn as nn


class NNModel(nn.Module):

    def __init__(self, h_dim=1024, n_layers=2):
        super(NNModel, self).__init__()
        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, x, y):
        for layer in self.layers:
            x = layer(x)
        return self.cross_entropy_loss(x, y)


# Large sweep along hidden dim, num_layers of different sizes for qwZeRO.
@pytest.mark.parametrize("h_dim", [1024, 2048])
@pytest.mark.parametrize("n_layers", [8, 20])
class TesthpZeroConfigSweep(DistributedTest):
    world_size = 4

    def test(self, h_dim: int, n_layers: int) -> None:
        config_dict = {
            "train_micro_batch_size_per_gpu": 1,
            "zero_optimization": {
                "stage": 3,
                "reduce_scatter": True,
                "zero_quantized_weights": True
            },
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 1.
                }
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 1.,
            }
        }

        model = NNModel(h_dim, n_layers)
        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
        dist.barrier()

        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
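`zero_quantized_weights` is the weight-side counterpart (qwZ): the fp16 parameter shards that ZeRO-3 all-gathers before each forward and backward pass travel in quantized form. ZeRO++ describes this as block-based quantization, where each block carries its own scale to keep the error local. A sketch of that scheme, assuming the tensor's size is an exact multiple of the block size (real shards would be padded):

import torch

def blockwise_quantize(w: torch.Tensor, block: int = 256):
    # One scale per block bounds the error better than a single
    # per-tensor scale when magnitudes vary across the tensor.
    blocks = w.float().reshape(-1, block)
    scales = blocks.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
    q = (blocks / scales).round().clamp(-127, 127).to(torch.int8)
    return q, scales

def blockwise_dequantize(q: torch.Tensor, scales: torch.Tensor, shape) -> torch.Tensor:
    return (q.float() * scales).reshape(shape).half()

w = torch.randn(1024, 1024).half()
q, s = blockwise_quantize(w)          # what would travel over the wire
w_hat = blockwise_dequantize(q, s, w.shape)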
@@ -40,12 +40,19 @@ def _assert_no_secondary_tensor_group(model: Module) -> None:
         assert param.ds_zero_param_process_group is None


 def _assert_secondary_tensor_size(model: Module) -> None:
     for _, param in model.named_parameters():
         assert param.ds_secondary_tensor is not None
         assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0


 #Large sweep along hidden dim, num_layers, and zpg of different sizes
 #Assert when zpg=1 that secondary group and tensors are invalid
-@pytest.mark.parametrize("h_dim", [1024, 2000])
-@pytest.mark.parametrize("n_layers", [8, 20])
+@pytest.mark.sequential
+@pytest.mark.parametrize("h_dim", [1024])
+@pytest.mark.parametrize("n_layers", [4, 9])
 @pytest.mark.parametrize("zpg", [1, 2, 4])
-class TesthpZeroConfigSweep(DistributedTest):
+class TestZeroPPConfigSweep(DistributedTest):
     world_size = 4

     def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
@@ -55,6 +62,8 @@ class TesthpZeroConfigSweep(DistributedTest):
                 "stage": 3,
                 "stage3_max_reuse_distance": 0,
                 "zero_hpz_partition_size": zpg,
+                "zero_quantized_weights": True,
+                "zero_quantized_gradients": True,
                 "contiguous_gradients": True,
                 "overlap_comm": True,
             },
@@ -78,53 +87,8 @@ class TesthpZeroConfigSweep(DistributedTest):
             _assert_no_secondary_tensor_group(model)

         for n, batch in enumerate(data_loader):
             if n == 0 and zpg != 1:
                 _assert_secondary_tensor_size(model)
             loss = model(batch[0], batch[1])
             model.backward(loss)
             model.step()
-
-
-def _assert_secondary_tensor_size(model: Module) -> None:
-    for _, param in model.named_parameters():
-        assert param.ds_secondary_tensor is not None
-        assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0
-
-
-#Tests that secondary tensors are available and are of right sizes
-@pytest.mark.parametrize("h_dim", [1024, 4000])
-@pytest.mark.parametrize("n_layers", [8, 20])
-@pytest.mark.parametrize("zpg", [2, 4])
-class TestSecondaryTensorSize(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "stage3_max_reuse_distance": 0,
-                "zero_hpz_partition_size": zpg,
-                "contiguous_gradients": True,
-                "overlap_comm": True,
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=4, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            _assert_secondary_tensor_size(model)
-            if n == 0: break
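The surviving assertions encode hpZ's partitioning arithmetic: ZeRO-3 shards each parameter across all `world_size` ranks (`ds_tensor`), while hpZ keeps a secondary copy sharded only within the smaller `zero_hpz_partition_size` (`zpg`) subgroup (`ds_secondary_tensor`), so each secondary shard should be `world_size / zpg` times the primary one; with `zpg=1` the secondary group is disabled, which `_assert_no_secondary_tensor_group` checks. A back-of-the-envelope check of that size relation, assuming shards divide evenly (DeepSpeed pads in practice):

def shard_sizes(numel: int, world_size: int, zpg: int):
    primary = numel // world_size    # ds_tensor: one slice per rank
    secondary = numel // zpg         # ds_secondary_tensor: one slice per subgroup rank
    return primary, secondary

# world_size=4 as in the test above; zpg sweeps the subgroup sizes
for zpg in (2, 4):
    p, s = shard_sizes(numel=1024 * 1024, world_size=4, zpg=zpg)
    assert s % p == 0                # mirrors _assert_secondary_tensor_size
    print(f"zpg={zpg}: secondary shard is {s // p}x the primary shard")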