From fd1d2c64472c1a3061a05eb3b56a3f882199cfba Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Thu, 29 Jun 2023 13:54:49 -0700
Subject: [PATCH] Reduce Unit Test Time (Part 2) (#3838)

* utilize shorter tests for MII (run only tests marked "deepspeed")

* use cached torch download (pip --cache-dir /blob/torch_cache)

* rework ZeRO++ unit tests (fold the qgZeRO/qwZeRO/hpZeRO sweeps into test_zeropp.py)

* formatting

---------

Co-authored-by: HeyangQin <heyangqin@microsoft.com>
---
 .github/workflows/amd-mi200.yml               |  2 +-
 .github/workflows/nv-accelerate-v100.yml      |  2 +-
 .github/workflows/nv-inference.yml            |  2 +-
 .github/workflows/nv-lightning-v100.yml       |  2 +-
 .github/workflows/nv-megatron.yml             |  2 +-
 .github/workflows/nv-mii.yml                  |  4 +-
 .github/workflows/nv-nightly.yml              |  2 +-
 .github/workflows/nv-torch-latest-v100.yml    |  2 +-
 .github/workflows/nv-torch19-p40.yml          |  2 +-
 .github/workflows/nv-torch19-v100.yml         |  2 +-
 .github/workflows/nv-transformers-v100.yml    |  2 +-
 tests/unit/runtime/zero/test_qgzero.py        | 61 ------------------
 tests/unit/runtime/zero/test_qwzero.py        | 61 ------------------
 .../zero/{test_hpzero.py => test_zeropp.py}   | 64 ++++---------------
 14 files changed, 26 insertions(+), 184 deletions(-)
 delete mode 100644 tests/unit/runtime/zero/test_qgzero.py
 delete mode 100644 tests/unit/runtime/zero/test_qwzero.py
 rename tests/unit/runtime/zero/{test_hpzero.py => test_zeropp.py} (63%)

diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
index 34f9907c..c8e1192d 100644
--- a/.github/workflows/amd-mi200.yml
+++ b/.github/workflows/amd-mi200.yml
@@ -23,7 +23,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 8e8566d7..6528a01b 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index b8f19b5a..c5c60751 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index 983598e3..0aca9d3d 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml
index c88c87cf..ef2ef269 100644
--- a/.github/workflows/nv-megatron.yml
+++ b/.github/workflows/nv-megatron.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index f8bf58c3..b4ba8cef 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip3 install -U --cache-dir /blob/torch_cache torch
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -54,4 +54,4 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
+          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index d7adb107..309514e7 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 56e6b48d..91493a78 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch19-p40.yml b/.github/workflows/nv-torch19-p40.yml
index 064e7774..6ae41651 100644
--- a/.github/workflows/nv-torch19-p40.yml
+++ b/.github/workflows/nv-torch19-p40.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch19-v100.yml b/.github/workflows/nv-torch19-v100.yml
index 615685c1..fa4d58d1 100644
--- a/.github/workflows/nv-torch19-v100.yml
+++ b/.github/workflows/nv-torch19-v100.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index e433e44b..83492828 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Install pytorch
         run: |
           # use the same pytorch version as transformers CI
-          pip install --no-cache-dir torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/tests/unit/runtime/zero/test_qgzero.py b/tests/unit/runtime/zero/test_qgzero.py
deleted file mode 100644
index ccd0f166..00000000
--- a/tests/unit/runtime/zero/test_qgzero.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-import pytest
-import deepspeed.comm as dist
-from unit.common import DistributedTest
-from unit.simple_model import random_dataloader
-
-import deepspeed
-import torch.nn as nn
-
-
-class NNModel(nn.Module):
-
-    def __init__(self, h_dim=1024, n_layers=2):
-        super(NNModel, self).__init__()
-        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
-        self.cross_entropy_loss = nn.CrossEntropyLoss()
-
-    def forward(self, x, y):
-        for layer in self.layers:
-            x = layer(x)
-        return self.cross_entropy_loss(x, y)
-
-
-#Large sweep along hidden dim, num_layers of different sizes for qgZeRO.
-@pytest.mark.parametrize("h_dim", [1024, 2000])
-@pytest.mark.parametrize("n_layers", [8, 20])
-class TesthpZeroConfigSweep(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "reduce_scatter": True,
-                "zero_quantized_gradients": True
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            model.step()
diff --git a/tests/unit/runtime/zero/test_qwzero.py b/tests/unit/runtime/zero/test_qwzero.py
deleted file mode 100644
index 71a0914e..00000000
--- a/tests/unit/runtime/zero/test_qwzero.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-import pytest
-import deepspeed.comm as dist
-from unit.common import DistributedTest
-from unit.simple_model import random_dataloader
-
-import deepspeed
-import torch.nn as nn
-
-
-class NNModel(nn.Module):
-
-    def __init__(self, h_dim=1024, n_layers=2):
-        super(NNModel, self).__init__()
-        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
-        self.cross_entropy_loss = nn.CrossEntropyLoss()
-
-    def forward(self, x, y):
-        for layer in self.layers:
-            x = layer(x)
-        return self.cross_entropy_loss(x, y)
-
-
-#Large sweep along hidden dim, num_layers of different sizes for qwZeRO.
-@pytest.mark.parametrize("h_dim", [1024, 2048])
-@pytest.mark.parametrize("n_layers", [8, 20])
-class TesthpZeroConfigSweep(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "reduce_scatter": True,
-                "zero_quantized_weights": True
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            model.step()
diff --git a/tests/unit/runtime/zero/test_hpzero.py b/tests/unit/runtime/zero/test_zeropp.py
similarity index 63%
rename from tests/unit/runtime/zero/test_hpzero.py
rename to tests/unit/runtime/zero/test_zeropp.py
index 1d61d3c5..27ec7269 100644
--- a/tests/unit/runtime/zero/test_hpzero.py
+++ b/tests/unit/runtime/zero/test_zeropp.py
@@ -40,12 +40,19 @@ def _assert_no_secondary_tensor_group(model: Module) -> None:
         assert param.ds_zero_param_process_group is None
 
 
+def _assert_secondary_tensor_size(model: Module) -> None:
+    for _, param in model.named_parameters():
+        assert param.ds_secondary_tensor is not None
+        assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0
+
+
 #Large sweep along hidden dim, num_layers, and zpg of different sizes
 #Assert when zpg=1 that secondary group and tensors are invalid
-@pytest.mark.parametrize("h_dim", [1024, 2000])
-@pytest.mark.parametrize("n_layers", [8, 20])
+@pytest.mark.sequential
+@pytest.mark.parametrize("h_dim", [1024])
+@pytest.mark.parametrize("n_layers", [4, 9])
 @pytest.mark.parametrize("zpg", [1, 2, 4])
-class TesthpZeroConfigSweep(DistributedTest):
+class TestZeroPPConfigSweep(DistributedTest):
     world_size = 4
 
     def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
@@ -55,6 +62,8 @@ class TesthpZeroConfigSweep(DistributedTest):
                 "stage": 3,
                 "stage3_max_reuse_distance": 0,
                 "zero_hpz_partition_size": zpg,
+                "zero_quantized_weights": True,
+                "zero_quantized_gradients": True,
                 "contiguous_gradients": True,
                 "overlap_comm": True,
             },
@@ -78,53 +87,8 @@ class TesthpZeroConfigSweep(DistributedTest):
             _assert_no_secondary_tensor_group(model)
 
         for n, batch in enumerate(data_loader):
+            if n == 0 and zpg != 1:
+                _assert_secondary_tensor_size(model)
             loss = model(batch[0], batch[1])
             model.backward(loss)
             model.step()
-
-
-def _assert_secondary_tensor_size(model: Module) -> None:
-    for _, param in model.named_parameters():
-        assert param.ds_secondary_tensor is not None
-        assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0
-
-
-#Tests that secondary tensors are available and are of right sizes
-@pytest.mark.parametrize("h_dim", [1024, 4000])
-@pytest.mark.parametrize("n_layers", [8, 20])
-@pytest.mark.parametrize("zpg", [2, 4])
-class TestSecondaryTensorSize(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "stage3_max_reuse_distance": 0,
-                "zero_hpz_partition_size": zpg,
-                "contiguous_gradients": True,
-                "overlap_comm": True,
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=4, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            _assert_secondary_tensor_size(model)
-            if n == 0: break
-- 
GitLab