diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
index 34f9907c9aac7c77263346647f22e37d78779e61..c8e1192d1d560e23bae5f4637866c197ea65231c 100644
--- a/.github/workflows/amd-mi200.yml
+++ b/.github/workflows/amd-mi200.yml
@@ -23,7 +23,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 8e8566d79669cc19af0c9122d38c13ba60cab08b..6528a01bc60fd1c1b97354b2d2e437a589026403 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index b8f19b5a11853b7112857ccf524dde3ab1d19162..c5c60751fb76a697411d477f227c6c11d2a839ad 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index 983598e3f6d384b12e0c3a696787f65c46bf215c..0aca9d3d716a6cfb24fcd474fca7e15c49ba5fe2 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml
index c88c87cf913940f34180878214210fb5003dcca9..ef2ef269be15bc14303180f86038bfe1bf7f644d 100644
--- a/.github/workflows/nv-megatron.yml
+++ b/.github/workflows/nv-megatron.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index f8bf58c32bc838b2d96a89a32908e4552965fd16..b4ba8cef274d451a1c626cf1c504e554bcb600ac 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip3 install -U --cache-dir /blob/torch_cache torch
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -54,4 +54,4 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "CPU or local" ./
+          TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m "deepspeed" ./
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index d7adb107e46153136699bacd7efeefdd53472b5c..309514e7eb4cc89db00c6cb0e13b76798672f326 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 56e6b48d766ea4c2f3a1b699ff3fb1dec52d1331..91493a7866edd8c83b9025f7812ceb54b076a46a 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir /blob/torch_cache torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch19-p40.yml b/.github/workflows/nv-torch19-p40.yml
index 064e77742f49bec53e21ac9210d9dc9379872162..6ae41651e9d2e0e03263601bfc899e04dc1435c3 100644
--- a/.github/workflows/nv-torch19-p40.yml
+++ b/.github/workflows/nv-torch19-p40.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch19-v100.yml b/.github/workflows/nv-torch19-v100.yml
index 615685c1546b4a81fbfb34e408ed9243f5293e00..fa4d58d13f7ed936e4bbc2ec35607f885ee20e7a 100644
--- a/.github/workflows/nv-torch19-v100.yml
+++ b/.github/workflows/nv-torch19-v100.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
index e433e44bdefd711530da63e755c3993a668b1ba9..834928288393316ea8ba5163276534fd9d23b576 100644
--- a/.github/workflows/nv-transformers-v100.yml
+++ b/.github/workflows/nv-transformers-v100.yml
@@ -25,7 +25,7 @@ jobs:
       - name: Install pytorch
         run: |
           # use the same pytorch version as transformers CI
-          pip install --no-cache-dir torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
+          pip install -U --cache-dir /blob/torch_cache torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/tests/unit/runtime/zero/test_qgzero.py b/tests/unit/runtime/zero/test_qgzero.py
deleted file mode 100644
index ccd0f166d305f557a5415fc17728f68455eb42e7..0000000000000000000000000000000000000000
--- a/tests/unit/runtime/zero/test_qgzero.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-import pytest
-import deepspeed.comm as dist
-from unit.common import DistributedTest
-from unit.simple_model import random_dataloader
-
-import deepspeed
-import torch.nn as nn
-
-
-class NNModel(nn.Module):
-
-    def __init__(self, h_dim=1024, n_layers=2):
-        super(NNModel, self).__init__()
-        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
-        self.cross_entropy_loss = nn.CrossEntropyLoss()
-
-    def forward(self, x, y):
-        for layer in self.layers:
-            x = layer(x)
-        return self.cross_entropy_loss(x, y)
-
-
-#Large sweep along hidden dim, num_layers of different sizes for qgZeRO.
-@pytest.mark.parametrize("h_dim", [1024, 2000])
-@pytest.mark.parametrize("n_layers", [8, 20])
-class TesthpZeroConfigSweep(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "reduce_scatter": True,
-                "zero_quantized_gradients": True
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            model.step()
diff --git a/tests/unit/runtime/zero/test_qwzero.py b/tests/unit/runtime/zero/test_qwzero.py
deleted file mode 100644
index 71a0914e1a567e1f5477ca293d872bf1909d22bb..0000000000000000000000000000000000000000
--- a/tests/unit/runtime/zero/test_qwzero.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-import pytest
-import deepspeed.comm as dist
-from unit.common import DistributedTest
-from unit.simple_model import random_dataloader
-
-import deepspeed
-import torch.nn as nn
-
-
-class NNModel(nn.Module):
-
-    def __init__(self, h_dim=1024, n_layers=2):
-        super(NNModel, self).__init__()
-        self.layers = nn.ModuleList([nn.Linear(h_dim, h_dim) for i in range(n_layers)])
-        self.cross_entropy_loss = nn.CrossEntropyLoss()
-
-    def forward(self, x, y):
-        for layer in self.layers:
-            x = layer(x)
-        return self.cross_entropy_loss(x, y)
-
-
-#Large sweep along hidden dim, num_layers of different sizes for qwZeRO.
-@pytest.mark.parametrize("h_dim", [1024, 2048])
-@pytest.mark.parametrize("n_layers", [8, 20])
-class TesthpZeroConfigSweep(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "reduce_scatter": True,
-                "zero_quantized_weights": True
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            model.step()
diff --git a/tests/unit/runtime/zero/test_hpzero.py b/tests/unit/runtime/zero/test_zeropp.py
similarity index 63%
rename from tests/unit/runtime/zero/test_hpzero.py
rename to tests/unit/runtime/zero/test_zeropp.py
index 1d61d3c50a104adfcbc7d97b19fe42dda620a7eb..27ec7269afc6939366ed5eb16f610a1de8b3e96e 100644
--- a/tests/unit/runtime/zero/test_hpzero.py
+++ b/tests/unit/runtime/zero/test_zeropp.py
@@ -40,12 +40,19 @@ def _assert_no_secondary_tensor_group(model: Module) -> None:
         assert param.ds_zero_param_process_group is None
 
 
+def _assert_secondary_tensor_size(model: Module) -> None:
+    for name, param in model.named_parameters():  # secondary must exist; its dim 0 a multiple of ds_tensor's
+        assert param.ds_secondary_tensor is not None, "no secondary tensor: " + name
+        assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0, "bad secondary size: " + name
+
+
 #Large sweep along hidden dim, num_layers, and zpg of different sizes
 #Assert when zpg=1 that secondary group and tensors are invalid
-@pytest.mark.parametrize("h_dim", [1024, 2000])
-@pytest.mark.parametrize("n_layers", [8, 20])
+@pytest.mark.sequential
+@pytest.mark.parametrize("h_dim", [1024])
+@pytest.mark.parametrize("n_layers", [4, 9])
 @pytest.mark.parametrize("zpg", [1, 2, 4])
-class TesthpZeroConfigSweep(DistributedTest):
+class TestZeroPPConfigSweep(DistributedTest):
     world_size = 4
 
     def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
@@ -55,6 +62,8 @@ class TesthpZeroConfigSweep(DistributedTest):
                 "stage": 3,
                 "stage3_max_reuse_distance": 0,
                 "zero_hpz_partition_size": zpg,
+                "zero_quantized_weights": True,
+                "zero_quantized_gradients": True,
                 "contiguous_gradients": True,
                 "overlap_comm": True,
             },
@@ -78,53 +87,8 @@ class TesthpZeroConfigSweep(DistributedTest):
             _assert_no_secondary_tensor_group(model)
 
         for n, batch in enumerate(data_loader):
+            if n == 0 and zpg != 1:
+                _assert_secondary_tensor_size(model)
             loss = model(batch[0], batch[1])
             model.backward(loss)
             model.step()
-
-
-def _assert_secondary_tensor_size(model: Module) -> None:
-    for _, param in model.named_parameters():
-        assert param.ds_secondary_tensor is not None
-        assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0
-
-
-#Tests that secondary tensors are available and are of right sizes
-@pytest.mark.parametrize("h_dim", [1024, 4000])
-@pytest.mark.parametrize("n_layers", [8, 20])
-@pytest.mark.parametrize("zpg", [2, 4])
-class TestSecondaryTensorSize(DistributedTest):
-    world_size = 4
-
-    def test(self, h_dim: int, n_layers: int, zpg: int) -> None:
-        config_dict = {
-            "train_micro_batch_size_per_gpu": 1,
-            "zero_optimization": {
-                "stage": 3,
-                "stage3_max_reuse_distance": 0,
-                "zero_hpz_partition_size": zpg,
-                "contiguous_gradients": True,
-                "overlap_comm": True,
-            },
-            "optimizer": {
-                "type": "Adam",
-                "params": {
-                    "lr": 1.
-                }
-            },
-            "fp16": {
-                "enabled": True,
-                "loss_scale": 1.,
-            }
-        }
-
-        model = NNModel(h_dim, n_layers)
-        model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
-        data_loader = random_dataloader(model=model, total_samples=4, hidden_dim=h_dim, device=model.device)
-        dist.barrier()
-
-        for n, batch in enumerate(data_loader):
-            loss = model(batch[0], batch[1])
-            model.backward(loss)
-            _assert_secondary_tensor_size(model)
-            if n == 0: break