diff --git a/deepspeed/runtime/hybrid_engine.py b/deepspeed/runtime/hybrid_engine.py
index b5c9ae3ab3d6d10711d75e668bf9cb94becf8719..cff4219d916b19b161ccfef7dc146bf116ecc862 100644
--- a/deepspeed/runtime/hybrid_engine.py
+++ b/deepspeed/runtime/hybrid_engine.py
@@ -431,8 +431,9 @@ class DeepSpeedHybridEngine(DeepSpeedEngine):
         super().step(lr_kwargs=lr_kwargs)
 
         if len(self._inference_containers) > 0:
-            for inference_container in self._inference_containers:
-                inference_container.reset_params()
+            if not self.Z3_enabled:
+                for inference_container in self._inference_containers:
+                    inference_container.reset_params()
 
         if self._training_start_time is not None:
             self._training_latency += (time.time() - self._training_start_time)
diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py
index 8db03bb6eb41b223171435f1280eb1ef8e354ce4..b54698fb16fd2a7689eff2943bb855ddf3f49981 100644
--- a/tests/unit/hybrid_engine/test_he_lora.py
+++ b/tests/unit/hybrid_engine/test_he_lora.py
@@ -9,8 +9,8 @@ import torch
 import torch.nn.functional as F
 import pytest
 import deepspeed
+from deepspeed.runtime.zero import GatheredParameters
 from deepspeed.ops.op_builder import OpBuilder
-from deepspeed.ops.adam import FusedAdam
 from deepspeed.utils import safe_get_full_grad
 import numpy.testing as npt
 from unit.common import DistributedTest
@@ -109,7 +109,9 @@ def only_optimize_lora_parameters(model):
 
 @pytest.mark.seq_inference
 @pytest.mark.parametrize("batch_size", [1], ids=["bsz=1"])
+@pytest.mark.parametrize("zero_stage", [2, 3], ids=["zero_stage=2", "zero_stage=3"])
 @pytest.mark.parametrize("model_name", ["EleutherAI/gpt-neo-125m", "facebook/opt-350m", "bigscience/bloom-560m"])
+@pytest.mark.parametrize("offload_device", ["none", "cpu"])
 class TestHybridEngineLoRA(DistributedTest):
     world_size = 1
 
@@ -139,9 +141,8 @@ class TestHybridEngineLoRA(DistributedTest):
         else:
             raise NotImplementedError(f"batch_size {batch_size} not implemented")
 
-    def test_lora(self, batch_size, model_name):
+    def test_lora(self, batch_size, model_name, zero_stage, offload_device):
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
         model = self.get_model(model_name)
         tokenizer = self.get_tokenizer(model_name)
         train_sentences = self.get_train_sentences(batch_size)
@@ -149,10 +150,33 @@ class TestHybridEngineLoRA(DistributedTest):
 
         # Inject LoRA
         model = convert_linear_layer_to_lora(model, "", 8)
         model = only_optimize_lora_parameters(model)
-        optim = FusedAdam([p for p in model.parameters() if p.requires_grad], lr=1.0, betas=(0.9, 0.95))
-        ds_config = {"train_batch_size": batch_size, "bfp16": {"enabled": True}, "hybrid_engine": {"enabled": True}}
-        model, *_ = deepspeed.initialize(model=model, optimizer=optim, config=ds_config)
+        ds_config = {
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1.0,
+                    "betas": [0.9, 0.95]
+                }
+            },
+            "train_batch_size": batch_size,
+            "fp16": {
+                "enabled": True,
+                "initial_scale_power": 12
+            },
+            "hybrid_engine": {
+                "enabled": True,
+                "pin_parameters": True
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+                "offload_optimizer": {
+                    "device": offload_device
+                }
+            }
+        }
+
+        model, *_ = deepspeed.initialize(model=model, config=ds_config)
 
         # Verify gradient norm is larger than 0
         before_grad_update_layer0_params = [
@@ -187,7 +211,9 @@ class TestHybridEngineLoRA(DistributedTest):
 
         # Verify fuse will mutate layer_params
         model.eval()
-        model.fuse_lora_weight()
+        with GatheredParameters(model.parameters()):
+            model.fuse_lora_weight()
+
         after_grad_update_layer0_params_lora_fused = [
             ele.detach().cpu().float().numpy() for ele in model.layer_params[0]
             if ele is not None and len(ele.shape) > 1
@@ -196,4 +222,6 @@ class TestHybridEngineLoRA(DistributedTest):
         for lhs, rhs in zip(before_grad_update_layer0_params, after_grad_update_layer0_params_lora_fused):
             with pytest.raises(AssertionError):
                 npt.assert_allclose(lhs, rhs, 1E-5, 1E-5)
-        model.unfuse_lora_weight()
+
+        with GatheredParameters(model.parameters()):
+            model.unfuse_lora_weight()
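
Note on the GatheredParameters usage in this patch: under ZeRO stage 3 each parameter is partitioned across data-parallel ranks and the local tensor can be an empty placeholder, so in-place weight mutations such as fuse_lora_weight() / unfuse_lora_weight() must run while the full parameters are materialized. GatheredParameters all-gathers the partitions for the duration of the context. A minimal sketch of the same pattern outside this test follows; the scale_weights helper is hypothetical and not part of the patch, and it assumes the module has already been wrapped by deepspeed.initialize with ZeRO stage 3.

    import torch
    import torch.distributed as dist
    from deepspeed.runtime.zero import GatheredParameters

    def scale_weights(module: torch.nn.Module, factor: float = 0.5) -> None:
        """Hypothetical helper: mutate possibly ZeRO-3-partitioned weights in place."""
        # Outside the context, a partitioned parameter's local tensor may be empty;
        # inside it, the full tensor is gathered onto each rank.
        with GatheredParameters(list(module.parameters()), modifier_rank=0):
            # modifier_rank=0 tells DeepSpeed to broadcast rank 0's in-place edits
            # back into the partitions when the context exits; without it,
            # modifications made inside the context are not persisted.
            if not dist.is_initialized() or dist.get_rank() == 0:
                for p in module.parameters():
                    p.data.mul_(factor)

The test above gathers without modifier_rank since fuse/unfuse runs on every rank; the sketch shows the modifier_rank variant for the common case where a single rank's modification must survive re-partitioning.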