Unverified commit 9647ea79, authored by Michael Wyatt, committed by GitHub

Add MuP optimizers (#2043)

* added paths for mup optimizers

* added tests

* formatting

* Add license, fix missing distributed test, formatting

* Add mpi4py to confirm tests work

* Undo requirements change

* Move to runtime folder

* Rework to match new format

* missing comma

* hidden dim fix

---------
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Logan Adams <loadams@microsoft.com>
Parent: d6c2e6b0
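The new optimizers are selected through the regular optimizer block of the DeepSpeed config, using the type names registered in the first hunk below (MuAdam, MuAdamW, MuSGD). A minimal usage sketch follows, mirroring the unit test added by this commit; the two-layer model is an illustrative stand-in for a real muP-parameterized model, and the mup package must be installed separately (this commit only adds it to the dev requirements).

# Usage sketch (not part of the diff): selecting one of the new MuP optimizers
# through the DeepSpeed config. The "type"/"params" layout and the
# set_base_shapes(model, None) call mirror the unit test added below; the
# two-layer model here is illustrative only.
import deepspeed
import torch.nn as nn
from mup.shape import set_base_shapes  # requires the mup package (pip install mup)

ds_config = {
    "train_batch_size": 2,
    "optimizer": {
        "type": "MuAdam",  # or "MuAdamW" / "MuSGD"
        "params": {
            "lr": 0.00015
        }
    }
}

model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 1))
set_base_shapes(model, None)  # register muP base shapes before initialization
engine, optimizer, _, _ = deepspeed.initialize(config=ds_config,
                                               model=model,
                                               model_parameters=model.parameters())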
@@ -73,9 +73,12 @@ LAMB_OPTIMIZER = 'lamb'
 ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
 ZERO_ONE_ADAM_OPTIMIZER = 'zerooneadam'
 ONEBIT_LAMB_OPTIMIZER = 'onebitlamb'
+MUADAM_OPTIMIZER = 'muadam'
+MUADAMW_OPTIMIZER = 'muadamw'
+MUSGD_OPTIMIZER = 'musgd'
 DEEPSPEED_OPTIMIZERS = [
     ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER,
-    ZERO_ONE_ADAM_OPTIMIZER
+    ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, MUSGD_OPTIMIZER
 ]
 # extra optimizer parameters for adam/adamw
......
@@ -37,7 +37,7 @@ from deepspeed.runtime.bf16_optimizer import BF16_Optimizer
 from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \
     ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \
-    TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER
+    TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, MUSGD_OPTIMIZER
 from deepspeed.runtime.dataloader import DeepSpeedDataLoader
 from deepspeed.runtime.constants import \
@@ -1298,6 +1298,24 @@ class DeepSpeedEngine(Module):
             optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters)
             if not self.fp16_enabled():
                 logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16")
+        elif self.optimizer_name() == MUADAM_OPTIMIZER:
+            try:
+                from mup import MuAdam
+            except ImportError:
+                logger.error(f"Install mup to use MuAdam optimizer")
+            optimizer = MuAdam(model_parameters, **optimizer_parameters)
+        elif self.optimizer_name() == MUADAMW_OPTIMIZER:
+            try:
+                from mup import MuAdamW
+            except ImportError:
+                logger.error(f"Install mup to use MuAdamW optimizer")
+            optimizer = MuAdamW(model_parameters, **optimizer_parameters)
+        elif self.optimizer_name() == MUSGD_OPTIMIZER:
+            try:
+                from mup import MuSGD
+            except ImportError:
+                logger.error(f"Install mup to use MuSGD optimizer")
+            optimizer = MuSGD(model_parameters, **optimizer_parameters)
         else:
             torch_optimizer = getattr(torch.optim, self.optimizer_name())
             optimizer = torch_optimizer(model_parameters, **optimizer_parameters)
......
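For reference, MuAdam, MuAdamW and MuSGD are thin wrappers in the mup package: they rescale per-parameter-group hyperparameters according to the base shapes registered by set_base_shapes and then construct the corresponding torch.optim class. That is why the engine can pass model_parameters and optimizer_parameters straight through, and why the unit test below checks the resulting optimizer with isinstance against torch.optim.Adam/AdamW/SGD. A standalone sketch of that behavior (the small model is illustrative):

# Standalone sketch (not part of the diff): mup's optimizer helpers return
# ordinary torch.optim instances built from muP-adjusted parameter groups,
# which is what the engine branches above construct and the new test asserts.
import torch
import torch.nn as nn
from mup import MuAdam, MuAdamW, MuSGD
from mup.shape import set_base_shapes

net = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 1))
set_base_shapes(net, None)  # same base-shape registration the test performs

for mu_opt, torch_cls in [(MuAdam, torch.optim.Adam),
                          (MuAdamW, torch.optim.AdamW),
                          (MuSGD, torch.optim.SGD)]:
    opt = mu_opt(net.parameters(), lr=0.00015)
    assert isinstance(opt, torch_cls)  # mup hands back the underlying torch optimizer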
@@ -4,6 +4,7 @@ coverage
 docutils<0.18
 future
 importlib-metadata>=4
+mup
 pre-commit>=2.20.0
 pytest
 pytest-forked
......
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import deepspeed
import torch
import pytest

from unit.common import DistributedTest
from unit.simple_model import SimpleModel, random_dataloader
from mup.shape import set_base_shapes


@pytest.mark.parametrize("optimizer, expected_opt_class", [("MuAdam", torch.optim.Adam),
                                                           ("MuAdamW", torch.optim.AdamW),
                                                           ("MuSGD", torch.optim.SGD)])  # yapf: disable
@pytest.mark.parametrize("zero_offload", [True, False])  # yapf: disable
class TestMuPOptimizers(DistributedTest):
    world_size = 1
    reuse_dist_env = True

    def test(self, optimizer, expected_opt_class, zero_offload):
        config_dict = {
            "train_batch_size": 2,
            "steps_per_print": 1,
            "zero_allow_untested_optimizer": True,
            "optimizer": {
                "type": optimizer,
                "params": {
                    "lr": 0.00015,
                }
            },
            "gradient_clipping": 1.0,
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": 2,
                "cpu_offload": zero_offload
            }
        }
        hidden_dim = 10
        model = SimpleModel(hidden_dim)
        set_base_shapes(model, None)
        model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
        data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()

        ds_optimizer = model.optimizer.optimizer
        assert isinstance(ds_optimizer, expected_opt_class)