Unverified · Commit 865104be authored by Olatunji Ruwase, committed by GitHub

Support optimizer AdamW type (#670)

Parent f032e56f
......@@ -27,17 +27,18 @@ from ..profiling.config import DeepSpeedFlopsProfilerConfig
TENSOR_CORE_ALIGN_SIZE = 8
ADAM_OPTIMIZER = 'adam'
ADAMW_OPTIMIZER = 'adamw'
LAMB_OPTIMIZER = 'lamb'
ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
DEEPSPEED_OPTIMIZERS = [
ADAM_OPTIMIZER,
ADAMW_OPTIMIZER,
LAMB_OPTIMIZER,
ONEBIT_ADAM_OPTIMIZER,
]
# extra optimizer parameters for adam
# extra optimizer parameters for adam/adamw
TORCH_ADAM_PARAM = "torch_adam"
ADAM_W_MODE_PARAM = "adam_w_mode"
class DeepSpeedConfigError(Exception):
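
For reference, these constants line up with the user-facing "optimizer" block of a DeepSpeed config. A minimal sketch, mirroring the test_fp16_adam_types case added at the end of this diff; only keys that appear somewhere in this diff are used, and the placement of "torch_adam" follows that test:

    ds_config = {
        "train_batch_size": 1,
        "fp16": {
            "enabled": True
        },
        "optimizer": {
            "type": "AdamW",       # matched against ADAMW_OPTIMIZER
            "torch_adam": True,    # TORCH_ADAM_PARAM: request the torch implementation
            "params": {
                "lr": 0.00015
            }
        }
    }
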
......@@ -19,8 +19,8 @@ from deepspeed.runtime.activation_checkpointing import checkpointing as activati
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM, ADAM_W_MODE_PARAM
ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \
TORCH_ADAM_PARAM
from deepspeed.runtime.dataloader import DeepSpeedDataLoader
from deepspeed.runtime.constants import \
......@@ -582,10 +582,9 @@ class DeepSpeedEngine(Module):
"'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
)
if self.optimizer_name() == ADAM_OPTIMIZER:
if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]:
torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE_PARAM, True)
adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER
# zero-offload torch-adam adam_w_mode optimizer
# T|F T T torch.optim.AdamW
# T|F T F torch.optim.Adam
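
The comment table above is truncated by the diff view; the two visible rows cover the torch_adam=True path, and the hunks above and below show that adam_w_mode is now derived from the optimizer type and then forwarded to the fused/offload implementations. A sketch of the implied dispatch, assuming the standard deepspeed.ops.adam import path and a hypothetical helper name (not the verbatim engine code); optimizer_parameters is the config "params" dict with the DeepSpeed-only keys already popped:

    import torch
    from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam  # assumed import path

    def select_basic_optimizer(model_parameters, optimizer_parameters,
                               torch_adam, adam_w_mode, zero_offload):
        # torch_adam=True: plain PyTorch optimizers, per the two visible table rows.
        if torch_adam:
            cls = torch.optim.AdamW if adam_w_mode else torch.optim.Adam
            return cls(model_parameters, **optimizer_parameters)
        # zero-offload: CPU Adam, with AdamW behaviour toggled via adamw_mode
        # (constructor arguments shown in the hunk below; this branch condition is assumed).
        if zero_offload:
            return DeepSpeedCPUAdam(model_parameters,
                                    **optimizer_parameters,
                                    adamw_mode=adam_w_mode)
        # Default: fused GPU Adam, with adam_w_mode passed through optimizer_parameters.
        optimizer_parameters['adam_w_mode'] = adam_w_mode
        return FusedAdam(model_parameters, **optimizer_parameters)
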
......@@ -603,7 +602,7 @@ class DeepSpeedEngine(Module):
**optimizer_parameters,
adamw_mode=adam_w_mode)
else:
optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
optimizer_parameters['adam_w_mode'] = adam_w_mode
optimizer = FusedAdam(model_parameters, **optimizer_parameters)
elif self.optimizer_name() == LAMB_OPTIMIZER:
......@@ -23,7 +23,12 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
return my_group
ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
ZERO_SUPPORTED_OPTIMIZERS = [
torch.optim.Adam,
torch.optim.AdamW,
FusedAdam,
DeepSpeedCPUAdam
]
# Add apex FusedAdam to supported list if apex is installed
try:
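
With torch.optim.AdamW added to ZERO_SUPPORTED_OPTIMIZERS, a client-constructed AdamW instance now passes the ZeRO compatibility check without needing zero_allow_untested_optimizer. A minimal usage sketch following the pattern of the tests below; `args` is assumed to point at a DeepSpeed config with fp16 and ZeRO enabled, and `model` is any torch.nn.Module:

    import torch
    import deepspeed

    client_optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)
    model_engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                         model=model,
                                                         optimizer=client_optimizer)
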
......@@ -35,9 +35,9 @@ def test_lamb_fp32_grad_clip(tmpdir):
@distributed_test(world_size=[1, 2])
def _test_lamb_fp32_grad_clip(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -73,9 +73,9 @@ def test_lamb_fp16_basic(tmpdir):
@distributed_test(world_size=[1, 2])
def _test_lamb_fp16_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -110,9 +110,9 @@ def test_lamb_fp16_empty_grad(tmpdir):
@distributed_test(world_size=[2])
def _test_lamb_fp16_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -147,9 +147,9 @@ def test_adam_fp32_empty_grad(tmpdir):
@distributed_test(world_size=[2])
def _test_adam_fp32_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -179,9 +179,9 @@ def test_adamw_fp16_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -210,10 +210,10 @@ def test_dict_config_adamw_fp16_basic():
@distributed_test(world_size=[1])
def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
config_params=config_dict)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -245,9 +245,9 @@ def test_adamw_fp16_empty_grad(tmpdir):
@distributed_test(world_size=[1])
def _test_adamw_fp16_empty_grad(args, model, hidden_dim):
optimizer = torch.optim.AdamW(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -270,7 +270,7 @@ def test_adamw_fp16_empty_grad(tmpdir):
True),
])
def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 1,
......@@ -311,9 +311,9 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
@distributed_test(world_size=[1])
def _test_adam_fp16_zero_onecycle_compatibility(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -338,7 +338,7 @@ def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offlo
True),
])
def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
......@@ -364,9 +364,9 @@ def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
......@@ -407,9 +407,9 @@ def test_zero_static_scale_deprecated_format(tmpdir):
def _test_zero_static_scale(args):
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
# Ensure the static scaler is configured.
assert optim.dynamic_loss_scale == False
......@@ -438,7 +438,7 @@ def test_zero_static_scale_deprecated_format(tmpdir):
True),
])
def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_batch_size": 4,
......@@ -460,10 +460,10 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
model = SimpleModel(hidden_dim, empty_grad=True)
optimizer = SimpleOptimizer(model.parameters())
with pytest.raises(AssertionError):
model, optim, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())
model, optim, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer,
model_parameters=model.parameters())
_test_zero_allow_untested_optimizer(args)
......@@ -478,7 +478,7 @@ def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload):
True),
])
def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload):
#if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# if use_cpu_offload and not deepspeed.ops.__installed_ops__['cpu-adam']:
# pytest.skip("cpu-adam is not installed")
config_dict = {
"train_micro_batch_size_per_gpu": 1,
......@@ -536,9 +536,9 @@ def test_adam_amp_basic(tmpdir):
@distributed_test(world_size=[1])
def _test_adam_amp_basic(args, model, hidden_dim):
optimizer = torch.optim.Adam(params=model.parameters())
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=optimizer)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -574,9 +574,9 @@ def test_lamb_amp_basic(tmpdir):
@distributed_test(world_size=[1, 2])
def _test_lamb_amp_basic(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -613,9 +613,9 @@ def test_adam_amp_o2(tmpdir):
@distributed_test(world_size=[1, 2])
def _test_adam_amp_o2(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -652,9 +652,9 @@ def test_adam_amp_o2_empty_grad(tmpdir):
@distributed_test(world_size=[2])
def _test_adam_amp_o2_empty_grad(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -694,8 +694,8 @@ def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_construct
def _test_zero_supported_client_optimizer(args, model, optimizer_constructor):
client_optimizer = optimizer_constructor(params=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
optimizer=client_optimizer)
model=model,
optimizer=client_optimizer)
_test_zero_supported_client_optimizer(args=args,
model=model,
......@@ -732,9 +732,9 @@ def test_zero2_reduce_scatter_off(tmpdir):
@distributed_test(world_size=[2])
def _helper(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
......@@ -745,3 +745,53 @@ def test_zero2_reduce_scatter_off(tmpdir):
model.step()
_helper(args=args, model=model, hidden_dim=hidden_dim)
@pytest.mark.parametrize('adam_type, torch_impl',
[('Adam',
True),
('Adam',
False),
('AdamW',
True),
('AdamW',
False)])
def test_fp16_adam_types(tmpdir, adam_type, torch_impl):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
"fp16": {
"enabled": True,
"initial_scale_power": 10
},
"optimizer": {
"type": adam_type,
"torch_adam": torch_impl,
"params": {
"lr": 0.00015
}
}
}
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_fp16_adam_types(args, model, hidden_dim):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
data_loader = random_dataloader(model=model,
total_samples=10,
hidden_dim=hidden_dim,
device=model.device)
for _, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim)