Unverified · Commit 6bea95e5 authored by Olatunji Ruwase, committed by GitHub

Merge branch 'master' into olruwase/legacy_optimizer_fusion

......@@ -54,7 +54,7 @@ def initialize(args,
step(), state_dict(), and load_state_dict() methods
mpu: Optional: A model parallelism unit object that implements
get_model/data_parallel_group/rank/size()
get_{model,data}_parallel_{rank,group,world_size}()
dist_init_required: Optional: Initializes torch.distributed
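For context, a hedged sketch of how these two arguments might be passed by a caller that manages model parallelism and torch.distributed itself (`args`, `net`, and `mpu` are assumed to already exist; `mpu` stands in for something like Megatron-LM's mpu module and must expose the renamed `get_{model,data}_parallel_{rank,group,world_size}()` methods):

```python
import deepspeed

# Sketch only: args, net, and mpu are defined by the calling script.
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=net.parameters(),
    mpu=mpu,                    # model parallelism unit (e.g. Megatron-LM's mpu)
    dist_init_required=False)   # torch.distributed already initialized by the caller
```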
......@@ -128,6 +128,10 @@ def _add_core_arguments(parser):
type=str,
help='DeepSpeed json configuration file.')
group.add_argument('--deepscale_config',
default=None,
type=str,
help='Deprecated DeepSpeed json configuration file.')
return parser
......
......@@ -325,6 +325,14 @@ class DeepSpeedLight(Module):
# Validate command line arguments
def _do_args_sanity_check(self, args):
if hasattr(args, 'deepscale_config') and args.deepscale_config is not None:
logging.warning(
"************ --deepscale_config is deprecated, please use --deepspeed_config ************"
)
if hasattr(args, 'deepspeed_config'):
assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
args.deepspeed_config = args.deepscale_config
assert hasattr(args, 'local_rank') and type(args.local_rank) == int, \
'DeepSpeed requires integer command line parameter --local_rank'
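To illustrate the deprecation fallback above (a standalone sketch; the namespace is built by hand the same way the unit tests further down do, and the config filename is made up):

```python
import argparse

args = argparse.Namespace()
args.deepscale_config = 'ds_config.json'   # deprecated flag, still accepted
args.deepspeed_config = None
args.local_rank = 0

# _do_args_sanity_check(args) logs the deprecation warning and copies the value,
# so afterwards args.deepspeed_config == 'ds_config.json'.
# Passing a non-None value for both flags trips the assertion instead.
```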
......
......@@ -15,6 +15,7 @@ import collections
from copy import deepcopy
DLTS_HOSTFILE = "/job/hostfile"
EXPORT_ENVS = ["NCCL", "PYTHONPATH"]
def parse_args(args=None):
......@@ -305,13 +306,18 @@ def main(args=None):
num_gpus_per_node = None
curr_path = os.path.abspath('.')
if 'PYTHONPATH' in env:
env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
else:
env['PYTHONPATH'] = curr_path
nccl_export = ""
for nccl_var in filter(lambda x: "NCCL_" in x, env.keys()):
nccl_export += "export {}={}; ".format(nccl_var, env[nccl_var])
exports = ""
for var in env.keys():
if any(map(lambda name: name in var, EXPORT_ENVS)):
exports += "export {}={}; ".format(var, env[var])
deepspeed_launch = [
nccl_export,
exports,
"cd {};".format(curr_path),
sys.executable,
"-u",
......
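As a rough illustration of the substring match used in the launcher change above (made-up environment values; any variable whose name contains an EXPORT_ENVS entry is forwarded to the launched processes):

```python
EXPORT_ENVS = ["NCCL", "PYTHONPATH"]
env = {
    "NCCL_DEBUG": "INFO",        # matches "NCCL"
    "NCCL_IB_DISABLE": "0",      # matches "NCCL"
    "PYTHONPATH": "/workspace",  # matches "PYTHONPATH"
    "PATH": "/usr/bin",          # matches nothing, so it is not exported
}

exports = ""
for var in env.keys():
    if any(map(lambda name: name in var, EXPORT_ENVS)):
        exports += "export {}={}; ".format(var, env[var])

# exports == "export NCCL_DEBUG=INFO; export NCCL_IB_DISABLE=0; export PYTHONPATH=/workspace; "
```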
......@@ -68,10 +68,11 @@ mpu.get_model_parallel_rank()
mpu.get_model_parallel_group()
mpu.get_model_parallel_world_size()
mpu.get_data_parallel_rank/group/world_size()
mpu.get_data_parallel_rank()
mpu.get_data_parallel_group()
mpu.get_data_parallel_world_size()
```
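For illustration, a minimal stand-in implementation of this interface (a hypothetical class, not part of DeepSpeed or Megatron-LM) could look like the sketch below; real model-parallel frameworks partition ranks into genuine model-parallel groups instead.

```python
import torch.distributed as dist

class TrivialMPU:
    """Hypothetical mpu with no real model parallelism: every rank is its own
    model-parallel group and all ranks form one data-parallel group."""
    def __init__(self):
        # new_group() is collective, so every rank must create every group.
        self._mp_groups = [dist.new_group([r]) for r in range(dist.get_world_size())]

    def get_model_parallel_rank(self):
        return 0

    def get_model_parallel_group(self):
        return self._mp_groups[dist.get_rank()]

    def get_model_parallel_world_size(self):
        return 1

    def get_data_parallel_rank(self):
        return dist.get_rank()

    def get_data_parallel_group(self):
        return dist.group.WORLD

    def get_data_parallel_world_size(self):
        return dist.get_world_size()
```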
### Integration with Megatron-LM
DeepSpeed is fully compatible with [Megatron](https://github.com/NVIDIA/Megatron-LM).
Please see the [Megatron-LM tutorial](tutorials/MegatronGPT2Tutorial.md) for details.
......
......@@ -109,16 +109,11 @@ if [ "$third_party_install" == "1" ]; then
sudo -H pip install third_party/apex/dist/apex*.whl
fi
if [ "$deepspeed_install" == "1" ]; then
echo "Installing deepspeed"
echo "Building deepspeed wheel"
python setup.py bdist_wheel
fi
if [ "$local_only" == "1" ]; then
if [ "$third_party_install" == "1" ]; then
echo "Installing apex locally"
sudo -H pip uninstall -y apex
sudo -H pip install third_party/apex/dist/apex*.whl
fi
if [ "$deepspeed_install" == "1" ]; then
echo "Installing deepspeed"
sudo -H pip uninstall -y deepspeed
......
import os
import json
import argparse
import torch
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
if empty_grad:
self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
hidden_dim = x
hidden_dim = self.linear(hidden_dim)
return self.cross_entropy_loss(hidden_dim, y)
def random_dataloader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
return train_loader
def create_config_from_dict(tmpdir, config_dict):
config_path = os.path.join(tmpdir, 'temp_config.json')
with open(config_path, 'w') as fd:
json.dump(config_dict, fd)
return config_path
def args_from_dict(tmpdir, config_dict):
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepspeed = True
args.deepspeed_config = config_path
args.local_rank = 0
return args
# A test on its own
import torch
import pytest
import json
import argparse
from common import distributed_test
from simple_model import SimpleModel, create_config_from_dict, random_dataloader
import torch.distributed as dist
# A test on its own
......@@ -100,3 +103,54 @@ def test_batch_config(num_ranks, batch, micro_batch, gas, success):
"""Run batch config test """
_test_batch_config(num_ranks, batch, micro_batch, gas, success)
def test_temp_config_json(tmpdir):
config_dict = {
"train_batch_size": 1,
}
config_path = create_config_from_dict(tmpdir, config_dict)
config_json = json.load(open(config_path, 'r'))
assert 'train_batch_size' in config_json
def test_deprecated_deepscale_config(tmpdir):
config_dict = {
"train_batch_size": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"fp16": {
"enabled": True
}
}
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepscale_config = config_path
args.local_rank = 0
hidden_dim = 10
model = SimpleModel(hidden_dim)
@distributed_test(world_size=[1])
def _test_deprecated_deepscale_config(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = random_dataloader(model=model,
total_samples=5,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
......@@ -5,67 +5,7 @@ import pytest
import json
import os
from common import distributed_test
def create_config_from_dict(tmpdir, config_dict):
config_path = os.path.join(tmpdir, 'temp_config.json')
with open(config_path, 'w') as fd:
json.dump(config_dict, fd)
return config_path
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
if empty_grad:
self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
hidden_dim = x
hidden_dim = self.linear(hidden_dim)
return self.cross_entropy_loss(hidden_dim, y)
def test_temp_config_json(tmpdir):
config_dict = {
"train_batch_size": 1,
}
config_path = create_config_from_dict(tmpdir, config_dict)
config_json = json.load(open(config_path, 'r'))
assert 'train_batch_size' in config_json
def prepare_optimizer_parameters(model):
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{
'params': [p for n,
p in param_optimizer],
'weight_decay': 0.0
}]
return optimizer_grouped_parameters
def get_data_loader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
return train_loader
def get_args(tmpdir, config_dict):
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepspeed = True
args.deepspeed_config = config_path
args.local_rank = 0
return args
from simple_model import SimpleModel, random_dataloader, args_from_dict
def test_lamb_fp16_basic(tmpdir):
......@@ -83,7 +23,7 @@ def test_lamb_fp16_basic(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
......@@ -94,10 +34,10 @@ def test_lamb_fp16_basic(tmpdir):
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -121,7 +61,7 @@ def test_lamb_fp16_empty_grad(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
......@@ -132,10 +72,10 @@ def test_lamb_fp16_empty_grad(tmpdir):
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -152,7 +92,7 @@ def test_adamw_fp16_basic(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
......@@ -164,10 +104,10 @@ def test_adamw_fp16_basic(tmpdir):
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -184,7 +124,7 @@ def test_adamw_fp16_empty_grad(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
......@@ -196,10 +136,10 @@ def test_adamw_fp16_empty_grad(tmpdir):
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......