Unverified · Commit 6bea95e5 authored by Olatunji Ruwase, committed by GitHub

Merge branch 'master' into olruwase/legacy_optimizer_fusion

......@@ -54,7 +54,7 @@ def initialize(args,
step(), state_dict(), and load_state_dict() methods
mpu: Optional: A model parallelism unit object that implements
get_model/data_parallel_group/rank/size()
get_{model,data}_parallel_{rank,group,world_size}()
dist_init_required: Optional: Initializes torch.distributed
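For context, a hedged sketch of how these two arguments might be passed by a caller that manages model parallelism and torch.distributed itself (`args`, `net`, and `mpu` are assumed to already exist; `mpu` stands in for something like Megatron-LM's mpu module and must expose the renamed `get_{model,data}_parallel_{rank,group,world_size}()` methods):

```python
import deepspeed

# Sketch only: args, net, and mpu are defined by the calling script.
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=net.parameters(),
    mpu=mpu,                    # model parallelism unit (e.g. Megatron-LM's mpu)
    dist_init_required=False)   # torch.distributed already initialized by the caller
```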
......@@ -128,6 +128,10 @@ def _add_core_arguments(parser):
type=str,
help='DeepSpeed json configuration file.')
group.add_argument('--deepscale_config',
default=None,
type=str,
help='Deprecated DeepSpeed json configuration file.')
return parser
......
......@@ -325,6 +325,14 @@ class DeepSpeedLight(Module):
# Validate command line arguments
def _do_args_sanity_check(self, args):
if hasattr(args, 'deepscale_config') and args.deepscale_config is not None:
logging.warning(
"************ --deepscale_config is deprecated, please use --deepspeed_config ************"
)
if hasattr(args, 'deepspeed_config'):
assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
args.deepspeed_config = args.deepscale_config
assert hasattr(args, 'local_rank') and type(args.local_rank) == int, \
'DeepSpeed requires integer command line parameter --local_rank'
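To illustrate the deprecation fallback above (a standalone sketch; the namespace is built by hand the same way the unit tests further down do, and the config filename is made up):

```python
import argparse

args = argparse.Namespace()
args.deepscale_config = 'ds_config.json'   # deprecated flag, still accepted
args.deepspeed_config = None
args.local_rank = 0

# _do_args_sanity_check(args) logs the deprecation warning and copies the value,
# so afterwards args.deepspeed_config == 'ds_config.json'.
# Passing a non-None value for both flags trips the assertion instead.
```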
......
......@@ -15,6 +15,7 @@ import collections
from copy import deepcopy
DLTS_HOSTFILE = "/job/hostfile"
EXPORT_ENVS = ["NCCL", "PYTHONPATH"]
def parse_args(args=None):
......@@ -305,13 +306,18 @@ def main(args=None):
num_gpus_per_node = None
curr_path = os.path.abspath('.')
if 'PYTHONPATH' in env:
env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
else:
env['PYTHONPATH'] = curr_path
nccl_export = ""
for nccl_var in filter(lambda x: "NCCL_" in x, env.keys()):
nccl_export += "export {}={}; ".format(nccl_var, env[nccl_var])
exports = ""
for var in env.keys():
if any(map(lambda name: name in var, EXPORT_ENVS)):
exports += "export {}={}; ".format(var, env[var])
deepspeed_launch = [
nccl_export,
exports,
"cd {};".format(curr_path),
sys.executable,
"-u",
......
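As a rough illustration of the substring match used in the launcher change above (made-up environment values; any variable whose name contains an EXPORT_ENVS entry is forwarded to the launched processes):

```python
EXPORT_ENVS = ["NCCL", "PYTHONPATH"]
env = {
    "NCCL_DEBUG": "INFO",        # matches "NCCL"
    "NCCL_IB_DISABLE": "0",      # matches "NCCL"
    "PYTHONPATH": "/workspace",  # matches "PYTHONPATH"
    "PATH": "/usr/bin",          # matches nothing, so it is not exported
}

exports = ""
for var in env.keys():
    if any(map(lambda name: name in var, EXPORT_ENVS)):
        exports += "export {}={}; ".format(var, env[var])

# exports == "export NCCL_DEBUG=INFO; export NCCL_IB_DISABLE=0; export PYTHONPATH=/workspace; "
```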
......@@ -68,10 +68,11 @@ mpu.get_model_parallel_rank()
mpu.get_model_parallel_group()
mpu.get_model_parallel_world_size()
mpu.get_data_parallel_rank/group/world_size()
mpu.get_data_parallel_rank()
mpu.get_data_parallel_group()
mpu.get_data_parallel_world_size()
```
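For illustration, a minimal stand-in implementation of this interface (a hypothetical class, not part of DeepSpeed or Megatron-LM) could look like the sketch below; real model-parallel frameworks partition ranks into genuine model-parallel groups instead.

```python
import torch.distributed as dist

class TrivialMPU:
    """Hypothetical mpu with no real model parallelism: every rank is its own
    model-parallel group and all ranks form one data-parallel group."""
    def __init__(self):
        # new_group() is collective, so every rank must create every group.
        self._mp_groups = [dist.new_group([r]) for r in range(dist.get_world_size())]

    def get_model_parallel_rank(self):
        return 0

    def get_model_parallel_group(self):
        return self._mp_groups[dist.get_rank()]

    def get_model_parallel_world_size(self):
        return 1

    def get_data_parallel_rank(self):
        return dist.get_rank()

    def get_data_parallel_group(self):
        return dist.group.WORLD

    def get_data_parallel_world_size(self):
        return dist.get_world_size()
```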
### Integration with Megatron-LM
DeepSpeed is fully compatible with [Megatron](https://github.com/NVIDIA/Megatron-LM).
Please see the [Megatron-LM tutorial](tutorials/MegatronGPT2Tutorial.md) for details.
......
......@@ -109,16 +109,11 @@ if [ "$third_party_install" == "1" ]; then
sudo -H pip install third_party/apex/dist/apex*.whl
fi
if [ "$deepspeed_install" == "1" ]; then
echo "Installing deepspeed"
echo "Building deepspeed wheel"
python setup.py bdist_wheel
fi
if [ "$local_only" == "1" ]; then
if [ "$third_party_install" == "1" ]; then
echo "Installing apex locally"
sudo -H pip uninstall -y apex
sudo -H pip install third_party/apex/dist/apex*.whl
fi
if [ "$deepspeed_install" == "1" ]; then
echo "Installing deepspeed"
sudo -H pip uninstall -y deepspeed
......
import os
import json
import argparse
import torch
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
if empty_grad:
self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
hidden_dim = x
hidden_dim = self.linear(hidden_dim)
return self.cross_entropy_loss(hidden_dim, y)
def random_dataloader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
return train_loader
def create_config_from_dict(tmpdir, config_dict):
config_path = os.path.join(tmpdir, 'temp_config.json')
with open(config_path, 'w') as fd:
json.dump(config_dict, fd)
return config_path
def args_from_dict(tmpdir, config_dict):
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepspeed = True
args.deepspeed_config = config_path
args.local_rank = 0
return args
# A test on its own
import torch
import pytest
import json
import argparse
from common import distributed_test
from simple_model import SimpleModel, create_config_from_dict, random_dataloader
import torch.distributed as dist
# A test on its own
......@@ -100,3 +103,54 @@ def test_batch_config(num_ranks, batch, micro_batch, gas, success):
"""Run batch config test """
_test_batch_config(num_ranks, batch, micro_batch, gas, success)
def test_temp_config_json(tmpdir):
config_dict = {
"train_batch_size": 1,
}
config_path = create_config_from_dict(tmpdir, config_dict)
config_json = json.load(open(config_path, 'r'))
assert 'train_batch_size' in config_json
def test_deprecated_deepscale_config(tmpdir):
config_dict = {
"train_batch_size": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"fp16": {
"enabled": True
}
}
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepscale_config = config_path
args.local_rank = 0
hidden_dim = 10
model = SimpleModel(hidden_dim)
@distributed_test(world_size=[1])
def _test_deprecated_deepscale_config(args, model, hidden_dim):
model, _, _,_ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = random_dataloader(model=model,
total_samples=5,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
model.step()
_test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
......@@ -5,67 +5,7 @@ import pytest
import json
import os
from common import distributed_test
def create_config_from_dict(tmpdir, config_dict):
config_path = os.path.join(tmpdir, 'temp_config.json')
with open(config_path, 'w') as fd:
json.dump(config_dict, fd)
return config_path
class SimpleModel(torch.nn.Module):
def __init__(self, hidden_dim, empty_grad=False):
super(SimpleModel, self).__init__()
self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
if empty_grad:
self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
def forward(self, x, y):
hidden_dim = x
hidden_dim = self.linear(hidden_dim)
return self.cross_entropy_loss(hidden_dim, y)
def test_temp_config_json(tmpdir):
config_dict = {
"train_batch_size": 1,
}
config_path = create_config_from_dict(tmpdir, config_dict)
config_json = json.load(open(config_path, 'r'))
assert 'train_batch_size' in config_json
def prepare_optimizer_parameters(model):
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{
'params': [p for n,
p in param_optimizer],
'weight_decay': 0.0
}]
return optimizer_grouped_parameters
def get_data_loader(model, total_samples, hidden_dim, device):
batch_size = model.train_micro_batch_size_per_gpu()
train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
train_label = torch.empty(total_samples,
dtype=torch.long,
device=device).random_(hidden_dim)
train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
return train_loader
def get_args(tmpdir, config_dict):
config_path = create_config_from_dict(tmpdir, config_dict)
parser = argparse.ArgumentParser()
args = parser.parse_args(args='')
args.deepspeed = True
args.deepspeed_config = config_path
args.local_rank = 0
return args
from simple_model import SimpleModel, random_dataloader, args_from_dict
def test_lamb_fp16_basic(tmpdir):
......@@ -83,7 +23,7 @@ def test_lamb_fp16_basic(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
......@@ -94,10 +34,10 @@ def test_lamb_fp16_basic(tmpdir):
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -121,7 +61,7 @@ def test_lamb_fp16_empty_grad(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
......@@ -132,10 +72,10 @@ def test_lamb_fp16_empty_grad(tmpdir):
model=model,
model_parameters=model.parameters(),
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -152,7 +92,7 @@ def test_adamw_fp16_basic(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=False)
......@@ -164,10 +104,10 @@ def test_adamw_fp16_basic(tmpdir):
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......@@ -184,7 +124,7 @@ def test_adamw_fp16_empty_grad(tmpdir):
"enabled": True
}
}
args = get_args(tmpdir, config_dict)
args = args_from_dict(tmpdir, config_dict)
hidden_dim = 10
model = SimpleModel(hidden_dim, empty_grad=True)
......@@ -196,10 +136,10 @@ def test_adamw_fp16_empty_grad(tmpdir):
model=model,
optimizer=optimizer,
dist_init_required=False)
data_loader = get_data_loader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
data_loader = random_dataloader(model=model,
total_samples=50,
hidden_dim=hidden_dim,
device=model.device)
for n, batch in enumerate(data_loader):
loss = model(batch[0], batch[1])
model.backward(loss)
......