PLD release (#513)

* Progressive layer dropping docs (#499) * test * Adding tutorial and news page for pld * updating the tutorial and posts of PLD * update the finetune tutorial * Update PLD tutorial (#512) * Update installation instructions * Format fix * ZeRO tutorial * Format fixes * ZeRO-Offload * ZeRO and ZeRO-Offload tutorials * Update navigation page * Format fixes * Add yuxhe feedback * Fix blog post link * Fix OneBit-Adam link Tweak scheduler example * Fix date link * Add DeepSpeed_Adam * Add PLD tutorial to navigation Co-authored-by: N Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: N Jeff Rasley <jerasley@microsoft.com> * updating the pld docs * DeepSpeed implementation of PLD (#508) * DeepSpeed implementation of PLD * Format fixes * Formatting fixes * Fix broken url * Address PR feedback * Bump DSE Co-authored-by: N Minjia Zhang <33713995+minjiaz@users.noreply.github.com> Co-authored-by: N Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: N Jeff Rasley <jerasley@microsoft.com> Co-authored-by: N Minjia Zhang <minjiaz@microsoft.com>

PLD release (#513)
* Progressive layer dropping docs (#499) * test * Adding tutorial and news page for pld * updating the tutorial and posts of PLD * update the finetune tutorial * Update PLD tutorial (#512) * Update installation instructions * Format fix * ZeRO tutorial * Format fixes * ZeRO-Offload * ZeRO and ZeRO-Offload tutorials * Update navigation page * Format fixes * Add yuxhe feedback * Fix blog post link * Fix OneBit-Adam link Tweak scheduler example * Fix date link * Add DeepSpeed_Adam * Add PLD tutorial to navigation Co-authored-by: N Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: N Jeff Rasley <jerasley@microsoft.com> * updating the pld docs * DeepSpeed implementation of PLD (#508) * DeepSpeed implementation of PLD * Format fixes * Formatting fixes * Fix broken url * Address PR feedback * Bump DSE Co-authored-by: N Minjia Zhang <33713995+minjiaz@users.noreply.github.com> Co-authored-by: N Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: N Jeff Rasley <jerasley@microsoft.com> Co-authored-by: N Minjia Zhang <minjiaz@microsoft.com>
be1147c0 · Olatunji Ruwase · GitHub · e082d475 · fa1d1a71 · a79272cc
11 changed file
--- a/DeepSpeedExamples @ fa1d1a71
+++ b/DeepSpeedExamples @ fa1d1a71
-Subproject commit a79272cc8b8f0c5b66c803e581a1355341eacb77
+Subproject commit fa1d1a71c48623db8a091d9cf636a5fe3b8f43c7
--- a/README.md
+++ b/README.md
@@ -155,8 +155,7 @@ all repos using our CLA.
 This project has adopted the [Microsoft Open Source Code of
 Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the
 [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact
-[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or
-comments.
+[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

 # Publications
 1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. [ArXiv:1910.02054](https://arxiv.org/abs/1910.02054)
--- a/deepspeed/runtime/config.py
+++ b/deepspeed/runtime/config.py
@@ -30,6 +30,24 @@ TORCH_ADAM_PARAM = "torch_adam"
 ADAM_W_MODE_PARAM = "adam_w_mode"


+def get_pld_enabled(param_dict):
+    if PROGRESSIVE_LAYER_DROP in param_dict.keys():
+        return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP],
+                                PLD_ENABLED,
+                                PLD_ENABLED_DEFAULT)
+    else:
+        return False
+
+
+def get_pld_params(param_dict):
+    if PROGRESSIVE_LAYER_DROP in param_dict.keys():
+        pld_params = copy.copy(param_dict[PROGRESSIVE_LAYER_DROP])
+        pld_params.pop(PLD_ENABLED)
+        return pld_params
+    else:
+        return False
+
+
 def get_amp_enabled(param_dict):
    if AMP in param_dict.keys():
        return get_scalar_param(param_dict[AMP], AMP_ENABLED, AMP_ENABLED_DEFAULT)
@@ -542,6 +560,9 @@ class DeepSpeedConfig(object):
        self.sparse_attention = get_sparse_attention(param_dict)
        self.pipeline = get_pipeline_config(param_dict)

+        self.pld_enabled = get_pld_enabled(param_dict)
+        self.pld_params = get_pld_params(param_dict)
+
    def _batch_assertion(self):

        train_batch = self.train_batch_size

--- a/deepspeed/runtime/constants.py
+++ b/deepspeed/runtime/constants.py
@@ -291,3 +291,16 @@ TENSORBOARD_OUTPUT_PATH_DEFAULT = ""
 # Tensorboard job name
 TENSORBOARD_JOB_NAME = "job_name"
 TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName"
+
+# Progressive Layer Drop (PLD)
+PROGRESSIVE_LAYER_DROP = "progressive_layer_drop"
+
+# PLD enable signal
+PLD_ENABLED = "enabled"
+PLD_ENABLED_DEFAULT = False
+
+PLD_THETA = "theta"
+PLD_THETA_DEFAULT = 1.0
+
+PLD_GAMMA = "gamma"
+PLD_GAMMA_DEFAULT = 0.001
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -26,13 +26,14 @@ from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
 from deepspeed.runtime.dataloader import DeepSpeedDataLoader
 from deepspeed.runtime.constants import \
    ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \
-    TORCH_DISTRIBUTED_DEFAULT_PORT
+    TORCH_DISTRIBUTED_DEFAULT_PORT, PLD_THETA, PLD_GAMMA
 from deepspeed.runtime.zero.constants import \
    ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS
 from deepspeed.runtime.csr_tensor import CSRTensor
 import deepspeed.runtime.lr_schedules as lr_schedules
 from deepspeed.utils import logger, log_dist
 from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
+from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop

 from .utils import ensure_directory_exists

@@ -127,6 +128,7 @@ class DeepSpeedEngine(Module):
        self.loaded_checkpoint_mp_world_size = None
        self.loaded_checkpoint_dp_world_size = None
        self.enable_backward_allreduce = True
+        self.progressive_layer_drop = None

        if dist_init_required is None:
            dist_init_required = not dist.is_initialized()
@@ -192,10 +194,13 @@ class DeepSpeedEngine(Module):
        self.save_zero_checkpoint = False
        self._configure_checkpointing(dist_init_required)

+        if self.pld_enabled():
+            self.progressive_layer_drop = self._configure_progressive_layer_drop()
+
        if self.global_rank == 0:
-            self._config.print('DeepSpeedLight configuration')
+            self._config.print('DeepSpeedEngine configuration')
            if self.dump_state():
-                print_configuration(self, 'DeepSpeedLight')
+                print_configuration(self, 'DeepSpeedEngine')

    def _mpi_check(self, args, dist_init_required):
        if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
@@ -236,6 +241,18 @@ class DeepSpeedEngine(Module):
                assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format(
                    world_size, dist.get_world_size())

+    def pld_enabled(self):
+        return self._config.pld_enabled
+
+    def pld_params(self):
+        return self._config.pld_params
+
+    def pld_theta(self):
+        return self.pld_params()[PLD_THETA]
+
+    def pld_gamma(self):
+        return self.pld_params()[PLD_GAMMA]
+
    def tensorboard_enabled(self):
        return self._config.tensorboard_enabled

@@ -666,6 +683,11 @@ class DeepSpeedEngine(Module):

        return optimizer

+    def _configure_progressive_layer_drop(self):
+        pld = ProgressiveLayerDrop(theta=self.pld_theta(), gamma=self.pld_gamma())
+
+        return pld
+
    def deepspeed_io(self,
                     dataset,
                     batch_size=None,
@@ -751,6 +773,9 @@ class DeepSpeedEngine(Module):
            **kwargs: variable length keyword arguments
        """

+        if self.module.training and self.progressive_layer_drop:
+            kwargs.update(self.progressive_layer_drop.get_state())
+
        if self.wall_clock_breakdown():
            self.timers('forward_microstep').start()
            self.timers('forward').start()
@@ -931,6 +956,9 @@ class DeepSpeedEngine(Module):

        # Update the model when we reach gradient accumulation boundaries
        if self.is_gradient_accumulation_boundary():
+            if self.progressive_layer_drop:
+                self.progressive_layer_drop.update_state(self.global_steps)
+
            self._take_model_step()

        self.tput_timer.stop(report_progress)
@@ -1024,6 +1052,12 @@ class DeepSpeedEngine(Module):
        else:
            return self._get_optimizer_param('betas')

+    def get_pld_theta(self):
+        if self.progressive_layer_drop:
+            return self.progressive_layer_drop.get_theta()
+        else:
+            return None
+
    def _report_progress(self, step):
        lr = self.get_lr()
        mom = self.get_mom()

--- a/deepspeed/runtime/progressive_layer_drop.py
+++ b/deepspeed/runtime/progressive_layer_drop.py
+import numpy as np
+from deepspeed.utils import log_dist
+
+
+class ProgressiveLayerDrop(object):
+    r""" Progressive Layer Dropping (PLD) for model training.
+        This implements the PLD technique for compressed model training
+        from this paper: https://arxiv.org/pdf/2010.13369.pdf
+    Args:
+        theta (float): a hyper-parameter that controls the trade-off between training time and robustness.
+        The lower the theta value, the faster the training speed. Default value: 0.5.
+        gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001.
+    """
+    def __init__(self, theta=0.5, gamma=0.001):
+        super().__init__()
+
+        self.theta = theta
+        self.gamma = gamma
+        self.current_theta = 1.0
+        log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0])
+
+    def get_state(self):
+        kwargs = {'progressive_layer_drop': True, 'pld_theta': self.get_theta()}
+        return kwargs
+
+    def get_theta(self):
+        return self.current_theta
+
+    def update_state(self, global_step):
+        def _prob(x, gamma, p):
+            return (1. - p) * np.exp(-gamma * x) + p
+
+        self.current_theta = _prob(global_step, self.gamma, self.theta)
--- a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md
+++ b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md
--- a/docs/_tutorials/progressive_layer_dropping.md
+++ b/docs/_tutorials/progressive_layer_dropping.md
--- a/docs/index.md
+++ b/docs/index.md
--- a/tests/unit/simple_model.py
+++ b/tests/unit/simple_model.py
@@ -101,6 +101,17 @@ class SimpleOptimizer(torch.optim.Optimizer):
        return loss


+class PLD_SimpleModel(SimpleModel):
+    def __init__(self, hidden_dim, empty_grad=False, rank=0):
+        super(PLD_SimpleModel, self).__init__(hidden_dim, empty_grad, rank)
+
+    def forward(self, x, y, **kwargs):
+        pld = kwargs.get('progressive_layer_drop', False)
+        theta = kwargs.get('pld_theta', 1.0)
+        hidden_dim = super(PLD_SimpleModel, self).forward(x, y)
+        return hidden_dim
+
+
 def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half):
    batch_size = model.train_micro_batch_size_per_gpu()
    train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype)

--- a/tests/unit/test_pld.py
+++ b/tests/unit/test_pld.py
+import numpy as np
+import deepspeed
+import pytest
+from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
+from common import distributed_test
+from simple_model import SimpleModel, PLD_SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
+
+
+@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
+def test_pld_schedule(tmpdir, theta):
+    gamma = 0.001
+
+    pld_scheduler = ProgressiveLayerDrop(theta, gamma)
+    for i in range(10):
+        pld_scheduler.update_state(i)
+        expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
+        actual_theta = pld_scheduler.get_theta()
+        assert expected_theta == actual_theta
+
+
+@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0])
+def test_pld_model(tmpdir, theta):
+    gamma = 0.001
+    config_dict = {
+        "train_batch_size": 1,
+        "steps_per_print": 1,
+        "optimizer": {
+            "type": 'Adam',
+            "params": {
+                "lr": 0.0001
+            }
+        },
+        "fp16": {
+            "enabled": True
+        },
+        "progressive_layer_drop": {
+            "enabled": True,
+            "theta": theta,
+            "gamma": gamma
+        }
+    }
+
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = PLD_SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_pld_model(args, model, hidden_dim, theta, gamma):
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
+
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
+
+        for i, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+
+            expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
+            actual_theta = model.get_pld_theta()
+            assert expected_theta == actual_theta
+
+    _test_pld_model(args=args,
+                    model=model,
+                    hidden_dim=hidden_dim,
+                    theta=theta,
+                    gamma=gamma)
+
+
+def test_non_pld_model(tmpdir):
+    gamma = 0.001
+    theta = 0.5
+    config_dict = {
+        "train_batch_size": 1,
+        "steps_per_print": 1,
+        "optimizer": {
+            "type": 'Adam',
+            "params": {
+                "lr": 0.0001
+            }
+        },
+        "fp16": {
+            "enabled": True
+        },
+        "progressive_layer_drop": {
+            "enabled": True,
+            "theta": theta,
+            "gamma": gamma
+        }
+    }
+
+    args = args_from_dict(tmpdir, config_dict)
+    hidden_dim = 10
+
+    model = SimpleModel(hidden_dim, empty_grad=False)
+
+    @distributed_test(world_size=[1])
+    def _test_non_pld_model(args, model, hidden_dim):
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters())
+
+        data_loader = random_dataloader(model=model,
+                                        total_samples=1,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
+
+        for i, batch in enumerate(data_loader):
+            with pytest.raises(TypeError):
+                loss = model(batch[0], batch[1])
+
+    _test_non_pld_model(args=args, model=model, hidden_dim=hidden_dim)