From c82756cd1571acde62916554446c5a6b324b5f20 Mon Sep 17 00:00:00 2001
From: Shaden Smith
Date: Thu, 10 Sep 2020 15:44:47 -0700
Subject: [PATCH] readthedocs upgrade (#402)

---
 .gitignore                                    |   1 +
 deepspeed/runtime/engine.py                   |   7 +
 deepspeed/runtime/pipe/engine.py              |  67 ++++++---
 deepspeed/runtime/pipe/module.py              |  63 ++++----
 docs/code-docs/source/conf.py                 |   2 +-
 docs/code-docs/source/deepspeed.pt.rst        | 134 ------------------
 docs/code-docs/source/deepspeed.rst           |  34 +----
 docs/code-docs/source/index.rst               |   1 -
 docs/code-docs/source/model-checkpointing.rst |   4 +-
 docs/code-docs/source/modules.rst             |   7 -
 docs/code-docs/source/pipeline-extending.rst  |   5 -
 docs/code-docs/source/pipeline.rst            |  18 ++-
 docs/code-docs/source/training.rst            |  15 +-
 13 files changed, 125 insertions(+), 233 deletions(-)
 delete mode 100644 docs/code-docs/source/deepspeed.pt.rst
 delete mode 100644 docs/code-docs/source/modules.rst
 delete mode 100644 docs/code-docs/source/pipeline-extending.rst

diff --git a/.gitignore b/.gitignore
index 8bf6f1d2..8a383468 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ deepspeed.egg-info/
 # Website
 docs/_site/
 docs/build
+docs/code-docs/source/_build
 docs/code-docs/_build
 docs/code-docs/build
 .sass-cache/
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 8b293e86..0b24595b 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -841,6 +841,13 @@ class DeepSpeedEngine(Module):
         return loss
 
     def is_gradient_accumulation_boundary(self):
+        """Query whether the current micro-batch is at the boundary of
+        gradient accumulation, and thus whether it will trigger gradient
+        reductions and an optimizer step.
+
+        Returns:
+            bool: True if the current step is a gradient accumulation boundary.
+        """
         return (self.micro_steps + 1) % \
             self.gradient_accumulation_steps() == 0
 
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 27749d48..fc6cf8cc 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -43,10 +43,10 @@ def _tensor_bytes(tensor):
 
 
 class PipelineEngine(DeepSpeedEngine):
-    """ A model wrapper for pipeline-parallel execution.
+    """ A training engine for hybrid pipeline, data, and model parallel training.
 
-    Parallelism is achieved by executing micro-batches in a pipelined fashion with
-    gradient accumulation.
+    This engine is created by ``deepspeed.initialize()`` when a :class:`PipelineModule`
+    is provided.
     """
     def __init__(self, *super_args, **super_kwargs):
         super().__init__(*super_args, **super_kwargs)
@@ -227,10 +227,28 @@ class PipelineEngine(DeepSpeedEngine):
         self.num_pipe_buffers = num_buffers
 
     def train_batch(self, data_iter=None):
-        """Progress the pipeline to train the next batch of data.
+        """Progress the pipeline to train the next batch of data. The engine will ingest
+        ``self.train_batch_size()`` total samples collectively across all workers.
+
+        An iterator over training data should be provided as an argument
+        unless ``deepspeed.initialize()`` was provided a training set. In that event,
+        the training data will automatically be read.
+
+        .. warning::
+            A total of ``self.gradient_accumulation_steps()`` entries will be pulled
+            from ``data_iter`` by each pipeline. There must be sufficient
+            data left in ``data_iter`` or else a ``StopIteration`` will halt training.
+
+            DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader`
+            that wraps data loaders to automatically restart upon a ``StopIteration``.
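+
+        For example, a minimal sketch of a pipeline training loop (assuming
+        ``engine`` was returned by ``deepspeed.initialize()`` with a
+        :class:`PipelineModule`, and ``loader`` is a standard PyTorch data loader):
+
+        .. code-block:: python
+
+            from deepspeed.utils import RepeatingLoader
+
+            # Wrap the loader so exhausted epochs restart instead of raising
+            # StopIteration partway through a batch of micro-batches.
+            data_iter = iter(RepeatingLoader(loader))
+            for step in range(1000):
+                loss = engine.train_batch(data_iter=data_iter)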
+
+        Args:
+            data_iter (Iterator, optional): Iterator of training data.
 
         Returns:
-            The arithmetic mean of the losses over all micro-batches.
+            The arithmetic mean of the losses computed over this batch.
         """
         if not torch._C.is_grad_enabled():
             raise RuntimeError(
@@ -286,7 +304,9 @@ class PipelineEngine(DeepSpeedEngine):
         return self.agg_train_loss
 
     def eval_batch(self, data_iter):
-        """Evaluate the pipeline on a batch of data from ``data_iter``.
+        """Evaluate the pipeline on a batch of data from ``data_iter``. The
+        engine will evaluate ``self.train_batch_size()`` total samples
+        collectively across all workers.
 
         This method is equivalent to:
 
@@ -296,9 +316,21 @@ class PipelineEngine(DeepSpeedEngine):
             with torch.no_grad():
                 output = module(batch)
 
+        .. warning::
+            A total of ``self.gradient_accumulation_steps()`` entries will be pulled
+            from ``data_iter`` by each pipeline. There must be sufficient
+            data left in ``data_iter`` or else a ``StopIteration`` will halt evaluation.
+
+            DeepSpeed provides a convenience class :class:`deepspeed.utils.RepeatingLoader`
+            that wraps data loaders to automatically restart upon a ``StopIteration``.
+
+        Args:
+            data_iter (Iterator): Iterator of data to evaluate.
+
         Returns:
-            The arithmetic mean of the losses over all micro-batches.
+            The arithmetic mean of the losses computed over this batch.
         """
+
         self.module.eval()
         self.total_loss = None
 
@@ -331,6 +363,14 @@ class PipelineEngine(DeepSpeedEngine):
 
         return self.agg_eval_loss
 
+    def is_first_stage(self):
+        """True if this process is in the first stage in the pipeline."""
+        return self.stage_id == 0
+
+    def is_last_stage(self):
+        """True if this process is in the last stage in the pipeline."""
+        return self.stage_id == self.num_stages - 1
+
     def _aggregate_total_loss(self):
         # Scale loss, average among DP ranks, and bcast loss to the rest of my DP group
         if self.is_last_stage():
@@ -364,7 +404,7 @@ class PipelineEngine(DeepSpeedEngine):
         return agg_loss
 
     def set_dataloader(self, loader):
-        """ Store a DataLoader to sample for training data. """
+        """"""
         if self.is_first_stage() or self.is_last_stage():
             self.training_dataloader = loader
             self.data_iterator = iter(self.training_dataloader)
@@ -993,12 +1033,15 @@ class PipelineEngine(DeepSpeedEngine):
         return buffers
 
     def forward(self, *args, **kwargs):
+        """Disabled for pipeline parallel training. See ``train_batch()``. """
         raise PipelineError("Only train_batch() is accessible in pipeline mode.")
 
     def backward(self, *args, **kwargs):
+        """Disabled for pipeline parallel training. See ``train_batch()``. """
         raise PipelineError("Only train_batch() is accessible in pipeline mode.")
 
     def step(self, *args, **kwargs):
+        """Disabled for pipeline parallel training. See ``train_batch()``. """
         raise PipelineError("Only train_batch() is accessible in pipeline mode.")
 
     def mem_status(self, msg, print_rank=-1, reset_max=False):
@@ -1084,14 +1127,6 @@ class PipelineEngine(DeepSpeedEngine):
 
         self.module.load_state_dir(state_dict, strict=strict)
 
-    def is_first_stage(self):
-        """True if this process is in the first stage in the pipeline."""
-        return self.stage_id == 0
-
-    def is_last_stage(self):
-        """True if this process is in the last stage in the pipeline."""
-        return self.stage_id == self.num_stages - 1
-
     # A map of PipeInstruction types to methods. Each method will be executed with the
     # kwargs provided to the PipeInstruction from the scheduler.
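     # For example (an illustrative sketch of the dispatch, not a literal
     # transcript): a schedule.ForwardPass(buffer_id=0) instruction is looked up
     # by type in this map and executed as self._exec_forward_pass(buffer_id=0).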
     _INSTRUCTION_MAP = {
diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 0b8119f3..6d24ed46 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -26,6 +26,8 @@ class LayerSpec:
     LayerSpec stores the type information and parameters for each stage in a
     PipelineModule. For example:
 
+    .. code-block:: python
+
         nn.Sequential(
             torch.nn.Linear(self.in_dim, self.hidden_dim, bias=False),
             torch.nn.Linear(self.hidden_dim, self.out_dim)
@@ -33,6 +35,8 @@ class LayerSpec:
 
     becomes
 
+    .. code-block:: python
+
         layer_specs = [
             LayerSpec(torch.nn.Linear, self.in_dim, self.hidden_dim, bias=False),
             LayerSpec(torch.nn.Linear, self.hidden_dim, self.out_dim)]
@@ -79,44 +83,46 @@ class TiedLayerSpec(LayerSpec):
 
 
 class PipelineModule(nn.Module):
-    """Base class for modules to be parallelized with pipeline parallelism.
-
-    Users should subclass PipelineModule and provide layer_specs(), which returns a list
-    of LayerSpec objects. Thes sequence of layers represents the pipeline-parallel model.
-    After initialization, a PipelineModule can be used as a traditional torch.nn.Module.
-
-    The forward pass is already provided by this base class. The key assumption is that
-    the output of each layer can be directly fed as input to the next, like a
-    torch.nn.Sequence.
-
-    The key constraint that enables pipeline parallelism is the representation of the
-    forward pass as a sequence of layers (i.e., stages) and the enforcement of a
-    simple interface between them.
-
-    Example:
-
-        class LinearPipeline(PipelineModule):
-            def __init__(self, in_dim, hidden_dim, out_dim):
-                self.in_dim = in_dim
-                self.hidden_dim = hidden_dim
-                self.out_dim = out_dim
-                super().__init__()
-
-            def layer_specs(self):
-                return [LayerSpec(torch.nn.Linear, self.in_dim, self.hidden_dim, bias=False),
-                        LayerSpec(torch.nn.Linear, self.hidden_hidden, self.out_dim)]
-    """
     def __init__(self,
                  layers,
                  num_stages=None,
-                 loss_fn=None,
                  topology=None,
+                 loss_fn=None,
                  seed_layers=False,
                  seed_fn=None,
                  base_seed=1234,
                  partition_method='parameters',
                  activation_checkpoint_interval=0,
                  activation_checkpoint_func=checkpointing.checkpoint):
+        """Modules to be parallelized with pipeline parallelism.
+
+        The key constraint that enables pipeline parallelism is the
+        representation of the forward pass as a sequence of layers
+        and the enforcement of a simple interface between them. The
+        forward pass is implicitly defined by the module ``layers``. The key
+        assumption is that the output of each layer can be directly fed as
+        input to the next, like a ``torch.nn.Sequential``. The forward pass is
+        implicitly:
+
+        .. code-block:: python
+
+            def forward(self, inputs):
+                x = inputs
+                for layer in self.layers:
+                    x = layer(x)
+                return x
+
+        Args:
+            layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module.
+            num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided.
+            topology (``deepspeed.pipe.ProcessTopology``, optional): Defines the axes of parallelism for training. Must be provided if ``num_stages`` is ``None``.
+            loss_fn (callable, optional): Loss is computed as ``loss = loss_fn(outputs, label)``.
+            base_seed (int, optional): The starting seed for layer RNGs when ``seed_layers`` is enabled. Defaults to 1234.
+            partition_method (str, optional): The method used to partition the layers among pipeline stages, e.g., balancing the number of trainable ``'parameters'`` per stage. Defaults to 'parameters'.
+            activation_checkpoint_interval (int, optional): The granularity of activation checkpointing in terms of number of layers. 0 disables activation checkpointing.
+ activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. + """ + super().__init__() if num_stages is None and topology is None: @@ -488,7 +494,6 @@ class PipelineModule(nn.Module): self._local_stop = stop def set_checkpoint_interval(self, interval): - """ Checkpoint activations after each ``interval`` layers. Use 0 to disable. """ assert interval >= 0 self.checkpoint_interval = interval diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index d49496c5..e065c425 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -20,7 +20,7 @@ copyright = '2020, Microsoft' author = 'Microsoft' # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = '0.3.0' master_doc = 'index' diff --git a/docs/code-docs/source/deepspeed.pt.rst b/docs/code-docs/source/deepspeed.pt.rst deleted file mode 100644 index 991963e3..00000000 --- a/docs/code-docs/source/deepspeed.pt.rst +++ /dev/null @@ -1,134 +0,0 @@ -deepspeed.pt package -==================== - -Submodules ----------- - -deepspeed.pt.deepspeed\_config module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_config - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_constants module ----------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_constants - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_csr\_tensor module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_csr_tensor - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_dataloader module ------------------------------------------ - -.. automodule:: deepspeed.pt.deepspeed_dataloader - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_fused\_lamb module ------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_fused_lamb - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_launch module -------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_launch - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_light module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_light - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_lr\_schedules module --------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_lr_schedules - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_run module ----------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_run - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_timer module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_timer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_utils module ------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_utils - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.deepspeed\_zero\_optimizer module ----------------------------------------------- - -.. automodule:: deepspeed.pt.deepspeed_zero_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_optimizer module ------------------------------------ - -.. 
automodule:: deepspeed.pt.fp16_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.fp16\_unfused\_optimizer module --------------------------------------------- - -.. automodule:: deepspeed.pt.fp16_unfused_optimizer - :members: - :undoc-members: - :show-inheritance: - -deepspeed.pt.loss\_scaler module --------------------------------- - -.. automodule:: deepspeed.pt.loss_scaler - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: deepspeed.pt - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/code-docs/source/deepspeed.rst b/docs/code-docs/source/deepspeed.rst index 480793bb..54a37e88 100644 --- a/docs/code-docs/source/deepspeed.rst +++ b/docs/code-docs/source/deepspeed.rst @@ -1,36 +1,6 @@ -deepspeed package -================= +DeepSpeed +========= -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - deepspeed.pt - -Submodules ----------- - -deepspeed.git\_version\_info module ------------------------------------ - -.. automodule:: deepspeed.git_version_info - :members: - :undoc-members: - :show-inheritance: - -deepspeed.install\_config module --------------------------------- - -.. automodule:: deepspeed.install_config - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- .. automodule:: deepspeed :members: diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 50b15940..faf818c6 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -40,7 +40,6 @@ Pipeline Parallelism :maxdepth: 2 pipeline - pipeline-extending Indices and tables diff --git a/docs/code-docs/source/model-checkpointing.rst b/docs/code-docs/source/model-checkpointing.rst index eaf349b2..064f228f 100644 --- a/docs/code-docs/source/model-checkpointing.rst +++ b/docs/code-docs/source/model-checkpointing.rst @@ -5,8 +5,8 @@ DeepSpeed provides routines for checkpointing model state during training. Loading Training Checkpoints ---------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.load_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.load_checkpoint Saving Training Checkpoints --------------------------- -.. autofunction:: deepspeed.DeepSpeedLight.save_checkpoint +.. autofunction:: deepspeed.DeepSpeedEngine.save_checkpoint diff --git a/docs/code-docs/source/modules.rst b/docs/code-docs/source/modules.rst deleted file mode 100644 index ffb76bdd..00000000 --- a/docs/code-docs/source/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -deepspeed -========= - -.. toctree:: - :maxdepth: 4 - - deepspeed diff --git a/docs/code-docs/source/pipeline-extending.rst b/docs/code-docs/source/pipeline-extending.rst deleted file mode 100644 index f9319e31..00000000 --- a/docs/code-docs/source/pipeline-extending.rst +++ /dev/null @@ -1,5 +0,0 @@ -Extending Pipeline Parallelism -============================== - -.. automodule:: deepspeed.runtime.pipe.schedule - :members: diff --git a/docs/code-docs/source/pipeline.rst b/docs/code-docs/source/pipeline.rst index 3cc7c183..b82ea05f 100644 --- a/docs/code-docs/source/pipeline.rst +++ b/docs/code-docs/source/pipeline.rst @@ -1,8 +1,24 @@ Pipeline Parallelism ==================== +Model Specification +-------------------- +.. autoclass:: deepspeed.pipe.PipelineModule + :members: + +.. autoclass:: deepspeed.pipe.LayerSpec + :members: + +.. autoclass:: deepspeed.pipe.TiedLayerSpec + :members: + + +Training +-------- .. automodule:: deepspeed.runtime.pipe.engine :members: -.. 
automodule:: deepspeed.runtime.pipe.topology
+Extending Pipeline Parallelism
+------------------------------
+.. automodule:: deepspeed.runtime.pipe.schedule
     :members:
diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst
index 55ce4642..5a309b55 100644
--- a/docs/code-docs/source/training.rst
+++ b/docs/code-docs/source/training.rst
@@ -1,8 +1,8 @@
 Training API
 ============
 
-:func:`deepspeed.initialize` returns a *model engine* in its first argument
-of type ``DeepSpeedLight``. This engine is used to progress training:
+:func:`deepspeed.initialize` returns a *training engine* of type
+:class:`DeepSpeedEngine` as its first return value. This engine is used to progress training:
 
 .. code-block:: python
 
@@ -18,12 +18,17 @@ of type ``DeepSpeedLight``. This engine is used to progress training:
 Forward Propagation
 -------------------
-.. autofunction:: deepspeed.DeepSpeedLight.forward
+.. autofunction:: deepspeed.DeepSpeedEngine.forward
 
 Backward Propagation
 --------------------
-.. autofunction:: deepspeed.DeepSpeedLight.backward
+.. autofunction:: deepspeed.DeepSpeedEngine.backward
 
 Optimizer Step
 --------------
-.. autofunction:: deepspeed.DeepSpeedLight.step
+.. autofunction:: deepspeed.DeepSpeedEngine.step
+
+
+Gradient Accumulation
+---------------------
+.. autofunction:: deepspeed.DeepSpeedEngine.is_gradient_accumulation_boundary
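+
+For example, a minimal sketch of using this query inside the training loop
+above (``model_engine`` and ``data_loader`` are assumed to come from
+:func:`deepspeed.initialize`):
+
+.. code-block:: python
+
+    for step, batch in enumerate(data_loader):
+        loss = model_engine(batch)
+        model_engine.backward(loss)
+
+        # True when this micro-batch completes gradient accumulation, i.e. the
+        # upcoming step() will reduce gradients and update the model weights.
+        will_update = model_engine.is_gradient_accumulation_boundary()
+
+        model_engine.step()
+        if will_update:
+            print(f'step={step}: optimizer stepped, loss={loss.item():.4f}')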