Unverified · Commit 43bf035c authored by Michael Wyatt, committed by GitHub

Update docs to autogenerate pydantic config model docs (#2509)

* update zero config docs
* add autogenerated docs for pydantic models used in ZeRO and Inference configs
Parent b5d18a6a
......@@ -235,27 +235,39 @@ def init_inference(model, config=None, **kwargs):
Description: all four cases are valid and supported in the DeepSpeed init_inference() API.
# Case 1: user provides no config and no kwargs. Default config will be used.
generator.model = deepspeed.init_inference(generator.model)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model)
string = generator("DeepSpeed is")
print(string)
# Case 2: user provides a config and no kwargs. User supplied config will be used.
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
# Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
Arguments:
model: Required: original nn.Module object without any wrappers
......
......@@ -100,6 +100,23 @@ class DeepSpeedConfigModel(BaseModel):
extra = "forbid"
class pp_int(int):
"""
A wrapper for integers that will return a custom string or comma-formatted
string of the integer. For example, print(pp_int(1e5)) will return
"100,000". This is useful mainly for auto-generated documentation purposes.
"""
def __new__(cls, val, custom_print_str=None):
inst = super().__new__(cls, val)
inst.custom_print_str = custom_print_str
return inst
def __repr__(self):
if self.custom_print_str:
return self.custom_print_str
return f"{self.real:,}"
# adapted from https://stackoverflow.com/a/50701137/9201239
class ScientificNotationEncoder(json.JSONEncoder):
"""
......
......@@ -7,7 +7,7 @@ from pydantic import Field, validator
import sys
from typing import Optional
from enum import Enum
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigModel
from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel
from deepspeed.utils import logger
from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum
......@@ -67,6 +67,7 @@ def get_zero_config(param_dict):
class ZeroStageEnum(int, Enum):
""" Enum class for possible zero stages """
disabled = 0
optimizer_states = 1
gradients = 2
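Since ``ZeroStageEnum`` subclasses ``int``, its members compare equal to plain integers, which is what lets a bare number such as ``"stage": 2`` in a JSON config validate against the enum. A minimal illustration:

.. code-block:: python

    assert ZeroStageEnum.optimizer_states == 1
    assert ZeroStageEnum(2) is ZeroStageEnum.gradients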
......@@ -75,21 +76,86 @@ class ZeroStageEnum(int, Enum):
class DeepSpeedZeroConfig(DeepSpeedConfigModel):
stage: ZeroStageEnum = ZeroStageEnum.disabled
"""
Sets parameters for ZeRO optimizations.
"""
stage: ZeroStageEnum = 0
"""
Chooses different stages of the ZeRO optimizer. Stage 0, 1, 2, and 3 refer
to disabled, optimizer state partitioning, optimizer+gradient state
partitioning, and optimizer+gradient+parameter partitioning, respectively.
"""
contiguous_gradients: bool = True
"""
Copies the gradients to a contiguous buffer as they are produced. Avoids
memory fragmentation during backward pass.
"""
reduce_scatter: bool = True
reduce_bucket_size: int = Field(5e8, ge=0)
"""
Uses reduce or reduce scatter instead of allreduce to average gradients
"""
reduce_bucket_size: int = Field(pp_int(5e8), ge=0)
"""
Number of elements reduced/allreduced at a time. Limits the memory required
for the allreduce for large model sizes
"""
allgather_partitions: bool = True
allgather_bucket_size: int = Field(5e8, ge=0)
overlap_comm: bool = None # None for dynamic default value
"""
Chooses between allgather collective or a series of broadcast collectives
to gather updated parameters from all the GPUs at the end of each step
"""
allgather_bucket_size: int = Field(pp_int(5e8), ge=0)
"""
Number of elements allgathered at a time. Limits the memory required for
the allgather for large model sizes
"""
overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below)
"""
Attempts to overlap the reduction of the gradients with backward computation
"""
load_from_fp32_weights: bool = True
"""
Boolean indicating whether to initialize fp32 master weights from fp32
copies in the checkpoint (no precision loss) or from the model's fp16 copies
(with precision loss). This can be used to initialize optimizer state even
when the checkpoint is missing optimizer state.
elastic_checkpoint: bool = False
"""
Enable loading a checkpoint that was saved by a job with a different GPU
count. This feature is no longer supported.
"""
# Offload Specific Parameters
offload_param: Optional[DeepSpeedZeroOffloadParamConfig] = None
"""
Enable offloading of model parameters to CPU or NVMe. This frees up GPU
memory for larger models or batch sizes. Valid only with stage 3. Expects a
dictionary containing values for `DeepSpeedZeroOffloadParamConfig`_.
"""
offload_optimizer: Optional[DeepSpeedZeroOffloadOptimizerConfig] = None
sub_group_size: int = Field(1e9, ge=0)
"""
Enable offloading of optimizer state to CPU or NVMe, and optimizer
computation to CPU. This frees up GPU memory for larger models or batch
sizes. Valid for ZeRO stages 1, 2, and 3. Expects a dictionary containing
values for `DeepSpeedZeroOffloadOptimizerConfig`_.
"""
sub_group_size: int = Field(pp_int(1e9), ge=0)
"""
Tile size for parameter processing to fit massive models (with trillions of
parameters). Used by ZeRO3-Offload and ZeRO-Infinity
"""
cpu_offload_param: bool = Field(
None,
deprecated=True,
......@@ -98,12 +164,16 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu)
if val else None),
)
""" Deprecated, please use ``offload_param`` """
cpu_offload_use_pin_memory: bool = Field(
None,
deprecated=True,
new_param="offload_param or offload_optimizer",
set_new_param=False,
)
""" Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
cpu_offload: bool = Field(
None,
deprecated=True,
......@@ -112,29 +182,90 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu)
if val else None),
)
""" Deprecated, please use ``offload_optimizer`` """
prefetch_bucket_size: int = Field(pp_int(5e7),
ge=0,
alias="stage3_prefetch_bucket_size")
"""
Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3,
ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
"""
# Stage3 Specific Parameters
prefetch_bucket_size: int = Field(5e7, ge=0, alias="stage3_prefetch_bucket_size")
param_persistence_threshold: int = Field(1e5,
param_persistence_threshold: int = Field(pp_int(1e5),
ge=0,
alias="stage3_param_persistence_threshold")
model_persistence_threshold: int = Field(sys.maxsize,
"""
Do not partition parameters smaller than this threshold. Smaller values use
less memory, but can greatly increase communication (especially
latency-bound messages).
"""
model_persistence_threshold: int = Field(pp_int(sys.maxsize,
"sys.maxsize"),
ge=0,
alias="stage3_model_persistence_threshold")
max_live_parameters: int = Field(1e9, ge=0, alias="stage3_max_live_parameters")
max_reuse_distance: int = Field(1e9, ge=0, alias="stage3_max_reuse_distance")
"""
Maximum number of parameter elements that can be persisted in GPU and not
partitioned. This imposes an upper bound on the number of unpartitioned
parameters resulting from the ``param_persistence_threshold`` setting. Used
by ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
"""
max_live_parameters: int = Field(pp_int(1e9),
ge=0,
alias="stage3_max_live_parameters")
"""
The maximum number of parameters resident per GPU before releasing. Smaller
values use less memory, but perform more communication.
"""
max_reuse_distance: int = Field(pp_int(1e9), ge=0, alias="stage3_max_reuse_distance")
"""
Do not release a parameter if it will be reused within this threshold of
parameters. Smaller values use less memory, but perform more communication.
"""
gather_16bit_weights_on_model_save: bool = Field(
False,
alias="stage3_gather_16bit_weights_on_model_save")
"""
Consolidate the weights before saving the model with ``save_16bit_model()``.
Since the weights are partitioned across GPUs, they are not part of
``state_dict``, so this function automatically gathers the weights when
this option is enabled and then saves the fp16 model weights.
"""
stage3_gather_fp16_weights_on_model_save: bool = Field(
False,
deprecated=True,
new_param="gather_16bit_weights_on_model_save")
""" Deprecated, please use ``gather_16bit_weights_on_model_save`` """
ignore_unused_parameters: bool = True
"""
Unused parameters in modules may be unexpected in static networks, but
could be normal in dynamic networks. This controls whether training should
terminate with an error message when unused parameters are detected. It is
set to ``True`` by default, which means unused parameters are ignored and
training continues. This is currently only used in ZeRO stage 2.
"""
legacy_stage1: bool = False
"""
For backwards compatibility, enable the old ZeRO stage 1 implementation. Use
at your own risk; it will be deprecated soon.
"""
round_robin_gradients: bool = False
"""
Stage 1 and 2 optimization for CPU offloading that parallelizes gradient
copying to CPU memory among ranks by fine-grained gradient partitioning.
Performance benefit grows with gradient accumulation steps (more copying
between optimizer steps) or GPU count (increased parallelism).
"""
# Validators
@validator("overlap_comm")
def overlap_comm_valid(cls, field_value, values):
if field_value is None:
......
......@@ -6,33 +6,87 @@ Licensed under the MIT license.
from pydantic import Field, validator
from enum import Enum
from pathlib import Path
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
class OffloadDeviceEnum(str, Enum):
""" Enum for valid offload devices """
none = "none"
cpu = "cpu"
nvme = "nvme"
class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
device: OffloadDeviceEnum = OffloadDeviceEnum.none
""" Set options for parameter offload. Valid only with stage 3. """
device: OffloadDeviceEnum = "none"
"""
Device memory to offload model parameters. Supported options are `cpu` and
`nvme`.
"""
nvme_path: Path = None
""" Filesystem path for NVMe device for parameter offloading. """
buffer_count: int = Field(5, ge=0)
buffer_size: int = Field(1e8, ge=0)
max_in_cpu: int = Field(1e9, ge=0)
""" Number of buffers in buffer pool for parameter offloading to NVMe. """
buffer_size: int = Field(pp_int(1e8), ge=0)
""" Size of buffers in buffer pool for parameter offloading to NVMe. """
max_in_cpu: int = Field(pp_int(1e9), ge=0)
"""
Number of parameter elements to maintain in CPU memory when offloading to
NVMe is enabled.
"""
pin_memory: bool = False
"""
Offload to page-locked CPU memory. This could boost throughput at the cost
of extra memory overhead.
"""
class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
device: OffloadDeviceEnum = OffloadDeviceEnum.none
""" Set options for optimizer offload. Valid with stage 1, 2, and 3. """
device: OffloadDeviceEnum = "none"
"""
Device memory to offload optimizer state. Supported options are `cpu` and
`nvme`. Optimizer computation is offloaded to CPU regardless of the device
option.
"""
nvme_path: Path = None
""" Filesystem path for NVMe device for optimizer state offloading. """
buffer_count: int = Field(4, ge=0)
"""
Number of buffers in buffer pool for optimizer state offloading to NVMe.
This should be at least the number of states maintained per parameter by
the optimizer. For example, the Adam optimizer has four states (parameter,
gradient, momentum, and variance).
"""
pin_memory: bool = False
"""
Offload to page-locked CPU memory. This could boost throughput at the cost
of extra memory overhead.
"""
pipeline_read: bool = False
"""
For tile-based optimizer step processing, overlap read of next tile with
computation of current tile. Used in ZeRO-Infinity.
"""
pipeline_write: bool = False
fast_init: bool = False
"""
For tile-based optimizer step processing, overlap write of previous tile
with computation of current tile.
"""
fast_init: bool = False
""" Enable fast optimizer initialization when offloading to NVMe. """
@validator("pipeline_read", "pipeline_write", always=True)
def set_pipeline(cls, field_value, values):
values["pipeline"] = field_value or values.get("pipeline", False)
......
......@@ -20,7 +20,8 @@ copyright = '2020, Microsoft'
author = 'Microsoft'
# The full version, including alpha/beta/rc tags
release = '0.6'
with open("../../../version.txt", "r") as f:
release = f.readline().rstrip()
master_doc = 'index'
......@@ -37,10 +38,25 @@ extensions = [
'sphinx.ext.viewcode',
'recommonmark',
'sphinx_rtd_theme',
'sphinxcontrib.autodoc_pydantic',
'sphinx.ext.autosectionlabel',
]
pygments_style = 'sphinx'
# autodoc_pydantic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = ' '
autodoc_pydantic_model_signature_prefix = 'class'
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = 'bysource'
autodoc_pydantic_model_member_order = 'bysource'
autodoc_pydantic_field_list_validators = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......@@ -75,6 +91,6 @@ sys.path.insert(0, os.path.abspath('../../../'))
# Prepend module names to class descriptions?
add_module_names = True
autoclass_content = 'both'
autoclass_content = 'auto'
autodoc_mock_imports = ["apex", "mpi4py", "tensorboardX", "numpy", "cupy"]
......@@ -19,7 +19,7 @@ Training API
training
Inference API
------------
-------------
.. toctree::
:maxdepth: 2
......@@ -64,7 +64,7 @@ Pipeline Parallelism
pipeline
Optimizers
--------------------
----------
.. toctree::
:maxdepth: 2
......
......@@ -6,6 +6,25 @@ Example usage:
.. code-block:: python
engine = deepspeed.init_inference(model=net)
engine = deepspeed.init_inference(model=net, config=config)
The ``DeepSpeedInferenceConfig`` is used to control all aspects of initializing
the ``InferenceEngine``. The config should be passed as a dictionary to
``init_inference``, but parameters can also be passed as keyword arguments.
.. _DeepSpeedInferenceConfig:
.. autopydantic_model:: deepspeed.inference.config.DeepSpeedInferenceConfig
Example config:
.. code-block:: python
config = {
"kernel_inject": True,
"tensor_parallel": {"tp_size": 4},
"dtype": "fp16",
"enable_cuda_graph": False,
"replace_method": "auto",
}
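The dictionary is then passed directly to ``init_inference``, as in the examples at the top of this page:

.. code-block:: python

    engine = deepspeed.init_inference(model=net, config=config)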
.. autofunction:: deepspeed.init_inference
......@@ -51,6 +51,24 @@ for a complete list of options for configuration and performance tuning.
our `optimizer config <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
to instruct :meth:`deepspeed.initialize` to build the optimizer for you.
ZeRO Configurations
===================
All the settings for DeepSpeed ZeRO are controlled by the `DeepSpeedZeroConfig`_.
The dictionary provided under the ``zero_optimization`` entry of the main
DeepSpeed configuration dict will be parsed and validated with this class.
Sub-configurations for parameter offload and optimizer offload settings are
parsed by `DeepSpeedZeroOffloadParamConfig`_ and
`DeepSpeedZeroOffloadOptimizerConfig`_.
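For example, the following skeleton shows where these settings live in the main DeepSpeed config (values are placeholders):

.. code-block:: python

    ds_config = {
        "train_batch_size": 8,
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {"device": "cpu"},
        },
    }
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)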
.. _DeepSpeedZeroConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroConfig
.. _DeepSpeedZeroOffloadParamConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadParamConfig
.. _DeepSpeedZeroOffloadOptimizerConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadOptimizerConfig
Example ZeRO-3 Configurations
......
autodoc_pydantic
docutils<0.18
hjson
packaging
......