Unverified · Commit 43bf035c authored by Michael Wyatt, committed by GitHub

Update docs to autogenerate pydantic config model docs (#2509)

* update zero config docs
* add autogenerated docs for pydantic models used in ZeRO and Inference configs
Parent b5d18a6a
......@@ -235,27 +235,39 @@ def init_inference(model, config=None, **kwargs):
Description: all four cases are valid and supported in the DeepSpeed init_inference() API.
# Case 1: user provides no config and no kwargs. Default config will be used.
generator.model = deepspeed.init_inference(generator.model)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model)
string = generator("DeepSpeed is")
print(string)
# Case 2: user provides a config and no kwargs. User supplied config will be used.
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
# Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
Arguments:
model: Required: original nn.Module object without any wrappers
......
......@@ -100,6 +100,23 @@ class DeepSpeedConfigModel(BaseModel):
extra = "forbid"
class pp_int(int):
"""
A wrapper for integers that will return a custom string or comma-formatted
string of the integer. For example, print(pp_int(1e5)) will return
"100,000". This is useful mainly for auto-generated documentation purposes.
"""
def __new__(cls, val, custom_print_str=None):
inst = super().__new__(cls, val)
inst.custom_print_str = custom_print_str
return inst
def __repr__(self):
if self.custom_print_str:
return self.custom_print_str
return f"{self.real:,}"
# adapted from https://stackoverflow.com/a/50701137/9201239
class ScientificNotationEncoder(json.JSONEncoder):
"""
......
......@@ -7,7 +7,7 @@ from pydantic import Field, validator
import sys
from typing import Optional
from enum import Enum
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigModel
from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel
from deepspeed.utils import logger
from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum
......@@ -67,6 +67,7 @@ def get_zero_config(param_dict):
class ZeroStageEnum(int, Enum):
""" Enum class for possible zero stages """
disabled = 0
optimizer_states = 1
gradients = 2
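Since ``ZeroStageEnum`` subclasses ``int``, its members compare equal to plain integers, which is what lets a bare number such as ``"stage": 2`` in a JSON config validate against the enum. A minimal illustration:

.. code-block:: python

    assert ZeroStageEnum.optimizer_states == 1
    assert ZeroStageEnum(2) is ZeroStageEnum.gradients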
......@@ -75,21 +76,86 @@ class ZeroStageEnum(int, Enum):
class DeepSpeedZeroConfig(DeepSpeedConfigModel):
stage: ZeroStageEnum = ZeroStageEnum.disabled
"""
Sets parameters for ZeRO optimizations.
"""
stage: ZeroStageEnum = 0
"""
Chooses different stages of the ZeRO optimizer. Stage 0, 1, 2, and 3 refer
to disabled, optimizer state partitioning, optimizer+gradient state
partitioning, and optimizer+gradient+parameter partitioning, respectively.
"""
contiguous_gradients: bool = True
"""
Copies the gradients to a contiguous buffer as they are produced. Avoids
memory fragmentation during backward pass.
"""
reduce_scatter: bool = True
reduce_bucket_size: int = Field(5e8, ge=0)
"""
Uses reduce or reduce scatter instead of allreduce to average gradients
"""
reduce_bucket_size: int = Field(pp_int(5e8), ge=0)
"""
Number of elements reduced/allreduced at a time. Limits the memory required
for the allreduce for large model sizes
"""
allgather_partitions: bool = True
allgather_bucket_size: int = Field(5e8, ge=0)
overlap_comm: bool = None # None for dynamic default value
"""
Chooses between allgather collective or a series of broadcast collectives
to gather updated parameters from all the GPUs at the end of each step
"""
allgather_bucket_size: int = Field(pp_int(5e8), ge=0)
"""
Number of elements allgathered at a time. Limits the memory required for
the allgather for large model sizes
"""
overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below)
"""
Attempts to overlap the reduction of the gradients with backward computation
"""
load_from_fp32_weights: bool = True
"""
Boolean indicating whether to initialize fp32 master weights from fp32
copies in the checkpoint (no precision loss) or from the model's fp16 copies
(with precision loss). This can be used to initialize optimizer state even
when the checkpoint is missing optimizer state.
elastic_checkpoint: bool = False
"""
Enable loading a checkpoint that was saved by a job with a different GPU
count. This feature is no longer supported.
"""
# Offload Specific Parameters
offload_param: Optional[DeepSpeedZeroOffloadParamConfig] = None
"""
Enable offloading of model parameters to CPU or NVMe. This frees up GPU
memory for larger models or batch sizes. Valid only with stage 3. Expects a
dictionary containing values for `DeepSpeedZeroOffloadParamConfig`_.
"""
offload_optimizer: Optional[DeepSpeedZeroOffloadOptimizerConfig] = None
sub_group_size: int = Field(1e9, ge=0)
"""
Enable offloading of optimizer state to CPU or NVMe, and optimizer
computation to CPU. This frees up GPU memory for larger models or batch
sizes. Valid for ZeRO stages 1, 2, and 3. Expects a dictionary containing
values for `DeepSpeedZeroOffloadOptimizerConfig`_.
"""
sub_group_size: int = Field(pp_int(1e9), ge=0)
"""
Tile size for parameter processing to fit massive models (with trillions of
parameters). Used by ZeRO3-Offload and ZeRO-Infinity
"""
cpu_offload_param: bool = Field(
None,
deprecated=True,
......@@ -98,12 +164,16 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu)
if val else None),
)
""" Deprecated, please use ``offload_param`` """
cpu_offload_use_pin_memory: bool = Field(
None,
deprecated=True,
new_param="offload_param or offload_optimizer",
set_new_param=False,
)
""" Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
cpu_offload: bool = Field(
None,
deprecated=True,
......@@ -112,29 +182,90 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu)
if val else None),
)
""" Deprecated, please use ``offload_optimizer`` """
prefetch_bucket_size: int = Field(pp_int(5e7),
ge=0,
alias="stage3_prefetch_bucket_size")
"""
Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3,
ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
"""
# Stage3 Specific Parameters
prefetch_bucket_size: int = Field(5e7, ge=0, alias="stage3_prefetch_bucket_size")
param_persistence_threshold: int = Field(1e5,
param_persistence_threshold: int = Field(pp_int(1e5),
ge=0,
alias="stage3_param_persistence_threshold")
model_persistence_threshold: int = Field(sys.maxsize,
"""
Do not partition parameters smaller than this threshold. Smaller values use
less memory, but can greatly increase communication (especially
latency-bound messages).
"""
model_persistence_threshold: int = Field(pp_int(sys.maxsize,
"sys.maxsize"),
ge=0,
alias="stage3_model_persistence_threshold")
max_live_parameters: int = Field(1e9, ge=0, alias="stage3_max_live_parameters")
max_reuse_distance: int = Field(1e9, ge=0, alias="stage3_max_reuse_distance")
"""
Maximum number of parameter elements that can be persisted in GPU and not
partitioned. This imposes an upper bound on the number of unpartitioned
parameters resulting from the ``param_persistence_threshold`` setting. Used
by ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference.
"""
max_live_parameters: int = Field(pp_int(1e9),
ge=0,
alias="stage3_max_live_parameters")
"""
The maximum number of parameters resident per GPU before releasing. Smaller
values use less memory, but perform more communication.
"""
max_reuse_distance: int = Field(pp_int(1e9), ge=0, alias="stage3_max_reuse_distance")
"""
Do not release a parameter if it will be reused within this threshold of
parameters. Smaller values use less memory, but perform more communication.
"""
gather_16bit_weights_on_model_save: bool = Field(
False,
alias="stage3_gather_16bit_weights_on_model_save")
"""
Consolidate the weights before saving the model with ``save_16bit_model()``.
Since the weights are partitioned across GPUs, they are not part of
``state_dict``, so this function automatically gathers the weights when
this option is enabled and then saves the fp16 model weights.
"""
stage3_gather_fp16_weights_on_model_save: bool = Field(
False,
deprecated=True,
new_param="gather_16bit_weights_on_model_save")
""" Deprecated, please use ``gather_16bit_weights_on_model_save`` """
ignore_unused_parameters: bool = True
"""
Unused parameters in modules may be unexpected in static networks, but
could be normal in dynamic networks. This controls whether training should
terminate with an error message when unused parameters are detected. It is
set to ``True`` by default, which means unused parameters are ignored and
training continues. This is currently only used in ZeRO stage 2.
"""
legacy_stage1: bool = False
"""
For backwards compatibility, enable the old ZeRO stage 1 implementation. Use
at your own risk; it will be deprecated soon.
"""
round_robin_gradients: bool = False
"""
Stage 1 and 2 optimization for CPU offloading that parallelizes gradient
copying to CPU memory among ranks by fine-grained gradient partitioning.
Performance benefit grows with gradient accumulation steps (more copying
between optimizer steps) or GPU count (increased parallelism).
"""
# Validators
@validator("overlap_comm")
def overlap_comm_valid(cls, field_value, values):
if field_value is None:
......
......@@ -6,33 +6,87 @@ Licensed under the MIT license.
from pydantic import Field, validator
from enum import Enum
from pathlib import Path
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
class OffloadDeviceEnum(str, Enum):
""" Enum for valid offload devices """
none = "none"
cpu = "cpu"
nvme = "nvme"
class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
device: OffloadDeviceEnum = OffloadDeviceEnum.none
""" Set options for parameter offload. Valid only with stage 3. """
device: OffloadDeviceEnum = "none"
"""
Device memory to offload model parameters. Supported options are `cpu` and
`nvme`.
"""
nvme_path: Path = None
""" Filesystem path for NVMe device for parameter offloading. """
buffer_count: int = Field(5, ge=0)
buffer_size: int = Field(1e8, ge=0)
max_in_cpu: int = Field(1e9, ge=0)
""" Number of buffers in buffer pool for parameter offloading to NVMe. """
buffer_size: int = Field(pp_int(1e8), ge=0)
""" Size of buffers in buffer pool for parameter offloading to NVMe. """
max_in_cpu: int = Field(pp_int(1e9), ge=0)
"""
Number of parameter elements to maintain in CPU memory when offloading to
NVMe is enabled.
"""
pin_memory: bool = False
"""
Offload to page-locked CPU memory. This could boost throughput at the cost
of extra memory overhead.
"""
class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
device: OffloadDeviceEnum = OffloadDeviceEnum.none
""" Set options for optimizer offload. Valid with stage 1, 2, and 3. """
device: OffloadDeviceEnum = "none"
"""
Device memory to offload optimizer state. Supported options are `cpu` and
`nvme`. Optimizer computation is offloaded to CPU regardless of the device
option.
"""
nvme_path: Path = None
""" Filesystem path for NVMe device for optimizer state offloading. """
buffer_count: int = Field(4, ge=0)
"""
Number of buffers in buffer pool for optimizer state offloading to NVMe.
This should be at least the number of states maintained per parameter by
the optimizer. For example, the Adam optimizer has four states (parameter,
gradient, momentum, and variance).
"""
pin_memory: bool = False
"""
Offload to page-locked CPU memory. This could boost throughput at the cost
of extra memory overhead.
"""
pipeline_read: bool = False
"""
For tile-based optimizer step processing, overlap read of next tile with
computation of current tile. Used in ZeRO-Infinity.
"""
pipeline_write: bool = False
fast_init: bool = False
"""
For tile-based optimizer step processing, overlap write of previous tile
with computation of current tile.
"""
fast_init: bool = False
""" Enable fast optimizer initialization when offloading to NVMe. """
@validator("pipeline_read", "pipeline_write", always=True)
def set_pipeline(cls, field_value, values):
values["pipeline"] = field_value or values.get("pipeline", False)
......
......@@ -20,7 +20,8 @@ copyright = '2020, Microsoft'
author = 'Microsoft'
# The full version, including alpha/beta/rc tags
release = '0.6'
with open("../../../version.txt", "r") as f:
release = f.readline().rstrip()
master_doc = 'index'
......@@ -37,10 +38,25 @@ extensions = [
'sphinx.ext.viewcode',
'recommonmark',
'sphinx_rtd_theme',
'sphinxcontrib.autodoc_pydantic',
'sphinx.ext.autosectionlabel',
]
pygments_style = 'sphinx'
# autodoc_pydantic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = ' '
autodoc_pydantic_model_signature_prefix = 'class'
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = 'bysource'
autodoc_pydantic_model_member_order = 'bysource'
autodoc_pydantic_field_list_validators = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......@@ -75,6 +91,6 @@ sys.path.insert(0, os.path.abspath('../../../'))
# Prepend module names to class descriptions?
add_module_names = True
autoclass_content = 'both'
autoclass_content = 'auto'
autodoc_mock_imports = ["apex", "mpi4py", "tensorboardX", "numpy", "cupy"]
......@@ -19,7 +19,7 @@ Training API
training
Inference API
------------
-------------
.. toctree::
:maxdepth: 2
......@@ -64,7 +64,7 @@ Pipeline Parallelism
pipeline
Optimizers
--------------------
----------
.. toctree::
:maxdepth: 2
......
......@@ -6,6 +6,25 @@ Example usage:
.. code-block:: python
engine = deepspeed.init_inference(model=net)
engine = deepspeed.init_inference(model=net, config=config)
The ``DeepSpeedInferenceConfig`` is used to control all aspects of initializing
the ``InferenceEngine``. The config should be passed as a dictionary to
``init_inference``, but parameters can also be passed as keyword arguments.
.. _DeepSpeedInferenceConfig:
.. autopydantic_model:: deepspeed.inference.config.DeepSpeedInferenceConfig
Example config:
.. code-block:: python
config = {
"kernel_inject": True,
"tensor_parallel": {"tp_size": 4},
"dtype": "fp16",
"enable_cuda_graph": False,
"replace_method": "auto",
}
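The dictionary is then passed directly to ``init_inference``, as in the examples at the top of this page:

.. code-block:: python

    engine = deepspeed.init_inference(model=net, config=config)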
.. autofunction:: deepspeed.init_inference
......@@ -51,6 +51,24 @@ for a complete list of options for configuration and performance tuning.
our `optimizer config <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
to instruct :meth:`deepspeed.initialize` to build the optimizer for you.
ZeRO Configurations
===================
All the settings for DeepSpeed ZeRO are controlled by the `DeepSpeedZeroConfig`_.
The dictionary provided under the ``zero_optimization`` entry of the main
DeepSpeed configuration dict will be parsed and validated with this class.
Sub-configurations for parameter offload and optimizer offload settings are
parsed by `DeepSpeedZeroOffloadParamConfig`_ and
`DeepSpeedZeroOffloadOptimizerConfig`_.
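For example, the following skeleton shows where these settings live in the main DeepSpeed config (values are placeholders):

.. code-block:: python

    ds_config = {
        "train_batch_size": 8,
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {"device": "cpu"},
        },
    }
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)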
.. _DeepSpeedZeroConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroConfig
.. _DeepSpeedZeroOffloadParamConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadParamConfig
.. _DeepSpeedZeroOffloadOptimizerConfig:
.. autopydantic_model:: deepspeed.runtime.zero.config.DeepSpeedZeroOffloadOptimizerConfig
Example ZeRO-3 Configurations
......
autodoc_pydantic
docutils<0.18
hjson
packaging
......