diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py
index cb00d0d6be457525234563dc0ede6364ad1adae0..fab292764f9fb3ea3ce6fd4ebe73ba4bf024f110 100644
--- a/docs/code-docs/source/conf.py
+++ b/docs/code-docs/source/conf.py
@@ -20,7 +20,7 @@ copyright = '2020, Microsoft'
 author = 'Microsoft'
 
 # The full version, including alpha/beta/rc tags
-release = '0.3.0'
+release = '0.6'
 
 master_doc = 'index'
 
diff --git a/docs/code-docs/source/memory.rst b/docs/code-docs/source/memory.rst
index 5c92dc199aa4de6002adb6f45a78c01881673ea7..5b29312a0064a53bc2564dcc6c553bd4da3f96b7 100644
--- a/docs/code-docs/source/memory.rst
+++ b/docs/code-docs/source/memory.rst
@@ -7,9 +7,9 @@ API To Estimate Memory Usage
 
 ZeRO2:
 
-.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_live
+.. autofunction:: deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_live
 
-.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_cold
+.. autofunction:: deepspeed.runtime.zero.stage_1_and_2.estimate_zero2_model_states_mem_needs_all_cold
 
 Examples:
 
@@ -18,7 +18,7 @@ Let's try a 3B model with just 1 node with 8 gpus, using live model:
 .. code-block:: bash
 
     python -c 'from transformers import AutoModel; \
-    from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_live; \
+    from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live; \
     model = AutoModel.from_pretrained("t5-3b"); \
     estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=8, num_nodes=1)'
     Estimated memory needed for params, optim states and gradients for a:
@@ -34,7 +34,7 @@ faster as we don't need to load the model.
 .. code-block:: bash
 
-    python -c 'from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_cold; \
+    python -c 'from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_cold; \
     estimate_zero2_model_states_mem_needs_all_cold(total_params=2851e6, num_gpus_per_node=8, num_nodes=1)'
 
     Estimated memory needed for params, optim states and gradients for a:
     HW: Setup with 1 node, 8 GPUs per node.
diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt
index f3ffe3b615a2433e52b475a82901408b33e9ae2e..83f3b3e39b08a9f9c16c8d93a9444139bb9532cb 100644
--- a/requirements/requirements-readthedocs.txt
+++ b/requirements/requirements-readthedocs.txt
@@ -1,5 +1,8 @@
 docutils<0.18
 hjson
+packaging
 psutil
+py-cpuinfo
+pydantic
 torch
 tqdm