Unverified commit e2dfe0d1 authored by Cheng Li, committed by GitHub

Add flops profiler tutorial (#682)

* work on flops profiler tutorial

* update flops profiler tutorial

* add flops profiler tutorial and fix names

* work on flops profiler tutorial

* update flops profiler tutorial

* add flops profiler tutorial and fix names

* fix tailing ws

* fix names

* remove multistep profiling and update docs

* fix cases where functionals and submodules coexist in a parent module, update readme

* fix typo

* always invoke post hook function

* fix module flops sum and update tests

* update tutorial
Parent 6ee3b296
@@ -15,8 +15,7 @@ class DeepSpeedFlopsProfilerConfig(object):
         super(DeepSpeedFlopsProfilerConfig, self).__init__()
         self.enabled = None
-        self.start_step = None
-        self.end_step = None
+        self.profile_step = None
         self.module_depth = None
         self.top_modules = None
@@ -35,13 +34,9 @@ class DeepSpeedFlopsProfilerConfig(object):
                                         FLOPS_PROFILER_ENABLED,
                                         FLOPS_PROFILER_ENABLED_DEFAULT)
-        self.start_step = get_scalar_param(flops_profiler_dict,
-                                           FLOPS_PROFILER_START_STEP,
-                                           FLOPS_PROFILER_START_STEP_DEFAULT)
-        self.end_step = get_scalar_param(flops_profiler_dict,
-                                         FLOPS_PROFILER_END_STEP,
-                                         FLOPS_PROFILER_END_STEP_DEFAULT)
+        self.profile_step = get_scalar_param(flops_profiler_dict,
+                                             FLOPS_PROFILER_PROFILE_STEP,
+                                             FLOPS_PROFILER_PROFILE_STEP_DEFAULT)
         self.module_depth = get_scalar_param(flops_profiler_dict,
                                              FLOPS_PROFILER_MODULE_DEPTH,
@@ -50,3 +45,7 @@ class DeepSpeedFlopsProfilerConfig(object):
         self.top_modules = get_scalar_param(flops_profiler_dict,
                                             FLOPS_PROFILER_TOP_MODULES,
                                             FLOPS_PROFILER_TOP_MODULES_DEFAULT)
+        self.detailed = get_scalar_param(flops_profiler_dict,
+                                         FLOPS_PROFILER_DETAILED,
+                                         FLOPS_PROFILER_DETAILED_DEFAULT)
@@ -12,11 +12,11 @@ FLOPS_PROFILER_FORMAT = '''
 flops profiler should be enabled as:
 "session_params": {
   "flops_profiler": {
-    "enalbe": [true|false],
-    "start_step": 5,
-    "end_step": 6,
+    "enabled": true,
+    "profile_step": 1,
     "module_depth": -1,
     "top_modules": 3,
+    "detailed": true,
   }
 }
 '''
@@ -26,14 +26,14 @@ FLOPS_PROFILER = "flops_profiler"
 FLOPS_PROFILER_ENABLED = "enabled"
 FLOPS_PROFILER_ENABLED_DEFAULT = False
-FLOPS_PROFILER_START_STEP = "start_step"
-FLOPS_PROFILER_START_STEP_DEFAULT = 5
-FLOPS_PROFILER_END_STEP = "end_step"
-FLOPS_PROFILER_END_STEP_DEFAULT = FLOPS_PROFILER_START_STEP_DEFAULT + 1
+FLOPS_PROFILER_PROFILE_STEP = "profile_step"
+FLOPS_PROFILER_PROFILE_STEP_DEFAULT = 1
 FLOPS_PROFILER_MODULE_DEPTH = "module_depth"
 FLOPS_PROFILER_MODULE_DEPTH_DEFAULT = -1
 FLOPS_PROFILER_TOP_MODULES = "top_modules"
 FLOPS_PROFILER_TOP_MODULES_DEFAULT = 3
+FLOPS_PROFILER_DETAILED = "detailed"
+FLOPS_PROFILER_DETAILED_DEFAULT = True
@@ -277,11 +277,8 @@ class DeepSpeedEngine(Module):
     def flops_profiler_enabled(self):
         return self._config.flops_profiler_config.enabled
-    def flops_profiler_start_step(self):
-        return self._config.flops_profiler_config.start_step
-    def flops_profiler_end_step(self):
-        return self._config.flops_profiler_config.end_step
+    def flops_profiler_profile_step(self):
+        return self._config.flops_profiler_config.profile_step
     def flops_profiler_module_depth(self):
         return self._config.flops_profiler_config.module_depth
@@ -289,6 +286,9 @@ class DeepSpeedEngine(Module):
     def flops_profiler_top_modules(self):
         return self._config.flops_profiler_config.top_modules
+    def flops_profiler_detailed(self):
+        return self._config.flops_profiler_config.detailed
     def memory_breakdown(self):
         return self._config.memory_breakdown
@@ -799,30 +799,11 @@ class DeepSpeedEngine(Module):
             **kwargs: variable length keyword arguments
         """
         if self.flops_profiler_enabled(
-        ) and self.global_steps == self.flops_profiler_start_step(
+        ) and self.global_steps == self.flops_profiler_profile_step(
         ) and self.global_rank == 0:
             self.flops_profiler = FlopsProfiler(self.module)
             self.flops_profiler.start_profile(ignore_list=None)
-        if self.flops_profiler_enabled(
-        ) and self.global_steps == self.flops_profiler_end_step(
-        ) and self.global_rank == 0:
-            print('{:<30} {:<8}'.format(
-                'Number of multiply-adds: ',
-                self.flops_profiler.get_total_flops(in_str=False)))
-            print('{:<30} {:<8}'.format(
-                'Number of parameters: ',
-                self.flops_profiler.get_total_params(in_str=False)))
-            print('{:<30} {:<8}'.format('Number of steps profiled: ',
-                                        self.flops_profiler.get_total_steps()))
-            self.flops_profiler.print_model_profile()
-            self.flops_profiler.print_model_aggregated_profile(
-                module_depth=self.flops_profiler_module_depth(),
-                top_modules=self.flops_profiler_top_modules())
-            self.flops_profiler.flops = self.flops_profiler.get_total_flops()
-            self.flops_profiler.params = self.flops_profiler.get_total_params()
-            self.flops_profiler.end_profile()
         if self.module.training and self.progressive_layer_drop:
             kwargs.update(self.progressive_layer_drop.get_state())
@@ -838,6 +819,16 @@ class DeepSpeedEngine(Module):
             self.timers('forward').stop()
             self.timers('forward_microstep').stop()
+        if self.flops_profiler_enabled(
+        ) and self.global_steps == self.flops_profiler_profile_step(
+        ) and self.global_rank == 0:
+            self.flops_profiler.print_model_profile(
+                profile_step=self.global_steps,
+                module_depth=self.flops_profiler_module_depth(),
+                top_modules=self.flops_profiler_top_modules(),
+                detailed=self.flops_profiler_detailed())
+            self.flops_profiler.end_profile()
         return loss
     def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
@@ -41,6 +41,7 @@ collections:
- 1Cycle.md
- lrrt.md
- zero.md
- flops-profiler.md
defaults:
- scope:
@@ -45,6 +45,8 @@ lnav:
url: /docs/config-json/#zero-optimizations-for-fp16-training
- title: "Logging"
url: /docs/config-json/#logging
- title: "Flops Profiler"
url: /docs/config-json/#flops-profiler
- title: "Activation checkpointing"
url: /docs/config-json/#activation-checkpointing
- title: "Sparse Attention"
@@ -84,5 +86,7 @@ lnav:
url: /tutorials/pipeline/
- title: "Progressive Layer Dropping"
url: /tutorials/progressive_layer_dropping/
- title: "Flops Profiler"
url: /tutorials/flops-profiler/
- title: "Contributing"
url: /contributing/
This diff is collapsed.
@@ -113,7 +113,7 @@ to contiguous buffers preventing memory fragmentation.
## ZeRO-Offload
ZeRO-Offload pushes the boundary of the maximum model size that can be trained efficiently using minimal GPU resources, by exploiting computational and memory resources on both GPUs and their host CPUs. It allows training up to 13-billion-parameter models on a single NVIDIA V100 GPU, 10x larger than the state-of-the-art, while retaining high training throughput of over 30 teraflops per GPU.
For more details see the [ZeRO-Offload release blog]( https://www.microsoft.com/en-us/research/?p=689370&secret=iSlooB), and [tutorial](/tutorials/zero-offload/) on integration with DeepSpeed.
@@ -133,7 +133,7 @@ micro-batch, specially when the number of micro-batches per effective batch is l
During back propagation, DeepSpeed can overlap the communication required for averaging
parameter gradients that have already been computed with the ongoing gradient computation.
This computation-communication overlap allows DeepSpeed to achieve higher throughput even
at modest batch sizes.
## Training Features
@@ -240,19 +240,53 @@ comes to data loading. Users simply provide a PyTorch dataset, and DeepSpeed dat
can automatically handle batch creation appropriately.
## Performance Analysis and Debugging
For performance debugging, DeepSpeed can give you a detailed breakdown of the time spent
in different parts of the training by simply enabling it in the `deepspeed_config`
file.
Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details.
DeepSpeed provides a set of tools for performance analysis and debugging.
### Wall Clock Breakdown
DeepSpeed provides a detailed breakdown of the time spent
in different parts of the training.
This can be enabled by setting the following in the `deepspeed_config` file.
```json
{
"wall_clock_breakdown": true,
}
```
### Timing Activation Checkpoint Functions
When activation checkpointing is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file.
```json
{
"activation_checkpointing": {
"profile": true
}
}
```
### Flops Profiler
The DeepSpeed flops profiler measures the time, flops and parameters of a PyTorch model and shows which modules or layers are the bottleneck. When used with the DeepSpeed runtime, the flops profiler can be configured in the `deepspeed_config` file as follows:
```json
{
"flops_profiler": {
"enabled": true,
"profile_step": 1,
"module_depth": -1,
"top_modules": 3,
"detailed": true,
}
}
```
The flops profiler can also be used as a standalone package. Please refer to the [Flops Profiler](/tutorials/flops-profiler) tutorial for more details.
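As a rough sketch of standalone usage, the snippet below follows the `get_model_profile` call used in the unit test updated by this commit; the `deepspeed.profiling.flops_profiler` import path and the toy model are assumptions for illustration only.

```python
import torch
import torch.nn as nn

# Assumed import path for the standalone profiler entry point.
from deepspeed.profiling.flops_profiler import get_model_profile

# A small hypothetical model; any nn.Module can be profiled the same way.
model = nn.Sequential(
    nn.Conv2d(1, 6, 5),            # 32x32 input -> 6x28x28 feature maps
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(6 * 28 * 28, 10),
)

# Profile a single forward pass on a dummy input shape (batch, channels, H, W).
macs, params = get_model_profile(
    model,
    (1, 1, 32, 32),
    print_profile=True,   # print the per-module profile to stdout
    detailed=True,        # include the detailed per-module breakdown
    module_depth=-1,      # profile modules at all nesting depths
    top_modules=3,        # report the top 3 modules in the aggregated profile
    warm_up=1,            # number of warm-up passes before measuring
    as_string=True,       # return human-readable strings instead of raw numbers
    ignore_modules=None,  # optionally skip specific module types
)
print(macs, params)
```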
## Sparse Attention
DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse-attention/) tutorial.
This diff is collapsed.
@@ -24,8 +24,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
         },
         "flops_profiler": {
             "enabled": True,
-            "start_step": 2,
-            "end_step": 3,
+            "step": 1,
             "module_depth": -1,
             "top_modules": 3,
         },
@@ -100,18 +99,17 @@ def test_flops_profiler_in_inference():
     mod = LeNet5(10)
     batch_size = 1024
     input = torch.randn(batch_size, 1, 32, 32)
-    macs, params, steps = get_model_profile(
+    macs, params = get_model_profile(
         mod,
         tuple(input.shape),
         print_profile=True,
-        print_aggregated_profile=True,
+        detailed=True,
         module_depth=-1,
         top_modules=3,
-        warm_up=5,
-        num_steps=10,
-        as_strings=True,
+        warm_up=1,
+        as_string=True,
         ignore_modules=None,
     )
-    print(macs, params, steps)
-    assert macs == "439.55 MMACs"
+    print(macs, params)
+    assert macs == "439.56 MMACs"
     assert params == "61.71 k"