Unverified commit e2dfe0d1 authored by Cheng Li, committed by GitHub

Add flops profiler tutorial (#682)

* work on flops profiler tutorial

* update flops profiler tutorial

* add flops profiler tutorial and fix names

* work on flops profiler tutorial

* update flops profiler tutorial

* add flops profiler tutorial and fix names

* fix tailing ws

* fix names

* remove multistep profiling and update docs

* fix cases where functionals and submodules coexist in a parent module, update readme

* fix typo

* always invoke post hook function

* fix module flops sum and update tests

* update tutorial
Parent 6ee3b296
@@ -15,8 +15,7 @@ class DeepSpeedFlopsProfilerConfig(object):
         super(DeepSpeedFlopsProfilerConfig, self).__init__()
         self.enabled = None
-        self.start_step = None
-        self.end_step = None
+        self.profile_step = None
         self.module_depth = None
         self.top_modules = None
@@ -35,13 +34,9 @@ class DeepSpeedFlopsProfilerConfig(object):
                                         FLOPS_PROFILER_ENABLED,
                                         FLOPS_PROFILER_ENABLED_DEFAULT)
-        self.start_step = get_scalar_param(flops_profiler_dict,
-                                           FLOPS_PROFILER_START_STEP,
-                                           FLOPS_PROFILER_START_STEP_DEFAULT)
-        self.end_step = get_scalar_param(flops_profiler_dict,
-                                         FLOPS_PROFILER_END_STEP,
-                                         FLOPS_PROFILER_END_STEP_DEFAULT)
+        self.profile_step = get_scalar_param(flops_profiler_dict,
+                                             FLOPS_PROFILER_PROFILE_STEP,
+                                             FLOPS_PROFILER_PROFILE_STEP_DEFAULT)
         self.module_depth = get_scalar_param(flops_profiler_dict,
                                              FLOPS_PROFILER_MODULE_DEPTH,
@@ -50,3 +45,7 @@ class DeepSpeedFlopsProfilerConfig(object):
         self.top_modules = get_scalar_param(flops_profiler_dict,
                                             FLOPS_PROFILER_TOP_MODULES,
                                             FLOPS_PROFILER_TOP_MODULES_DEFAULT)
+        self.detailed = get_scalar_param(flops_profiler_dict,
+                                         FLOPS_PROFILER_DETAILED,
+                                         FLOPS_PROFILER_DETAILED_DEFAULT)
@@ -12,11 +12,11 @@ FLOPS_PROFILER_FORMAT = '''
 flops profiler should be enabled as:
 "session_params": {
   "flops_profiler": {
-    "enalbe": [true|false],
-    "start_step": 5,
-    "end_step": 6,
+    "enabled": true,
+    "profile_step": 1,
     "module_depth": -1,
     "top_modules": 3,
+    "detailed": true,
   }
 }
 '''
@@ -26,14 +26,14 @@ FLOPS_PROFILER = "flops_profiler"
 FLOPS_PROFILER_ENABLED = "enabled"
 FLOPS_PROFILER_ENABLED_DEFAULT = False
-FLOPS_PROFILER_START_STEP = "start_step"
-FLOPS_PROFILER_START_STEP_DEFAULT = 5
-FLOPS_PROFILER_END_STEP = "end_step"
-FLOPS_PROFILER_END_STEP_DEFAULT = FLOPS_PROFILER_START_STEP_DEFAULT + 1
+FLOPS_PROFILER_PROFILE_STEP = "profile_step"
+FLOPS_PROFILER_PROFILE_STEP_DEFAULT = 1
 FLOPS_PROFILER_MODULE_DEPTH = "module_depth"
 FLOPS_PROFILER_MODULE_DEPTH_DEFAULT = -1
 FLOPS_PROFILER_TOP_MODULES = "top_modules"
 FLOPS_PROFILER_TOP_MODULES_DEFAULT = 3
+FLOPS_PROFILER_DETAILED = "detailed"
+FLOPS_PROFILER_DETAILED_DEFAULT = True
@@ -277,11 +277,8 @@ class DeepSpeedEngine(Module):
     def flops_profiler_enabled(self):
         return self._config.flops_profiler_config.enabled
-    def flops_profiler_start_step(self):
-        return self._config.flops_profiler_config.start_step
-    def flops_profiler_end_step(self):
-        return self._config.flops_profiler_config.end_step
+    def flops_profiler_profile_step(self):
+        return self._config.flops_profiler_config.profile_step
     def flops_profiler_module_depth(self):
         return self._config.flops_profiler_config.module_depth
@@ -289,6 +286,9 @@ class DeepSpeedEngine(Module):
     def flops_profiler_top_modules(self):
         return self._config.flops_profiler_config.top_modules
+    def flops_profiler_detailed(self):
+        return self._config.flops_profiler_config.detailed
     def memory_breakdown(self):
         return self._config.memory_breakdown
@@ -799,30 +799,11 @@ class DeepSpeedEngine(Module):
             **kwargs: variable length keyword arguments
         """
         if self.flops_profiler_enabled(
-        ) and self.global_steps == self.flops_profiler_start_step(
+        ) and self.global_steps == self.flops_profiler_profile_step(
         ) and self.global_rank == 0:
             self.flops_profiler = FlopsProfiler(self.module)
             self.flops_profiler.start_profile(ignore_list=None)
-        if self.flops_profiler_enabled(
-        ) and self.global_steps == self.flops_profiler_end_step(
-        ) and self.global_rank == 0:
-            print('{:<30} {:<8}'.format(
-                'Number of multiply-adds: ',
-                self.flops_profiler.get_total_flops(in_str=False)))
-            print('{:<30} {:<8}'.format(
-                'Number of parameters: ',
-                self.flops_profiler.get_total_params(in_str=False)))
-            print('{:<30} {:<8}'.format('Number of steps profiled: ',
-                                        self.flops_profiler.get_total_steps()))
-            self.flops_profiler.print_model_profile()
-            self.flops_profiler.print_model_aggregated_profile(
-                module_depth=self.flops_profiler_module_depth(),
-                top_modules=self.flops_profiler_top_modules())
-            self.flops_profiler.flops = self.flops_profiler.get_total_flops()
-            self.flops_profiler.params = self.flops_profiler.get_total_params()
-            self.flops_profiler.end_profile()
         if self.module.training and self.progressive_layer_drop:
             kwargs.update(self.progressive_layer_drop.get_state())
@@ -838,6 +819,16 @@ class DeepSpeedEngine(Module):
             self.timers('forward').stop()
             self.timers('forward_microstep').stop()
+        if self.flops_profiler_enabled(
+        ) and self.global_steps == self.flops_profiler_profile_step(
+        ) and self.global_rank == 0:
+            self.flops_profiler.print_model_profile(
+                profile_step=self.global_steps,
+                module_depth=self.flops_profiler_module_depth(),
+                top_modules=self.flops_profiler_top_modules(),
+                detailed=self.flops_profiler_detailed())
+            self.flops_profiler.end_profile()
         return loss
     def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
@@ -41,6 +41,7 @@ collections:
- 1Cycle.md
- lrrt.md
- zero.md
- flops-profiler.md
defaults:
- scope:
@@ -45,6 +45,8 @@ lnav:
url: /docs/config-json/#zero-optimizations-for-fp16-training
- title: "Logging"
url: /docs/config-json/#logging
- title: "Flops Profiler"
url: /docs/config-json/#flops-profiler
- title: "Activation checkpointing"
url: /docs/config-json/#activation-checkpointing
- title: "Sparse Attention"
@@ -84,5 +86,7 @@ lnav:
url: /tutorials/pipeline/
- title: "Progressive Layer Dropping"
url: /tutorials/progressive_layer_dropping/
- title: "Flops Profiler"
url: /tutorials/flops-profiler/
- title: "Contributing"
url: /contributing/
This diff is collapsed.
@@ -113,7 +113,7 @@ to contiguous buffers preventing memory fragmentation.
## ZeRO-Offload
ZeRO-Offload pushes the boundary of the maximum model size that can be trained efficiently using minimal GPU resources, by exploiting computational and memory resources on both GPUs and their host CPUs. It allows training up to 13-billion-parameter models on a single NVIDIA V100 GPU, 10x larger than the state-of-the-art, while retaining high training throughput of over 30 teraflops per GPU.
For more details see the [ZeRO-Offload release blog]( https://www.microsoft.com/en-us/research/?p=689370&secret=iSlooB), and [tutorial](/tutorials/zero-offload/) on integration with DeepSpeed.
@@ -133,7 +133,7 @@ micro-batch, specially when the number of micro-batches per effective batch is l
During back propagation, DeepSpeed can overlap the communication required for averaging
parameter gradients that have already been computed with the ongoing gradient computation.
This computation-communication overlap allows DeepSpeed to achieve higher throughput even
at modest batch sizes.
## Training Features
@@ -240,19 +240,53 @@ comes to data loading. Users simply provide a PyTorch dataset, and DeepSpeed dat
can automatically handle batch creation appropriately.
## Performance Analysis and Debugging
For performance debugging, DeepSpeed can give you a detailed breakdown of the time spent
in different parts of the training by simply enabling it in the `deepspeed_config`
file.
Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details.
DeepSpeed provides a set of tools for performance analysis and debugging.
### Wall Clock Breakdown
DeepSpeed provides a detailed breakdown of the time spent
in different parts of the training.
This can be enabled by setting the following in the `deepspeed_config` file.
```json
{
"wall_clock_breakdown": true,
}
```
### Timing Activation Checkpoint Functions
When activation checkpointing is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file.
```json
{
"activation_checkpointing": {
"profile": true
}
}
```
### Flops Profiler
The DeepSpeed flops profiler measures the time, flops and parameters of a PyTorch model and shows which modules or layers are the bottleneck. When used with the DeepSpeed runtime, the flops profiler can be configured in the `deepspeed_config` file as follows:
```json
{
"flops_profiler": {
"enabled": true,
"profile_step": 1,
"module_depth": -1,
"top_modules": 3,
"detailed": true,
}
}
```
The flops profiler can also be used as a standalone package. Please refer to the [Flops Profiler](/tutorials/flops-profiler) tutorial for more details.
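As a rough sketch of standalone usage, the snippet below follows the `get_model_profile` call used in the unit test updated by this commit; the `deepspeed.profiling.flops_profiler` import path and the toy model are assumptions for illustration only.

```python
import torch
import torch.nn as nn

# Assumed import path for the standalone profiler entry point.
from deepspeed.profiling.flops_profiler import get_model_profile

# A small hypothetical model; any nn.Module can be profiled the same way.
model = nn.Sequential(
    nn.Conv2d(1, 6, 5),            # 32x32 input -> 6x28x28 feature maps
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(6 * 28 * 28, 10),
)

# Profile a single forward pass on a dummy input shape (batch, channels, H, W).
macs, params = get_model_profile(
    model,
    (1, 1, 32, 32),
    print_profile=True,   # print the per-module profile to stdout
    detailed=True,        # include the detailed per-module breakdown
    module_depth=-1,      # profile modules at all nesting depths
    top_modules=3,        # report the top 3 modules in the aggregated profile
    warm_up=1,            # number of warm-up passes before measuring
    as_string=True,       # return human-readable strings instead of raw numbers
    ignore_modules=None,  # optionally skip specific module types
)
print(macs, params)
```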
## Sparse Attention
DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse-attention/) tutorial.
This diff is collapsed.
@@ -24,8 +24,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
         },
         "flops_profiler": {
             "enabled": True,
-            "start_step": 2,
-            "end_step": 3,
+            "step": 1,
             "module_depth": -1,
             "top_modules": 3,
         },
@@ -100,18 +99,17 @@ def test_flops_profiler_in_inference():
     mod = LeNet5(10)
     batch_size = 1024
     input = torch.randn(batch_size, 1, 32, 32)
-    macs, params, steps = get_model_profile(
+    macs, params = get_model_profile(
         mod,
         tuple(input.shape),
         print_profile=True,
-        print_aggregated_profile=True,
+        detailed=True,
         module_depth=-1,
         top_modules=3,
-        warm_up=5,
-        num_steps=10,
-        as_strings=True,
+        warm_up=1,
+        as_string=True,
         ignore_modules=None,
     )
-    print(macs, params, steps)
-    assert macs == "439.55 MMACs"
+    print(macs, params)
+    assert macs == "439.56 MMACs"
     assert params == "61.71 k"