未验证 提交 77a455c7 编写于 作者: C chenjian 提交者: GitHub

Fix profiler package bug (#40888)

* no

* fix bugs

* fix doc according to review

* fix api doc format

* fix api doc according to review

* fix bug and add unit test

* fix record event bug
上级 00448f0e
...@@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name, ...@@ -162,7 +162,7 @@ void RecordEvent::OriginalConstruct(const std::string &name,
void RecordEvent::End() { void RecordEvent::End() {
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) { if (g_enable_nvprof_hook && is_pushed_ && is_enabled_) {
dynload::nvtxRangePop(); dynload::nvtxRangePop();
} }
#endif #endif
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
// Used to filter events, works like glog VLOG(level). // Used to filter events, works like glog VLOG(level).
// RecordEvent will works if host_trace_level >= level. // RecordEvent will works if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 2, PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
"RecordEvent will works " "RecordEvent will works "
"if host_trace_level >= level."); "if host_trace_level >= level.");
......
...@@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -207,6 +207,228 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep=False, thread_sep=False,
time_unit='ms')) time_unit='ms'))
    def test_statistic_case2(self):
        """Statistics over a synthetic event tree containing communication.

        Builds a hand-crafted host/device node tree (one profiler step with
        forward, backward and optimization phases, plus explicit Communication
        events and nccl-named kernels), runs StatisticData over it, and checks
        every derived time-range, distributed and event summary value.
        """
        # ---- Host-side nodes ----
        # HostPythonNode args (as used here): name, event type, start_ns,
        # end_ns, process_id, thread_id.
        root_node = HostPythonNode('Root Node',
                                   profiler.TracerEventType.UserDefined, 0,
                                   float('inf'), 1000, 1001)
        profilerstep_node = HostPythonNode('ProfileStep#1',
                                           profiler.TracerEventType.ProfileStep,
                                           0, 400, 1000, 1001)
        dataloader_node = HostPythonNode(
            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
        mobilenet_node = HostPythonNode(
            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
        yolonet_node = HostPythonNode(
            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
        # UserDefined wrapper holding a communication launch + kernel.
        userdefined_node = HostPythonNode('Communication Time',
                                          profiler.TracerEventType.UserDefined,
                                          100, 110, 1000, 1001)
        reduce_all_launchkernel0 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
            1000, 1001)
        # DevicePythonNode args (as used here): name, event type, start_ns,
        # end_ns, device_id, context_id, stream_id.
        nccl_reduce_all_kernel0 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120,
            0, 0, 2)
        # Explicit Communication event nested under the user-defined node.
        communication_node = HostPythonNode(
            'Communication', profiler.TracerEventType.Communication, 105, 110,
            1000, 1001)
        reduce_all_op1 = HostPythonNode('cudalaunchkernel',
                                        profiler.TracerEventType.Operator, 105,
                                        108, 1000, 1001)
        reduce_all_launchkernel1 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107,
            1000, 1001)
        nccl_reduce_all_kernel1 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150,
            0, 0, 2)
        backward_node = HostPythonNode('Gradient Backward',
                                       profiler.TracerEventType.Backward, 120,
                                       200, 1000, 1001)
        optimization_node = HostPythonNode(
            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
            1000, 1001)
        conv2d_node = HostPythonNode(
            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
                                              profiler.TracerEventType.Operator,
                                              60, 100, 1000, 1001)
        conv2d_infer_shape = HostPythonNode(
            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
            30, 1000, 1001)
        conv2d_compute = HostPythonNode('conv2d::compute',
                                        profiler.TracerEventType.OperatorInner,
                                        30, 40, 1000, 1001)
        conv2d_launchkernel = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
            1000, 1001)
        conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
                                       profiler.TracerEventType.UserDefined, 35,
                                       40, 1000, 1001)
        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
                                           profiler.TracerEventType.CudaRuntime,
                                           35, 40, 1000, 1001)
        conv2d_kernel = DevicePythonNode(
            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
        conv2d_memcpy = DevicePythonNode(
            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
        sync_batch_norm_infer_shape = HostPythonNode(
            'sync_batch_norm::infer_shape',
            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
        sync_batch_norm_compute = HostPythonNode(
            'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
            80, 100, 1000, 1001)
        sync_batch_norm_launchkernel = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
            1000, 1001)
        sync_batch_norm_MemCpy = HostPythonNode(
            'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
            1001)
        sync_batch_norm_cudaMemCpy = HostPythonNode(
            'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
            1001)
        sync_batch_norm_kernel = DevicePythonNode(
            'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 300,
            0, 0, 0)
        sync_batch_norm_memcpy = DevicePythonNode(
            'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
            0, 0, 1)
        # 'reduce_all' is an Operator whose name matches a communication op,
        # so it must be classified as communication by DistributedSummary.
        reduce_all_node2 = HostPythonNode('reduce_all',
                                          profiler.TracerEventType.Operator,
                                          230, 250, 1000, 1001)
        reduce_all_launchkernel2 = HostPythonNode(
            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240,
            1000, 1001)
        nccl_reduce_all_kernel2 = DevicePythonNode(
            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280,
            0, 0, 2)
        # ---- Wire up the tree: children, runtime launches, device nodes ----
        root_node.children_node.append(profilerstep_node)
        profilerstep_node.children_node.extend([
            dataloader_node, mobilenet_node, yolonet_node, backward_node,
            optimization_node
        ])
        mobilenet_node.children_node.append(conv2d_node)
        yolonet_node.children_node.extend(
            [sync_batch_norm_node, userdefined_node])
        userdefined_node.children_node.append(communication_node)
        userdefined_node.runtime_node.append(reduce_all_launchkernel0)
        reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0)
        communication_node.children_node.append(reduce_all_op1)
        reduce_all_op1.runtime_node.append(reduce_all_launchkernel1)
        reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1)
        conv2d_node.children_node.extend(
            [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
        conv2d_compute.runtime_node.append(conv2d_launchkernel)
        conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
        conv2d_launchkernel.device_node.append(conv2d_kernel)
        conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
        sync_batch_norm_node.children_node.extend([
            sync_batch_norm_infer_shape, sync_batch_norm_compute,
            sync_batch_norm_MemCpy
        ])
        sync_batch_norm_compute.runtime_node.append(
            sync_batch_norm_launchkernel)
        sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
        sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
        sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
        optimization_node.children_node.append(reduce_all_node2)
        reduce_all_node2.runtime_node.append(reduce_all_launchkernel2)
        reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2)
        thread_tree = {'thread1001': root_node}
        extra_info = {
            'Process Cpu Utilization': '1.02',
            'System Cpu Utilization': '0.68'
        }
        # ---- Run the statistics pipeline over the synthetic tree ----
        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
                                                                   extra_info)
        time_range_summary = statistic_data.time_range_summary
        event_summary = statistic_data.event_summary
        distributed_summary = statistic_data.distributed_summary
        # ---- Per-event-type CPU/GPU time sums ----
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.ProfileStep), 400)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Forward), 100)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Backward), 80)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Optimization), 80)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Operator), 78)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.OperatorInner), 45)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.CudaRuntime), 38)
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(
                0, profiler.TracerEventType.Kernel), 220)
        self.assertEqual(
            time_range_summary.get_gpu_range_sum(
                0, profiler.TracerEventType.Memcpy), 60)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.UserDefined), 25)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Communication), 5)
        # ---- Distributed (communication vs computation) summary ----
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.cpu_communication_range), 25)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.gpu_communication_range), 65)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.communication_range), 85)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.computation_range), 220)
        self.assertEqual(
            profiler.statistic_helper.sum_ranges(
                distributed_summary.overlap_range), 85)
        # ---- Event summary (operator / model / memory views) ----
        self.assertEqual(len(event_summary.items), 4)
        self.assertEqual(len(event_summary.userdefined_items), 1)
        self.assertEqual(len(event_summary.model_perspective_items), 3)
        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].cpu_time, 100)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].gpu_time, 315)
        self.assertEqual(
            event_summary.model_perspective_items['Backward'].gpu_time, 0)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
        self.assertEqual(
            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
        # Smoke-check that the summary table can be rendered for this data.
        print(
            profiler.profiler_statistic._build_table(
                statistic_data,
                sorted_by=profiler.SortedKeys.CPUTotal,
                op_detail=True,
                thread_sep=False,
                time_unit='ms'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys ...@@ -29,17 +29,17 @@ from .profiler_statistic import StatisticData, _build_table, SortedKeys
class ProfilerState(Enum): class ProfilerState(Enum):
r""" r"""
Profiler state that can be specified to control profiler action. ProfilerState is used to present the state of :ref:`Profiler <api_paddle_profiler_Profiler>` .
CLOSED: The profilers are closed. The meaning of each ProfilerState is as following
READY: The profilers are open, but the data will not be recorded. - **ProfilerState.CLOSED** : The profiler is closed, and no profiling data will be recorded.
This state is used for reducing overhead influence when profilers start.
RECORD: The profilers are open, and the data will be recorded. - **ProfilerState.READY** : The profiler is open, but the data will not be recorded. This state is used for reducing overhead influence when profiler starts.
RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, - **ProfilerState.RECORD** : The profiler is open, and the data will be recorded.
the collected data will be returned.
- **ProfilerState.RECORD_AND_RETURN** : The profiler is open, and this state stands for the last batch of "RECORD" state in current profiling period. The collected data will be returned in this state.
""" """
CLOSED = 0 CLOSED = 0
READY = 1 READY = 1
...@@ -49,11 +49,13 @@ class ProfilerState(Enum): ...@@ -49,11 +49,13 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum): class ProfilerTarget(Enum):
r""" r"""
Target device for profiling. ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently.
The meaning of each ProfilerState is as following
CPU: Profile events on CPU. - **ProfilerTarget.CPU** : Profile events on CPU.
GPU: Profile events on GPU. - **ProfilerTarget.GPU** : Profile events on GPU.
""" """
CPU = 0 CPU = 0
GPU = 1 GPU = 1
...@@ -66,7 +68,7 @@ def make_scheduler(*, ...@@ -66,7 +68,7 @@ def make_scheduler(*,
repeat: int=0, repeat: int=0,
skip_first: int=0) -> Callable: skip_first: int=0) -> Callable:
r""" r"""
Return a scheduler function, which scheduler the state according to the setting. Return a scheduler function, which scheduler the :ref:`state <api_paddle_profiler_ProfilerState>` according to the setting.
The state transform confirms to: The state transform confirms to:
.. code-block:: text .. code-block:: text
...@@ -78,22 +80,23 @@ def make_scheduler(*, ...@@ -78,22 +80,23 @@ def make_scheduler(*,
- - - - - - - - - - - - - - - - - - - - - - - -
Note that repeat <= 0 means the cycle will continue until the profiler exits. Note that repeat <= 0 means the cycle will continue until the profiler exits.
Parameters: Args:
closed(int): The number of steps in state ProfilerState.CLOSED. closed(int): The number of steps in state ProfilerState.CLOSED.
ready(int): The number of steps in state ProfilerState.READY. ready(int): The number of steps in state ProfilerState.READY.
record(int): The number of steps in state ProfilerState.RECORD. record(int): The number of steps in state ProfilerState.RECORD, and the state in last step will be set as ProfilerState.RECORD_AND_RETURN.
repeat(int): The number of cycles to repeat above state transform. repeat(int, optional): The number of cycles to repeat above state transform. Default value is 0, which means it will repeat this cycle until profiler exits.
skip_first(int): The number of first steps to drop, not participate in the state transform. skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0.
Returns: Returns:
A scheduler function, conforms to above state transform setting. A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState.
Examples: Examples:
1. profiling range [2, 5] 1. profiling range [2, 5]
batch 0: closed, batch 1: ready, batch [2, 5] record Assume batch 0: closed, batch 1: ready, batch [2, 5] record
.. code-block:: python .. code-block:: python
:name: code-example1
import paddle.profiler as profiler import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
...@@ -101,9 +104,10 @@ def make_scheduler(*, ...@@ -101,9 +104,10 @@ def make_scheduler(*,
2. profiling range [3,6], [9,12], [15,18]... 2. profiling range [3,6], [9,12], [15,18]...
batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
.. code-block:: python .. code-block:: python
:name: code-example2
import paddle.profiler as profiler import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1) profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
...@@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str, ...@@ -148,15 +152,21 @@ def export_chrome_tracing(dir_name: str,
worker_name: Optional[str]=None) -> Callable: worker_name: Optional[str]=None) -> Callable:
r""" r"""
Return a callable, used for outputing tracing data to chrome tracing format file. Return a callable, used for outputing tracing data to chrome tracing format file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name. The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid]. if worker_name is not set, the default name is [hostname]_[pid].
Parameters: Args:
dir_name(str): Directory to save profiling data. dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file.
Examples: Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str, ...@@ -192,15 +202,21 @@ def export_chrome_tracing(dir_name: str,
def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
r""" r"""
Return a callable, used for outputing tracing data to protobuf file. Return a callable, used for outputing tracing data to protobuf file.
The output file will be saved in directory 'dir_name', and file name will be set as worker_name. The output file will be saved in directory ``dir_name``, and file name will be set as worker_name.
if worker_name is not set, the default name is [hostname]_[pid]. if worker_name is not set, the default name is [hostname]_[pid].
Parameters: Args:
dir_name(str): Directory to save profiling data. dir_name(str): Directory to save profiling data.
worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid].
Returns:
A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file.
Examples: Examples:
The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler <api_paddle_profiler_Profiler>` .
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -244,20 +260,21 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
class Profiler: class Profiler:
r""" r"""
Profiler context manager, user interface to manage profile process. Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table.
Parameters: Args:
targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` . targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` .
scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch). which means profiling range [start_batch, end_batch).
on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing.
This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing <api_paddle_profiler_export_chrome_tracing>` (./profiler_log/).
Examples: Examples:
1. profiling range [2, 5) 1. profiling range [2, 5).
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -272,6 +289,7 @@ class Profiler: ...@@ -272,6 +289,7 @@ class Profiler:
2. profiling range [2,4], [7, 9], [11,13] 2. profiling range [2,4], [7, 9], [11,13]
.. code-block:: python .. code-block:: python
:name: code-example2
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -286,6 +304,7 @@ class Profiler: ...@@ -286,6 +304,7 @@ class Profiler:
3. Use profiler without context manager, and use default parameters 3. Use profiler without context manager, and use default parameters
.. code-block:: python .. code-block:: python
:name: code-example3
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -367,6 +386,7 @@ class Profiler: ...@@ -367,6 +386,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example4
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -401,6 +421,7 @@ class Profiler: ...@@ -401,6 +421,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example5
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -437,6 +458,7 @@ class Profiler: ...@@ -437,6 +458,7 @@ class Profiler:
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example6
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -522,10 +544,16 @@ class Profiler: ...@@ -522,10 +544,16 @@ class Profiler:
def export(self, path="", format="json"): def export(self, path="", format="json"):
r""" r"""
Exports the tracing data in Chrome tracing data format. Exports the tracing data to file.
Args:
path(str): file path of the output.
format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json".
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example7
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -548,16 +576,17 @@ class Profiler: ...@@ -548,16 +576,17 @@ class Profiler:
thread_sep=False, thread_sep=False,
time_unit='ms'): time_unit='ms'):
r""" r"""
Print the Summary table. Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary.
Parameters: Args:
sorted_by(SortedKeys): how to rank the op table items. sorted_by( :ref:`SortedKeys <api_paddle_profiler_SortedKeys>` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal.
op_detail(bool): expand each operator detail information. op_detail(bool, optional): expand each operator detail information, default value is True.
thread_sep(bool): print op table each thread. thread_sep(bool, optional): print op table each thread, default value is False.
time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns'] time_unit(str, optional): time unit for display, can be chosen form ['s', 'ms', 'us', 'ns'], default value is 'ms'.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example8
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
......
...@@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc'] ...@@ -33,23 +33,25 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum): class SortedKeys(Enum):
r""" r"""
Sorted keys for printing summary table. SortedKeys is used to specify how to sort items when printing :ref:`summary <api_paddle_profiler_profiler_summary>` table.
CPUTotal: Sorted by CPU total time. The meaning of each SortedKeys is as following
CPUAvg: Sorted by CPU average time. - **SortedKeys.CPUTotal** : Sorted by CPU total time.
CPUMax: Sorted by CPU max time. - **SortedKeys.CPUAvg** : Sorted by CPU average time.
CPUMin: Sorted by CPU min time. - **SortedKeys.CPUMax** : Sorted by CPU max time.
GPUTotal: Sorted by GPU total time. - **SortedKeys.CPUMin** : Sorted by CPU min time.
GPUAvg: Sorted by GPU average time. - **SortedKeys.GPUTotal** : Sorted by GPU total time.
GPUMax: Sorted by GPU max time. - **SortedKeys.GPUAvg** : Sorted by GPU average time.
GPUMin: Sorted by GPU min time. - **SortedKeys.GPUMax** : Sorted by GPU max time.
- **SortedKeys.GPUMin** : Sorted by GPU min time.
""" """
CPUTotal = 0 CPUTotal = 0
CPUAvg = 1 CPUAvg = 1
...@@ -119,6 +121,23 @@ def traverse_tree(nodetrees): ...@@ -119,6 +121,23 @@ def traverse_tree(nodetrees):
return results return results
def get_device_nodes(hostnode):
    '''
    Collect every device node launched within the subtree rooted at hostnode.

    Walks the host-node tree iteratively (depth-first, stack based) and, for
    each visited host node, gathers the device nodes attached to its runtime
    (launch) nodes.
    '''
    collected = []
    pending = [hostnode]
    while pending:
        node = pending.pop()
        # Defer children; harvest this node's launched device events now.
        pending.extend(node.children_node)
        for launch in node.runtime_node:
            collected.extend(launch.device_node)
    return collected
def wrap_tree(nodetrees): def wrap_tree(nodetrees):
''' '''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
...@@ -186,16 +205,6 @@ class TimeRangeSummary: ...@@ -186,16 +205,6 @@ class TimeRangeSummary:
CPUTimeRange[hostnode.type].append( CPUTimeRange[hostnode.type].append(
(hostnode.start_ns, hostnode.end_ns)) (hostnode.start_ns, hostnode.end_ns))
self.call_times[hostnode.type] += 1 self.call_times[hostnode.type] += 1
if hostnode.type == TracerEventType.Operator and any([
name in hostnode.name for name in _CommunicationOpName
]): # special case, communication op
CPUTimeRange[TracerEventType.Communication].append(
(hostnode.start_ns, hostnode.end_ns))
self.call_times[TracerEventType.Communication] += 1
is_communication_node = (
hostnode.type == TracerEventType.Communication
) or (hostnode.type == TracerEventType.Operator and any(
[name in hostnode.name for name in _CommunicationOpName]))
for runtimenode in hostnode.runtime_node: for runtimenode in hostnode.runtime_node:
CPUTimeRange[runtimenode.type].append( CPUTimeRange[runtimenode.type].append(
(runtimenode.start_ns, runtimenode.end_ns)) (runtimenode.start_ns, runtimenode.end_ns))
...@@ -205,12 +214,6 @@ class TimeRangeSummary: ...@@ -205,12 +214,6 @@ class TimeRangeSummary:
devicenode.stream_id].append( devicenode.stream_id].append(
(devicenode.start_ns, devicenode.end_ns)) (devicenode.start_ns, devicenode.end_ns))
self.call_times[devicenode.type] += 1 self.call_times[devicenode.type] += 1
if is_communication_node: # gpu activity for communication node
GPUTimeRange[devicenode.device_id][
TracerEventType.Communication][
devicenode.stream_id].append((
devicenode.start_ns, devicenode.end_ns))
self.call_times[TracerEventType.Communication] += 1
for event_type, time_ranges in CPUTimeRange.items(): for event_type, time_ranges in CPUTimeRange.items():
time_ranges = merge_self_ranges(time_ranges, is_sorted=False) time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
...@@ -243,6 +246,74 @@ class TimeRangeSummary: ...@@ -243,6 +246,74 @@ class TimeRangeSummary:
return self.CPUTimeRangeSum[event_type] return self.CPUTimeRangeSum[event_type]
class DistributedSummary:
    r"""
    Analyze communication and computation time ranges, and their overlap.

    Communication time covers explicit Communication events, Operator events
    whose names match known communication ops, and nccl-named kernels.
    Computation time is all remaining kernel time.
    """

    def __init__(self):
        # Host-side ranges of communication events, (start_ns, end_ns) pairs.
        self.cpu_communication_range = []
        # Device-side (kernel) ranges attributed to communication.
        self.gpu_communication_range = []
        # Union of CPU and GPU communication ranges.
        self.communication_range = []
        # Device-side ranges of non-communication kernels.
        self.computation_range = []
        # Intersection of communication and computation ranges.
        self.overlap_range = []

    def parse(self, nodetrees):
        '''
        Collect all communication and computation time ranges.

        Args:
            nodetrees: dict mapping thread id to the root host node of that
                thread's event tree (same structure accepted by traverse_tree).
        '''
        thread2hostnodes = traverse_tree(nodetrees)
        for threadid, hostnodes in thread2hostnodes.items():
            for hostnode in hostnodes[1:]:  # skip root node
                # A host node counts as communication when it is either an
                # explicit Communication event, or an Operator whose name
                # contains a known communication op name. (The two former
                # duplicated branches are merged into this single predicate.)
                is_communication = (
                    hostnode.type == TracerEventType.Communication) or (
                        hostnode.type == TracerEventType.Operator and any(
                            name in hostnode.name.lower()
                            for name in _CommunicationOpName))
                if is_communication:
                    self.cpu_communication_range.append(
                        (hostnode.start_ns, hostnode.end_ns))
                    # Every kernel launched anywhere in this node's subtree
                    # belongs to GPU communication time.
                    for device_node in get_device_nodes(hostnode):
                        if device_node.type == TracerEventType.Kernel:
                            self.gpu_communication_range.append(
                                (device_node.start_ns, device_node.end_ns))
                else:
                    # Other host nodes: split their directly-launched kernels
                    # into communication (nccl-named) vs computation.
                    for runtimenode in hostnode.runtime_node:
                        for devicenode in runtimenode.device_node:
                            if devicenode.type == TracerEventType.Kernel:
                                if 'nccl' in devicenode.name.lower():
                                    self.gpu_communication_range.append(
                                        (devicenode.start_ns,
                                         devicenode.end_ns))
                                else:
                                    self.computation_range.append(
                                        (devicenode.start_ns,
                                         devicenode.end_ns))
        # Normalize: merge overlapping ranges, then derive union/intersection.
        self.cpu_communication_range = merge_self_ranges(
            self.cpu_communication_range, is_sorted=False)
        self.gpu_communication_range = merge_self_ranges(
            self.gpu_communication_range, is_sorted=False)
        self.communication_range = merge_ranges(
            self.cpu_communication_range,
            self.gpu_communication_range,
            is_sorted=True)
        self.computation_range = merge_self_ranges(
            self.computation_range, is_sorted=False)
        self.overlap_range = intersection_ranges(
            self.communication_range, self.computation_range, is_sorted=True)
class EventSummary: class EventSummary:
r""" r"""
Analyse operator event in profiling data, correlate with its device event. Analyse operator event in profiling data, correlate with its device event.
...@@ -473,8 +544,10 @@ class StatisticData: ...@@ -473,8 +544,10 @@ class StatisticData:
self.extra_info = extra_info self.extra_info = extra_info
self.time_range_summary = TimeRangeSummary() self.time_range_summary = TimeRangeSummary()
self.event_summary = EventSummary() self.event_summary = EventSummary()
self.distributed_summary = DistributedSummary()
self.time_range_summary.parse(node_trees) self.time_range_summary.parse(node_trees)
self.event_summary.parse(node_trees) self.event_summary.parse(node_trees)
self.distributed_summary.parse(node_trees)
def _build_table(statistic_data, def _build_table(statistic_data,
...@@ -610,7 +683,11 @@ def _build_table(statistic_data, ...@@ -610,7 +683,11 @@ def _build_table(statistic_data,
gpu_type_time = collections.defaultdict(int) gpu_type_time = collections.defaultdict(int)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
): ):
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
gpu_time_range = collections.defaultdict(list) gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
...@@ -620,6 +697,9 @@ def _build_table(statistic_data, ...@@ -620,6 +697,9 @@ def _build_table(statistic_data,
gpu_time_range[event_type], time_range, is_sorted=True) gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items(): for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range) gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
sorted_items = sorted( sorted_items = sorted(
cpu_type_time.items(), key=lambda x: x[1], reverse=True) cpu_type_time.items(), key=lambda x: x[1], reverse=True)
...@@ -735,7 +815,7 @@ def _build_table(statistic_data, ...@@ -735,7 +815,7 @@ def _build_table(statistic_data,
append('') append('')
###### Print Distribution Summary Report ###### ###### Print Distribution Summary Report ######
if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange: if statistic_data.distributed_summary.communication_range:
headers = [ headers = [
'Name', 'Name',
'Total Time', 'Total Time',
...@@ -759,33 +839,12 @@ def _build_table(statistic_data, ...@@ -759,33 +839,12 @@ def _build_table(statistic_data,
append(header_sep) append(header_sep)
append(row_format.format(*headers)) append(row_format.format(*headers))
append(header_sep) append(header_sep)
cpu_communication_time_range = [] communication_time = sum_ranges(
gpu_communication_time_range = [] statistic_data.distributed_summary.communication_range)
cpu_communication_time_range = merge_ranges( computation_time = sum_ranges(
statistic_data.time_range_summary.CPUTimeRange[ statistic_data.distributed_summary.computation_range)
TracerEventType.Communication], cpu_communication_time_range) overlap_time = sum_ranges(
kernel_time_range = [] statistic_data.distributed_summary.overlap_range)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
kernel_time_range = merge_ranges(
device_time_ranges[TracerEventType.Kernel],
kernel_time_range,
is_sorted=True)
gpu_communication_time_range = merge_ranges(
device_time_ranges[TracerEventType.Communication],
gpu_communication_time_range,
is_sorted=True)
communication_time_range = merge_ranges(
cpu_communication_time_range,
gpu_communication_time_range,
is_sorted=True)
computation_time_range = subtract_ranges(kernel_time_range,
gpu_communication_time_range)
overlap_time_range = intersection_ranges(communication_time_range,
computation_time_range)
communication_time = sum_ranges(communication_time_range)
computation_time = sum_ranges(computation_time_range)
overlap_time = sum_ranges(overlap_time_range)
row_values = [ row_values = [
'Communication', format_time( 'Communication', format_time(
communication_time, unit=time_unit), communication_time, unit=time_unit),
...@@ -808,9 +867,9 @@ def _build_table(statistic_data, ...@@ -808,9 +867,9 @@ def _build_table(statistic_data,
append(row_format.format(*row_values)) append(row_format.format(*row_values))
append(header_sep) append(header_sep)
append( append(
"Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n" "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, substract kernels belong to communication op.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersect with computation time.\n" "Overlap time: Communication time intersects with computation time.\n"
"Example:\n" "Example:\n"
"Communication:\n" "Communication:\n"
" CPU: |_________________|\n" " CPU: |_________________|\n"
...@@ -1001,10 +1060,14 @@ def _build_table(statistic_data, ...@@ -1001,10 +1060,14 @@ def _build_table(statistic_data,
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)' 'GPU Total / Avg / Max / Min / Ratio(%)'
] ]
name_column_width = 0
for name, item in statistic_data.event_summary.memory_manipulation_items.items(
):
name_column_width = max(len(name), name_column_width)
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
name_column_width = 30 name_column_width = min(name_column_width, 45)
add_column(name_column_width) add_column(name_column_width)
add_column(6) add_column(6)
add_column(40) add_column(40)
...@@ -1059,7 +1122,7 @@ def _build_table(statistic_data, ...@@ -1059,7 +1122,7 @@ def _build_table(statistic_data,
row_format_list = [""] row_format_list = [""]
header_sep_list = [""] header_sep_list = [""]
line_length_list = [-SPACING_SIZE] line_length_list = [-SPACING_SIZE]
name_column_width = 30 name_column_width = 35
add_column(name_column_width) add_column(name_column_width)
add_column(6) add_column(6)
add_column(40) add_column(40)
......
...@@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False): ...@@ -96,6 +96,7 @@ def merge_ranges(range_list1, range_list2, is_sorted=False):
else: else:
indx1 += 1 indx1 += 1
while indx2 < len2: while indx2 < len2:
range2 = range_list2[indx2]
if range2[1] > merged_ranges[-1][1]: if range2[1] > merged_ranges[-1][1]:
if range2[0] <= merged_ranges[-1][1]: if range2[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
......
...@@ -17,8 +17,8 @@ from warnings import warn ...@@ -17,8 +17,8 @@ from warnings import warn
import functools import functools
from contextlib import ContextDecorator from contextlib import ContextDecorator
from paddle.fluid import core
from paddle.fluid.core import (_RecordEvent, TracerEventType) from paddle.fluid.core import (_RecordEvent, TracerEventType)
import paddle.fluid.core as core
_AllowedEventTypeList = [ _AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.Dataloader, TracerEventType.ProfileStep,
...@@ -30,13 +30,15 @@ _AllowedEventTypeList = [ ...@@ -30,13 +30,15 @@ _AllowedEventTypeList = [
class RecordEvent(ContextDecorator): class RecordEvent(ContextDecorator):
r""" r"""
Interface for recording a time range. Interface for recording a time range by user defined.
Parameters: Args:
name(str): Name of the record event name(str): Name of the record event
event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example1
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator): ...@@ -53,8 +55,8 @@ class RecordEvent(ContextDecorator):
result = data1 + data2 result = data1 + data2
record_event.end() record_event.end()
Note: **Note**:
RecordEvent will take effect only when profiler is on and at the state of RECORD. RecordEvent will take effect only when :ref:`Profiler <api_paddle_profiler_Profiler>` is on and at the state of RECORD.
""" """
def __init__(self, def __init__(self,
...@@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator): ...@@ -76,6 +78,7 @@ class RecordEvent(ContextDecorator):
Record the time of begining. Record the time of begining.
.. code-block:: python .. code-block:: python
:name: code-example2
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator): ...@@ -100,6 +103,7 @@ class RecordEvent(ContextDecorator):
Record the time of ending. Record the time of ending.
.. code-block:: python .. code-block:: python
:name: code-example3
import paddle import paddle
import paddle.profiler as profiler import paddle.profiler as profiler
...@@ -118,14 +122,15 @@ def load_profiler_result(filename: str): ...@@ -118,14 +122,15 @@ def load_profiler_result(filename: str):
r""" r"""
Load dumped profiler data back to memory. Load dumped profiler data back to memory.
Parameters: Args:
filename(str): Name of the exported protobuf file of profiler data. filename(str): Name of the exported protobuf file of profiler data.
Returns: Returns:
ProfilerResult object. ProfilerResult object, which stores profiling data.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example1
# required: gpu # required: gpu
import paddle.profiler as profiler import paddle.profiler as profiler
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册