Add profiler statistic (#40249)

* add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. * reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage * add profiler code * add statistic code * reduce content for pr

Add profiler statistic (#40249)
* add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. * reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage * add profiler code * add statistic code * reduce content for pr
c1d81ec1 · chenjian · GitHub · 3a77d027 · c1d81ec1 · c1d81ec1
2 changed file
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+import paddle.profiler as profiler
+
+
+class HostPythonNode:
+    def __init__(self, name, type, start_ns, end_ns, process_id, thread_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.process_id = process_id
+        self.thread_id = thread_id
+        self.children_node = []
+        self.runtime_node = []
+        self.device_node = []
+
+
+class DevicePythonNode:
+    def __init__(self, name, type, start_ns, end_ns, device_id, context_id,
+                 stream_id):
+        self.name = name
+        self.type = type
+        self.start_ns = start_ns
+        self.end_ns = end_ns
+        self.device_id = device_id
+        self.context_id = context_id
+        self.stream_id = stream_id
+
+
+class TestProfilerStatistic(unittest.TestCase):
+    def test_statistic_case1(self):
+        root_node = HostPythonNode('Root Node',
+                                   profiler.TracerEventType.UserDefined, 0,
+                                   float('inf'), 1000, 1001)
+        profilerstep_node = HostPythonNode('ProfileStep#1',
+                                           profiler.TracerEventType.ProfileStep,
+                                           0, 400, 1000, 1001)
+        dataloader_node = HostPythonNode(
+            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+        mobilenet_node = HostPythonNode(
+            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        yolonet_node = HostPythonNode(
+            'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
+        backward_node = HostPythonNode('Gradient Backward',
+                                       profiler.TracerEventType.Backward, 120,
+                                       200, 1000, 1001)
+        optimization_node = HostPythonNode(
+            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
+            1000, 1001)
+        conv2d_node = HostPythonNode(
+            'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001)
+        sync_batch_norm_node = HostPythonNode('sync_batch_norm',
+                                              profiler.TracerEventType.Operator,
+                                              60, 100, 1000, 1001)
+        conv2d_infer_shape = HostPythonNode(
+            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
+            30, 1000, 1001)
+        conv2d_compute = HostPythonNode('conv2d::compute',
+                                        profiler.TracerEventType.OperatorInner,
+                                        30, 40, 1000, 1001)
+        conv2d_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35,
+            1000, 1001)
+        conv2d_MemCpy = HostPythonNode('AsyncMemcpy',
+                                       profiler.TracerEventType.UserDefined, 35,
+                                       40, 1000, 1001)
+        conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy',
+                                           profiler.TracerEventType.CudaRuntime,
+                                           35, 40, 1000, 1001)
+        conv2d_kernel = DevicePythonNode(
+            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0)
+        conv2d_memcpy = DevicePythonNode(
+            'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0)
+        sync_batch_norm_infer_shape = HostPythonNode(
+            'sync_batch_norm::infer_shape',
+            profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001)
+        sync_batch_norm_compute = HostPythonNode(
+            'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner,
+            80, 100, 1000, 1001)
+        sync_batch_norm_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90,
+            1000, 1001)
+        sync_batch_norm_MemCpy = HostPythonNode(
+            'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000,
+            1001)
+        sync_batch_norm_cudaMemCpy = HostPythonNode(
+            'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000,
+            1001)
+        sync_batch_norm_kernel = DevicePythonNode(
+            'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155,
+            0, 0, 0)
+        sync_batch_norm_memcpy = DevicePythonNode(
+            'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
+            0, 0, 1)
+        root_node.children_node.append(profilerstep_node)
+        profilerstep_node.children_node.extend([
+            dataloader_node, mobilenet_node, yolonet_node, backward_node,
+            optimization_node
+        ])
+        mobilenet_node.children_node.append(conv2d_node)
+        yolonet_node.children_node.append(sync_batch_norm_node)
+        conv2d_node.children_node.extend(
+            [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
+        conv2d_compute.runtime_node.append(conv2d_launchkernel)
+        conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy)
+        conv2d_launchkernel.device_node.append(conv2d_kernel)
+        conv2d_cudaMemCpy.device_node.append(conv2d_memcpy)
+        sync_batch_norm_node.children_node.extend([
+            sync_batch_norm_infer_shape, sync_batch_norm_compute,
+            sync_batch_norm_MemCpy
+        ])
+        sync_batch_norm_compute.runtime_node.append(
+            sync_batch_norm_launchkernel)
+        sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
+        sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
+        sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
+        thread_tree = {'thread1001': root_node}
+        extra_info = {
+            'Process Cpu Utilization': '1.02',
+            'System Cpu Utilization': '0.68'
+        }
+        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+                                                                   extra_info)
+        time_range_summary = statistic_data.time_range_summary
+        event_summary = statistic_data.event_summary
+
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.ProfileStep), 400)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Forward), 90)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Backward), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Optimization), 80)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Operator), 55)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.OperatorInner), 45)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.CudaRuntime), 30)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Kernel), 75)
+        self.assertEqual(
+            time_range_summary.get_gpu_range_sum(
+                0, profiler.TracerEventType.Memcpy), 60)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.UserDefined), 15)
+        self.assertEqual(len(event_summary.items), 2)
+        self.assertEqual(len(event_summary.userdefined_items), 0)
+        self.assertEqual(len(event_summary.model_perspective_items), 3)
+        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
+        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
+        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].cpu_time, 90)
+        self.assertEqual(
+            event_summary.model_perspective_items['Forward'].gpu_time, 135)
+        self.assertEqual(
+            event_summary.model_perspective_items['Backward'].gpu_time, 0)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
+        self.assertEqual(
+            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
+        print(
+            profiler.profiler_statistic._build_table(
+                statistic_data,
+                sorted_by=profiler.SortedKeys.CPUTotal,
+                op_detail=True,
+                thread_sep=False,
+                time_unit='ms'))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -16,6 +16,20 @@ from enum import Enum

 from paddle.fluid.core import TracerEventType

+from .statistic_helper import *
+
+_AllTracerEventType = [
+    TracerEventType.Operator, TracerEventType.Dataloader,
+    TracerEventType.ProfileStep, TracerEventType.CudaRuntime,
+    TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset,
+    TracerEventType.UserDefined, TracerEventType.OperatorInner,
+    TracerEventType.Forward, TracerEventType.Backward,
+    TracerEventType.Optimization, TracerEventType.Communication,
+    TracerEventType.PythonOp, TracerEventType.PythonUserDefined
+]
+
+_CommunicationOpName = ['reduce', 'broadcast', 'rpc']
+

 class SortedKeys(Enum):
    r"""
@@ -29,3 +43,782 @@ class SortedKeys(Enum):
    GPUAvg = 5
    GPUMax = 6
    GPUMin = 7
+
+
+class HostStatisticNode:
+    r'''
+    Wrap original node for calculating statistic metrics.
+    '''
+
+    def __init__(self, hostnode):
+        self.hostnode = hostnode
+        self.children_node = []
+        self.runtime_node = []
+        self.cpu_time = 0
+        self.self_cpu_time = 0
+        self.gpu_time = 0
+        self.self_gpu_time = 0
+
+    def cal_statistic(self):
+        for child in self.children_node:
+            child.cal_statistic()
+        for rt in self.runtime_node:
+            rt.cal_statistic()
+
+        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
+        for child in self.children_node:
+            self.gpu_time += child.gpu_time
+            self.self_cpu_time -= (child.end_ns - child.start_ns)
+        for rt in self.runtime_node:
+            self.self_cpu_time -= (rt.end_ns - rt.start_ns)
+            self.gpu_time += rt.gpu_time
+            self.self_gpu_time += rt.gpu_time
+        for device in self.hostnode.device_node:
+            self.gpu_time += (device.end_ns - device.start_ns)
+            self.self_gpu_time += (device.end_ns - device.start_ns)
+
+    @property
+    def end_ns(self):
+        return self.hostnode.end_ns
+
+    @property
+    def start_ns(self):
+        return self.hostnode.start_ns
+
+    def __getattr__(self, name):
+        return getattr(self.hostnode, name)
+
+
+def traverse_tree(nodetrees):
+    results = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        threadlist = results[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+    return results
+
+
+def wrap_tree(nodetrees):
+    '''
+    Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
+    '''
+    node_statistic_tree = {}
+    results = collections.defaultdict(list)
+    newresults = collections.defaultdict(list)
+    for thread_id, rootnode in nodetrees.items():
+        stack = []
+        stack.append(rootnode)
+        root_statistic_node = HostStatisticNode(rootnode)
+        newstack = []
+        newstack.append(root_statistic_node)
+        node_statistic_tree[thread_id] = root_statistic_node
+        threadlist = results[thread_id]
+        newthreadlist = newresults[thread_id]
+        while stack:
+            current_node = stack.pop()
+            threadlist.append(current_node)
+            current_statistic_node = newstack.pop()
+            newthreadlist.append(current_statistic_node)
+            for childnode in current_node.children_node:
+                stack.append(childnode)
+                child_statistic_node = HostStatisticNode(childnode)
+                current_statistic_node.children_node.append(
+                    child_statistic_node)
+                newstack.append(child_statistic_node)
+            for runtimenode in current_node.runtime_node:
+                runtime_statistic_node = HostStatisticNode(runtimenode)
+                current_statistic_node.runtime_node.append(
+                    runtime_statistic_node)
+    # recursive calculate node statistic values
+    for thread_id, root_statistic_node in node_statistic_tree.items():
+        root_statistic_node.cal_statistic()
+
+    return node_statistic_tree, newresults
+
+
+class TimeRangeSummary:
+    r"""
+    Analyse time ranges for each TracerEventType, and summarize the time.
+    """
+
+    def __init__(self):
+        self.CPUTimeRange = collections.defaultdict(list)
+        self.GPUTimeRange = collections.defaultdict(
+            lambda: collections.defaultdict(list)
+        )  # GPU events should be divided into different devices
+        self.CPUTimeRangeSum = collections.defaultdict(int)
+        self.GPUTimeRangeSum = collections.defaultdict(
+            lambda: collections.defaultdict(int))
+        self.call_times = collections.defaultdict(int)
+
+    def parse(self, nodetrees):
+        r"""
+        Analysis node trees in profiler result, and get time range for different tracer event type.
+        """
+        thread2hostnodes = traverse_tree(nodetrees)
+        for threadid, hostnodes in thread2hostnodes.items():
+            CPUTimeRange = collections.defaultdict(list)
+            GPUTimeRange = collections.defaultdict(
+                lambda: collections.defaultdict(lambda: collections.defaultdict(list))
+            )  # device_id/type/stream_id
+            for hostnode in hostnodes[1:]:  #skip root node
+                CPUTimeRange[hostnode.type].append(
+                    (hostnode.start_ns, hostnode.end_ns))
+                self.call_times[hostnode.type] += 1
+                if hostnode.type == TracerEventType.Operator and any(
+                    [name in hostnode.name for name in
+                     _CommunicationOpName]):  # special case, communication op
+                    CPUTimeRange[TracerEventType.Communication].append(
+                        (hostnode.start_ns, hostnode.end_ns))
+                    self.call_times[TracerEventType.Communication] += 1
+                is_communication_node = (
+                    hostnode.type == TracerEventType.Communication
+                ) or (hostnode.type == TracerEventType.Operator and any(
+                    [name in hostnode.name for name in _CommunicationOpName]))
+                for runtimenode in hostnode.runtime_node:
+                    CPUTimeRange[runtimenode.type].append(
+                        (runtimenode.start_ns, runtimenode.end_ns))
+                    self.call_times[runtimenode.type] += 1
+                    for devicenode in runtimenode.device_node:
+                        GPUTimeRange[devicenode.device_id][devicenode.type][
+                            devicenode.stream_id].append(
+                                (devicenode.start_ns, devicenode.end_ns))
+                        self.call_times[devicenode.type] += 1
+                        if is_communication_node:  # gpu activity for communication node
+                            GPUTimeRange[devicenode.device_id][
+                                TracerEventType.Communication][
+                                    devicenode.stream_id].append((
+                                        devicenode.start_ns, devicenode.end_ns))
+                            self.call_times[TracerEventType.Communication] += 1
+
+            for event_type, time_ranges in CPUTimeRange.items():
+                time_ranges = merge_self_ranges(time_ranges, is_sorted=False)
+                self.CPUTimeRange[event_type] = merge_ranges(
+                    self.CPUTimeRange[event_type], time_ranges, is_sorted=True)
+            for device_id, device_time_ranges in GPUTimeRange.items():
+                for event_type, event_time_ranges in device_time_ranges.items():
+                    for stream_id, time_ranges in event_time_ranges.items():
+                        time_ranges = merge_self_ranges(
+                            time_ranges, is_sorted=False)
+                        self.GPUTimeRange[device_id][event_type] = merge_ranges(
+                            self.GPUTimeRange[device_id][event_type],
+                            time_ranges,
+                            is_sorted=True)
+
+        for event_type, time_ranges in self.CPUTimeRange.items():
+            self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges)
+        for device_id, device_time_ranges in self.GPUTimeRange.items():
+            for event_type, time_ranges in device_time_ranges.items():
+                self.GPUTimeRangeSum[device_id][event_type] = sum_ranges(
+                    time_ranges)
+
+    def get_gpu_devices(self):
+        return self.GPUTimeRange.keys()
+
+    def get_gpu_range_sum(self, device_id, event_type):
+        return self.GPUTimeRangeSum[device_id][event_type]
+
+    def get_cpu_range_sum(self, event_type):
+        return self.CPUTimeRangeSum[event_type]
+
+
+class EventSummary:
+    r"""
+    Analyse operator event in profiling data, correlate with its device event.
+    """
+
+    class DeviceItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.gpu_time = 0
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_item(self, node):
+            self.call += 1
+            self.add_gpu_time(node.end_ns - node.start_ns)
+
+    class OperatorItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.cpu_time = 0
+            self.gpu_time = 0
+            self.max_cpu_time = 0
+            self.min_cpu_time = float('inf')
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+            self.devices = {}
+            self.operator_inners = {}
+
+        @property
+        def avg_cpu_time(self):
+            return self.cpu_time / self.call
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_cpu_time(self, time):
+            if time > self.max_cpu_time:
+                self.max_cpu_time = time
+            if time < self.min_cpu_time:
+                self.min_cpu_time = time
+            self.cpu_time += time
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_call(self):
+            self.call += 1
+
+        def add_item(self, node):
+            self.add_call()
+            self.add_cpu_time(node.cpu_time)
+            self.add_gpu_time(node.gpu_time)
+            for child in node.children_node:
+                if child.name not in self.operator_inners:
+                    self.operator_inners[
+                        child.name] = EventSummary.OperatorItem(child.name)
+                self.operator_inners[child.name].add_item(child)
+
+            for runtimenode in node.runtime_node:
+                for devicenode in runtimenode.device_node:
+                    if devicenode.name not in self.devices:
+                        self.devices[devicenode.name] = EventSummary.DeviceItem(
+                            devicenode.name)
+                    self.devices[devicenode.name].add_item(devicenode)
+
+    class GeneralItem:
+        def __init__(self, name):
+            self.name = name
+            self.call = 0
+            self.cpu_time = 0
+            self.max_cpu_time = 0
+            self.min_cpu_time = float('inf')
+            self.gpu_time = 0
+            self.max_gpu_time = 0
+            self.min_gpu_time = float('inf')
+
+        @property
+        def avg_cpu_time(self):
+            return self.cpu_time / self.call
+
+        @property
+        def avg_gpu_time(self):
+            return self.gpu_time / self.call
+
+        def add_cpu_time(self, time):
+            if time > self.max_cpu_time:
+                self.max_cpu_time = time
+            if time < self.min_cpu_time:
+                self.min_cpu_time = time
+            self.cpu_time += time
+
+        def add_gpu_time(self, time):
+            if time > self.max_gpu_time:
+                self.max_gpu_time = time
+            if time < self.min_gpu_time:
+                self.min_gpu_time = time
+            self.gpu_time += time
+
+        def add_call(self):
+            self.call += 1
+
+        def add_item(self, node):
+            self.add_call()
+            self.add_cpu_time(node.cpu_time)
+            self.add_gpu_time(node.gpu_time)
+
+    def __init__(self):
+        self.items = {}  # for operator summary
+        self.thread_items = collections.defaultdict(
+            dict)  # for operator summary
+        self.userdefined_items = {}  # for userdefined summary
+        self.userdefined_thread_items = collections.defaultdict(
+            dict)  # for userdefined summary
+        self.model_perspective_items = {}  # for model summary
+        self.memory_manipulation_items = {}  # for memory manipulation summary
+
+    def parse(self, nodetrees):
+        r"""
+        Analysis operator event in the nodetress.
+        """
+        node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
+        for threadid, host_statistic_nodes in thread2host_statistic_nodes.items(
+        ):
+            for host_statistic_node in host_statistic_nodes[
+                    1:]:  #skip root node
+                if host_statistic_node.type == TracerEventType.Operator:
+                    self.add_operator_item(host_statistic_node)
+                if host_statistic_node.type == TracerEventType.UserDefined\
+                    or host_statistic_node.type == TracerEventType.PythonUserDefined:
+                    if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\
+                        or 'memset' in host_statistic_node.name.lower():
+                        self.add_memory_manipulation_item(host_statistic_node)
+                    else:
+                        self.add_userdefined_item(host_statistic_node)
+
+        for threadid, root_statistic_node in node_statistic_trees.items():
+            deque = collections.deque()
+            deque.append(root_statistic_node)
+            while deque:
+                current_node = deque.popleft()
+                for child in current_node.children_node:
+                    if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\
+                        or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization:
+                        self.add_model_perspective_item(
+                            child)  #find first model perspective node
+                    else:
+                        deque.append(child)
+
+    def add_operator_item(self, operator_node):
+        if operator_node.name not in self.items:
+            self.items[operator_node.name] = EventSummary.OperatorItem(
+                operator_node.name)
+
+        self.items[operator_node.name].add_item(operator_node)
+
+        if operator_node.name not in self.thread_items[operator_node.thread_id]:
+            self.thread_items[operator_node.thread_id][
+                operator_node.name] = EventSummary.OperatorItem(
+                    operator_node.name)
+        self.thread_items[operator_node.thread_id][operator_node.name].add_item(
+            operator_node)
+
+    def add_userdefined_item(self, userdefined_node):
+        if userdefined_node.name not in self.userdefined_items:
+            self.userdefined_items[
+                userdefined_node.name] = EventSummary.GeneralItem(
+                    userdefined_node.name)
+
+        self.userdefined_items[userdefined_node.name].add_item(userdefined_node)
+
+        if userdefined_node.name not in self.userdefined_thread_items[
+                userdefined_node.thread_id]:
+            self.userdefined_thread_items[userdefined_node.thread_id][
+                userdefined_node.name] = EventSummary.GeneralItem(
+                    userdefined_node.name)
+        self.userdefined_thread_items[userdefined_node.thread_id][
+            userdefined_node.name].add_item(userdefined_node)
+
+    def add_memory_manipulation_item(self, memory_manipulation_node):
+        if memory_manipulation_node.name not in self.memory_manipulation_items:
+            self.memory_manipulation_items[
+                memory_manipulation_node.name] = EventSummary.GeneralItem(
+                    memory_manipulation_node.name)
+        self.memory_manipulation_items[memory_manipulation_node.name].add_item(
+            memory_manipulation_node)
+
+    def add_model_perspective_item(self, model_perspective_node):
+        if model_perspective_node.type == TracerEventType.Forward:
+            name = 'Forward'
+        elif model_perspective_node.type == TracerEventType.Backward:
+            name = 'Backward'
+        elif model_perspective_node.type == TracerEventType.Optimization:
+            name = 'Optimization'
+        elif model_perspective_node.type == TracerEventType.Dataloader:
+            name = 'Dataloader'
+        else:
+            return
+        if name not in self.model_perspective_items:
+            self.model_perspective_items[name] = EventSummary.GeneralItem(name)
+        self.model_perspective_items[name].add_item(model_perspective_node)
+
+
+class StatisticData:
+    r"""
+    Hold all analysed results.
+    """
+
+    def __init__(self, node_trees, extra_info):
+        self.node_trees = node_trees
+        self.extra_info = extra_info
+        self.time_range_summary = TimeRangeSummary()
+        self.event_summary = EventSummary()
+        self.time_range_summary.parse(node_trees)
+        self.event_summary.parse(node_trees)
+
+
+def _build_table(statistic_data,
+                 sorted_by=SortedKeys.CPUTotal,
+                 op_detail=True,
+                 thread_sep=False,
+                 time_unit='ms',
+                 row_limit=100,
+                 max_src_column_width=75):
+    """Prints a summary of events."""
+    # format table row
+    SPACING_SIZE = 2
+    row_format_list = [""]
+    header_sep_list = [""]
+    line_length_list = [-SPACING_SIZE]
+
+    def add_column(padding, text_dir='<'):
+        row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + (
+            ' ' * SPACING_SIZE)
+        header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE)
+        line_length_list[0] += padding + SPACING_SIZE
+
+    def add_title(padding, text):
+        left_length = padding - len(text)
+        half = left_length // 2
+        return '-' * half + text + '-' * (left_length - half)
+
+    result = []
+
+    def append(s):
+        result.append(s)
+        result.append('\n')
+
+    def format_time(time, unit='ms', indent=0):
+        r"""
+        Transform time in ns to time in unit.
+        """
+        if time == float('inf'):
+            return '-'
+        else:
+            result = float(time)
+            if unit == 's':
+                result /= 1e9
+            elif unit == 'ms':
+                result /= 1e6
+            elif unit == 'us':
+                result /= 1e3
+            return '{}{:.2f}'.format(' ' * indent, result)
+
+    def format_ratio(ratio, indent=0):
+        r"""
+        Transform ratio within [0, 1] to percentage presentation.
+        """
+        return '{}{:.2f}'.format(' ' * indent, ratio * 100)
+
+    total_time = statistic_data.time_range_summary.get_cpu_range_sum(
+        TracerEventType.ProfileStep)
+    ###### Print Device Summary ######
+    headers = ['Device', 'Utilization (%)']
+    name_column_width = 30
+    DEFAULT_COLUMN_WIDTH = 20
+    add_column(name_column_width)
+    for _ in headers[1:]:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    row_format = row_format_list[0]
+    header_sep = header_sep_list[0]
+    line_length = line_length_list[0]
+
+    # construct table string
+
+    append(add_title(line_length, "Device Summary"))
+    append('Time unit: {}'.format(time_unit))
+    append(header_sep)
+    append(row_format.format(*headers))
+    append(header_sep)
+    row_values = [
+        'CPU(Process)', format_ratio(
+            float(statistic_data.extra_info['Process Cpu Utilization']))
+    ]
+    append(row_format.format(*row_values))
+    row_values = [
+        'CPU(System)', format_ratio(
+            float(statistic_data.extra_info['System Cpu Utilization']))
+    ]
+    append(row_format.format(*row_values))
+    for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
+        gpu_time = float(
+            statistic_data.time_range_summary.get_gpu_range_sum(
+                gpu_name, TracerEventType.Kernel))
+        utilization = gpu_time / total_time
+        row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
+        append(row_format.format(*row_values))
+
+    append(header_sep)
+    append(
+        "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
+        "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
+        "GPU Utilization = Current process GPU time / elapsed time")
+    append('-' * line_length)
+    append('')
+    append('')
+
+    if total_time == 0:
+        return ''.join(result)
+
+    ###### Print Overview Summary ######
+    headers = ['Event Type', 'CPU Time', 'Ratio (%)']
+    row_format_list = [""]
+    header_sep_list = [""]
+    line_length_list = [-SPACING_SIZE]
+
+    DEFAULT_COLUMN_WIDTH = 25
+    for _ in headers:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    row_format = row_format_list[0]
+    header_sep = header_sep_list[0]
+    line_length = line_length_list[0]
+
+    # construct table string
+    append(add_title(line_length, "Overview Summary"))
+    append('Time unit: {}'.format(time_unit))
+    append(header_sep)
+    append(row_format.format(*headers))
+    append(header_sep)
+    row_values = [
+        'Total Time', format_time(
+            total_time, unit=time_unit), format_ratio(1)
+    ]
+    append(row_format.format(*row_values))
+    cpu_type_time = collections.defaultdict(int)
+    gpu_type_time = collections.defaultdict(int)
+    for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
+    ):
+        cpu_type_time[event_type] = value
+
+    gpu_time_range = collections.defaultdict(list)
+    for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
+    ):
+        for event_type, time_range in device_time_ranges.items():
+            gpu_time_range[event_type] = merge_ranges(
+                gpu_time_range[event_type], time_range, is_sorted=True)
+    for event_type, time_range in gpu_time_range.items():
+        gpu_type_time[event_type] = sum_ranges(time_range)
+
+    sorted_items = sorted(
+        cpu_type_time.items(), key=lambda x: x[1], reverse=True)
+    for event_type, time in sorted_items:
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+    append(header_sep)
+    headers = ['', 'GPU Time', 'Ratio (%)']
+    append(row_format.format(*headers))
+    append(header_sep)
+    for event_type, time in gpu_type_time.items():
+        row_values = [
+            '  {}'.format(str(event_type).split('.')[1]), format_time(
+                time, unit=time_unit), format_ratio(float(time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+
+    append(header_sep)
+    append(
+        "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
+        "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
+        "ratio = CPU(GPU) Time / Total Time."
+        "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
+        "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
+        "Example:\n"
+        "Thread 1:\n"
+        "  Operator: |___________|     |__________|\n"
+        "Thread 2:\n"
+        "  Operator:   |____________|     |___|\n"
+        "After merged:\n"
+        "  Result:   |______________|  |__________|\n")
+    append('-' * line_length)
+    append('')
+    append('')
+
+    ###### Print Operator Summary Report ######
+    if statistic_data.event_summary.items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 50
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Operator Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        if thread_sep == True:
+            thread_items = statistic_data.event_summary.thread_items
+        else:
+            thread_items = {
+                'All threads merged': statistic_data.event_summary.items
+            }
+        for thread_id, items in thread_items.items():
+            append(add_title(line_length, "Thread: {}".format(thread_id)))
+            if sorted_by == SortedKeys.CPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            elif sorted_by == SortedKeys.CPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_cpu_time)
+            elif sorted_by == SortedKeys.GPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            elif sorted_by == SortedKeys.GPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_gpu_time)
+
+            total_cpu_time = 0
+            total_gpu_time = 0
+            for name, item in sorted_items:
+                total_cpu_time += item.cpu_time
+                total_gpu_time += item.gpu_time
+            for name, item in sorted_items:
+                row_values = [
+                    name, item.call, '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.cpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_cpu_time, unit=time_unit),
+                        format_time(
+                            item.max_cpu_time, unit=time_unit),
+                        format_time(
+                            item.min_cpu_time, unit=time_unit),
+                        format_ratio(float(item.cpu_time) / total_cpu_time)),
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.gpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_gpu_time, unit=time_unit),
+                        format_time(
+                            item.max_gpu_time, unit=time_unit),
+                        format_time(
+                            item.min_gpu_time, unit=time_unit),
+                        format_ratio(float(item.gpu_time) / total_gpu_time))
+                ]
+                append(row_format.format(*row_values))
+                if op_detail:
+                    for innerop_name, innerop_node in item.operator_inners.items(
+                    ):
+                        row_values = [
+                            '  {}'.format(innerop_name), innerop_node.call,
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_cpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_cpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.cpu_time) /
+                                    total_cpu_time)),
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    innerop_node.gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.avg_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.max_gpu_time, unit=time_unit),
+                                format_time(
+                                    innerop_node.min_gpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(innerop_node.gpu_time) /
+                                    total_gpu_time))
+                        ]
+                        append(row_format.format(*row_values))
+                        for device_node_name, devicenode in innerop_node.devices.items(
+                        ):
+                            if len(device_node_name) + 4 > name_column_width:
+                                device_node_name = device_node_name[:
+                                                                    name_column_width
+                                                                    - 7]
+                                device_node_name += "..."
+                            row_values = [
+                                '    {}'.format(device_node_name),
+                                devicenode.call, '- / - / - / - / -',
+                                '{} / {} / {} / {} / {}'.format(
+                                    format_time(
+                                        devicenode.gpu_time, unit=time_unit),
+                                    format_time(
+                                        devicenode.avg_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.max_gpu_time,
+                                        unit=time_unit),
+                                    format_time(
+                                        devicenode.min_gpu_time,
+                                        unit=time_unit),
+                                    format_ratio(
+                                        float(devicenode.gpu_time) /
+                                        total_gpu_time))
+                            ]
+                            append(row_format.format(*row_values))
+                    for device_node_name, device_node in item.devices.items():
+                        if len(device_node_name) + 2 > name_column_width:
+                            device_node_name = device_node_name[:
+                                                                name_column_width
+                                                                - 5]
+                            device_node_name += "..."
+                        row_values = [
+                            '    {}'.format(device_node_name), devicenode.call,
+                            '- / - / - / - / -',
+                            '{} / {} / {} / {} / {}'.format(
+                                format_time(
+                                    devicenode.gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.avg_gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.max_gpu_time, unit=time_unit),
+                                format_time(
+                                    devicenode.min_gpu_time, unit=time_unit),
+                                format_ratio(
+                                    float(devicenode.gpu_time) /
+                                    total_gpu_time))
+                        ]
+                        append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+    return ''.join(result)