Unverified commit 194d16c1 authored by kuizhiqing, committed by GitHub

[Profiler] add views in summary API (#45225)

* add views in summary api

* add args in the last position
Parent 1aa6adb1
@@ -15,6 +15,7 @@
from .profiler import ProfilerState, ProfilerTarget
from .profiler import make_scheduler, export_chrome_tracing, export_protobuf
from .profiler import Profiler
from .profiler import SummaryView
from .profiler import TracerEventType
from .utils import RecordEvent, load_profiler_result
from .profiler_statistic import SortedKeys
@@ -22,5 +23,5 @@ from .profiler_statistic import SortedKeys
__all__ = [
'ProfilerState', 'ProfilerTarget', 'make_scheduler',
'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
'load_profiler_result', 'SortedKeys'
'load_profiler_result', 'SortedKeys', 'SummaryView'
]
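With SummaryView added to the export list above, the enum becomes part of the public paddle.profiler namespace. A quick illustrative check (not part of the diff):
# Illustrative only: SummaryView is now exported from paddle.profiler.
import paddle.profiler as profiler
assert 'SummaryView' in profiler.__all__
print(list(profiler.SummaryView))  # DeviceView, OverView, ModelView, ...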
@@ -34,6 +34,22 @@ from paddle.profiler import utils
from .timer import benchmark
class SummaryView(Enum):
r"""
SummaryView defines the summary view of different contents.
"""
DeviceView = 0
OverView = 1
ModelView = 2
DistributedView = 3
KernelView = 4
OperatorView = 5
MemoryView = 6
MemoryManipulationView = 7
UDFView = 8
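Each member above selects one summary table that Profiler.summary can print; passing a list of members restricts the report to just those tables, while the default None keeps the previous behaviour of printing everything. A minimal usage sketch (the training loop is a hypothetical placeholder, not part of this commit):
# Example (not part of the diff): print only the operator and kernel tables.
import paddle.profiler as profiler
prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
                         scheduler=(2, 5))
prof.start()
for step in range(10):
    # train_one_step(model, batch)  # hypothetical work being profiled
    prof.step()
prof.stop()
prof.summary(views=[profiler.SummaryView.OperatorView,
                    profiler.SummaryView.KernelView])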
class ProfilerState(Enum):
r"""
ProfilerState is used to present the state of :ref:`Profiler <api_paddle_profiler_Profiler>` .
@@ -734,7 +750,8 @@ class Profiler:
sorted_by=SortedKeys.CPUTotal,
op_detail=True,
thread_sep=False,
time_unit='ms'):
time_unit='ms',
views=None):
r"""
Print the summary table. Currently supports overview, model, distributed, operator, memory manipulation and userdefined summaries.
@@ -743,6 +760,7 @@
op_detail(bool, optional): expand each operator detail information, default value is True.
thread_sep(bool, optional): print op table each thread, default value is False.
time_unit(str, optional): time unit for display, can be chosen from ['s', 'ms', 'us', 'ns'], default value is 'ms'.
views(list[SummaryView], optional): summary tables to print; defaults to None, which means all views are printed.
Examples:
.. code-block:: python
@@ -770,7 +788,8 @@
sorted_by=sorted_by,
op_detail=op_detail,
thread_sep=thread_sep,
time_unit=time_unit))
time_unit=time_unit,
views=views))
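summary() simply forwards views to _build_table, where every table is guarded by the same membership test. A simplified sketch of that gating idiom (illustrative names, not the literal implementation):
from paddle.profiler import SummaryView
def _wanted(view, views):
    # views=None preserves the previous behaviour: every table is printed.
    return views is None or view in views
assert _wanted(SummaryView.DeviceView, None)
assert _wanted(SummaryView.KernelView, [SummaryView.KernelView])
assert not _wanted(SummaryView.MemoryView, [SummaryView.KernelView])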
def get_profiler(config_path):
@@ -700,7 +700,10 @@ def _build_table(statistic_data,
thread_sep=False,
time_unit='ms',
row_limit=100,
max_src_column_width=75):
max_src_column_width=75,
views=None):
from .profiler import SummaryView
"""Prints a summary of events."""
# format table row
SPACING_SIZE = 2
@@ -749,277 +752,62 @@
total_time = statistic_data.time_range_summary.get_cpu_range_sum(
TracerEventType.ProfileStep)
###### Print Device Summary ######
headers = ['Device', 'Utilization (%)']
name_column_width = 30
DEFAULT_COLUMN_WIDTH = 20
add_column(name_column_width)
for _ in headers[1:]:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Device Summary"))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
row_values = [
'CPU(Process)',
format_ratio(float(
statistic_data.extra_info['Process Cpu Utilization']))
]
append(row_format.format(*row_values))
row_values = [
'CPU(System)',
format_ratio(float(statistic_data.extra_info['System Cpu Utilization']))
]
append(row_format.format(*row_values))
for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
gpu_time = float(
statistic_data.time_range_summary.get_gpu_range_sum(
gpu_name, TracerEventType.Kernel))
utilization = gpu_time / total_time
row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
"CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
"GPU Utilization = Current process GPU time / elapsed time.")
append('-' * line_length)
append('')
append('')
if total_time == 0:
return ''.join(result)
###### Print Overview Summary ######
headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
if views is None or SummaryView.DeviceView in views:
DEFAULT_COLUMN_WIDTH = 25
for _ in headers:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Overview Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
cpu_type_time = collections.defaultdict(int)
gpu_type_time = collections.defaultdict(int)
cpu_call_times = collections.defaultdict(int)
gpu_call_times = collections.defaultdict(int)
cpu_call_times.update(statistic_data.time_range_summary.call_times)
gpu_call_times.update(statistic_data.time_range_summary.call_times)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
):
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
cpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.cpu_calls
for event_type in [
TracerEventType.Dataloader, TracerEventType.Forward,
TracerEventType.Backward, TracerEventType.Optimization
]:
event_type_name = str(event_type).split('.')[1]
if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items:
cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
for event_type, time_range in device_time_ranges.items():
gpu_time_range[event_type] = merge_ranges(
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
gpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.gpu_calls
sorted_items = sorted(cpu_type_time.items(),
key=lambda x: x[1],
reverse=True)
event_type, time = sorted_items[0]
row_values = [
'{}'.format(str(event_type).split('.')[1]), cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
for event_type, time in sorted_items[1:]:
row_values = [
' {}'.format(str(event_type).split('.')[1]),
cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
append(row_format.format(*headers))
append(header_sep)
for event_type, time in gpu_type_time.items():
row_values = [
' {}'.format(str(event_type).split('.')[1]),
gpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n"
"Thread 1:\n"
" Operator: |___________| |__________|\n"
"Thread 2:\n"
" Operator: |____________| |___|\n"
"After merged:\n"
" Result: |______________| |__________|\n")
append('-' * line_length)
append('')
append('')
###### Print Model Summary Report ######
model_perspective_items = statistic_data.event_summary.model_perspective_items
if len(model_perspective_items) > 1:
all_row_values = []
accmulation_time = 0
gpu_accmulation_time = 0
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].gpu_time
for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization'
]:
if name in model_perspective_items:
item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / gpu_total_time
name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [
'{}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if 'ProfileStep' not in name:
accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time
other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'{} / - / - / - / {}'.format(
format_time(other_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
###### Print Device Summary ######
headers = ['Device', 'Utilization (%)']
name_column_width = 30
DEFAULT_COLUMN_WIDTH = 20
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
for _ in headers[1:]:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Model Summary"))
append('Time unit: {}'.format(time_unit))
append(add_title(line_length, "Device Summary"))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
row_values = [
'CPU(Process)',
format_ratio(
float(statistic_data.extra_info['Process Cpu Utilization']))
]
append(row_format.format(*row_values))
row_values = [
'CPU(System)',
format_ratio(
float(statistic_data.extra_info['System Cpu Utilization']))
]
append(row_format.format(*row_values))
for gpu_name in statistic_data.time_range_summary.get_gpu_devices():
gpu_time = float(
statistic_data.time_range_summary.get_gpu_range_sum(
gpu_name, TracerEventType.Kernel))
utilization = gpu_time / total_time
row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
)
"Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n"
"CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n"
"GPU Utilization = Current process GPU time / elapsed time.")
append('-' * line_length)
append('')
append('')
###### Print Distribution Summary Report ######
if statistic_data.distributed_summary.communication_range:
headers = [
'Name',
'Total Time',
'Ratio (%)',
]
if total_time == 0:
return ''.join(result)
if views is None or SummaryView.OverView in views:
###### Print Overview Summary ######
headers = ['Event Type', 'Calls', 'CPU Time', 'Ratio (%)']
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
@@ -1033,188 +821,453 @@ def _build_table(statistic_data,
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Distribution Summary"))
append(add_title(line_length, "Overview Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
communication_time = sum_ranges(
statistic_data.distributed_summary.communication_range)
computation_time = sum_ranges(
statistic_data.distributed_summary.computation_range)
overlap_time = sum_ranges(
statistic_data.distributed_summary.overlap_range)
row_values = [
'ProfileStep',
format_time(total_time, unit=time_unit),
format_ratio(float(total_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Communication',
format_time(communication_time, unit=time_unit),
format_ratio(float(communication_time) / total_time)
]
append(row_format.format(*row_values))
cpu_type_time = collections.defaultdict(int)
gpu_type_time = collections.defaultdict(int)
cpu_call_times = collections.defaultdict(int)
gpu_call_times = collections.defaultdict(int)
cpu_call_times.update(statistic_data.time_range_summary.call_times)
gpu_call_times.update(statistic_data.time_range_summary.call_times)
for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items(
):
if event_type != TracerEventType.Communication:
cpu_type_time[event_type] = value
if statistic_data.distributed_summary.cpu_communication_range:
cpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.cpu_communication_range)
cpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.cpu_calls
for event_type in [
TracerEventType.Dataloader, TracerEventType.Forward,
TracerEventType.Backward, TracerEventType.Optimization
]:
event_type_name = str(event_type).split('.')[1]
if event_type in cpu_call_times and event_type_name in statistic_data.event_summary.model_perspective_items:
cpu_call_times[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].call
cpu_type_time[
event_type] = statistic_data.event_summary.model_perspective_items[
event_type_name].cpu_time
gpu_time_range = collections.defaultdict(list)
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
for event_type, time_range in device_time_ranges.items():
gpu_time_range[event_type] = merge_ranges(
gpu_time_range[event_type], time_range, is_sorted=True)
for event_type, time_range in gpu_time_range.items():
gpu_type_time[event_type] = sum_ranges(time_range)
if statistic_data.distributed_summary.gpu_communication_range:
gpu_type_time[TracerEventType.Communication] = sum_ranges(
statistic_data.distributed_summary.gpu_communication_range)
gpu_call_times[
TracerEventType.
Communication] = statistic_data.distributed_summary.gpu_calls
sorted_items = sorted(cpu_type_time.items(),
key=lambda x: x[1],
reverse=True)
event_type, time = sorted_items[0]
row_values = [
' Computation',
format_time(computation_time, unit=time_unit),
format_ratio(float(computation_time) / total_time)
'{}'.format(str(event_type).split('.')[1]),
cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
for event_type, time in sorted_items[1:]:
row_values = [
' {}'.format(str(event_type).split('.')[1]),
cpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
headers = ['', 'Calls', 'GPU Time', 'Ratio (%)']
append(row_format.format(*headers))
append(header_sep)
for event_type, time in gpu_type_time.items():
row_values = [
' {}'.format(str(event_type).split('.')[1]),
gpu_call_times[event_type],
format_time(time, unit=time_unit),
format_ratio(float(time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Overlap',
format_time(overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n"
"Note:\nIn this table, We sum up all collected events in terms of event type.\n"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
" GPU: |______________|\n"
" Total: |_________________| |______________|\n"
"Computation time(Kernel):\n"
" GPU: |________________|\n"
"Overlap time: |___________|\n")
"Thread 1:\n"
" Operator: |___________| |__________|\n"
"Thread 2:\n"
" Operator: |____________| |___|\n"
"After merged:\n"
" Result: |______________| |__________|\n")
append('-' * line_length)
append('')
append('')
###### Print Operator Summary Report ######
if statistic_data.event_summary.items:
all_row_values = []
name_column_width = 52
if thread_sep == True:
thread_items = statistic_data.event_summary.thread_items
else:
thread_items = {
'All threads merged': statistic_data.event_summary.items
}
for thread_id, items in thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
if views is None or SummaryView.ModelView in views:
for name, item in sorted_items:
total_op_cpu_time += item.cpu_time
total_op_gpu_time += item.general_gpu_time
###### Print Model Summary Report ######
model_perspective_items = statistic_data.event_summary.model_perspective_items
if len(model_perspective_items) > 1:
all_row_values = []
accmulation_time = 0
gpu_accmulation_time = 0
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].gpu_time
for name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization'
]:
if name in model_perspective_items:
item = model_perspective_items[name]
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / gpu_total_time
name = '{}'.format(
name) if 'ProfileStep' in name else ' {}'.format(name)
row_values = [
'{}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if 'ProfileStep' not in name:
accmulation_time += item.cpu_time
gpu_accmulation_time += item.gpu_time
other_time = total_time - accmulation_time
other_gpu_time = gpu_total_time - gpu_accmulation_time
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(other_gpu_time) / gpu_total_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'{} / - / - / - / {}'.format(
format_time(other_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
for name, item in sorted_items:
if total_op_cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time
if total_op_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time, unit=time_unit),
format_time(item.max_general_gpu_time, unit=time_unit),
format_time(item.min_general_gpu_time, unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if op_detail:
for innerop_name, innerop_node in item.operator_inners.items(
):
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(
innerop_node.cpu_time) / item.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(innerop_node.general_gpu_time
) / item.general_gpu_time
if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width - 5]
innerop_name += "..."
row_values = [
' {}'.format(innerop_name), innerop_node.call,
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.cpu_time,
unit=time_unit),
format_time(innerop_node.avg_cpu_time,
unit=time_unit),
format_time(innerop_node.max_cpu_time,
unit=time_unit),
format_time(innerop_node.min_cpu_time,
unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.general_gpu_time,
unit=time_unit),
format_time(innerop_node.avg_general_gpu_time,
unit=time_unit),
format_time(innerop_node.max_general_gpu_time,
unit=time_unit),
format_time(innerop_node.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in innerop_node.devices.items(
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Model Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
)
append('-' * line_length)
append('')
append('')
if views is None or SummaryView.DistributedView in views:
###### Print Distribution Summary Report ######
if statistic_data.distributed_summary.communication_range:
headers = [
'Name',
'Total Time',
'Ratio (%)',
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
DEFAULT_COLUMN_WIDTH = 25
for _ in headers:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Distribution Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
communication_time = sum_ranges(
statistic_data.distributed_summary.communication_range)
computation_time = sum_ranges(
statistic_data.distributed_summary.computation_range)
overlap_time = sum_ranges(
statistic_data.distributed_summary.overlap_range)
row_values = [
'ProfileStep',
format_time(total_time, unit=time_unit),
format_ratio(float(total_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Communication',
format_time(communication_time, unit=time_unit),
format_ratio(float(communication_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Computation',
format_time(computation_time, unit=time_unit),
format_ratio(float(computation_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
' Overlap',
format_time(overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
"Overlap time: Communication time intersects with computation time.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
" GPU: |______________|\n"
" Total: |_________________| |______________|\n"
"Computation time(Kernel):\n"
" GPU: |________________|\n"
"Overlap time: |___________|\n")
append('-' * line_length)
append('')
append('')
if views is None or SummaryView.OperatorView in views:
###### Print Operator Summary Report ######
if statistic_data.event_summary.items:
all_row_values = []
name_column_width = 52
if thread_sep == True:
thread_items = statistic_data.event_summary.thread_items
else:
thread_items = {
'All threads merged': statistic_data.event_summary.items
}
for thread_id, items in thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time)
total_op_cpu_time = 0
total_op_gpu_time = 0
for name, item in sorted_items:
total_op_cpu_time += item.cpu_time
total_op_gpu_time += item.general_gpu_time
for name, item in sorted_items:
if total_op_cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / total_op_cpu_time
if total_op_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
item.general_gpu_time) / total_op_gpu_time
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time,
unit=time_unit),
format_time(item.max_general_gpu_time,
unit=time_unit),
format_time(item.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
if op_detail:
for innerop_name, innerop_node in item.operator_inners.items(
):
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(
innerop_node.cpu_time) / item.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(innerop_node.general_gpu_time
) / item.general_gpu_time
if len(innerop_name) + 2 > name_column_width:
innerop_name = innerop_name[:name_column_width -
5]
innerop_name += "..."
row_values = [
' {}'.format(innerop_name), innerop_node.call,
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.cpu_time,
unit=time_unit),
format_time(innerop_node.avg_cpu_time,
unit=time_unit),
format_time(innerop_node.max_cpu_time,
unit=time_unit),
format_time(innerop_node.min_cpu_time,
unit=time_unit),
format_ratio(cpu_ratio)),
'{} / {} / {} / {} / {}'.format(
format_time(innerop_node.general_gpu_time,
unit=time_unit),
format_time(
innerop_node.avg_general_gpu_time,
unit=time_unit),
format_time(
innerop_node.max_general_gpu_time,
unit=time_unit),
format_time(
innerop_node.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in innerop_node.devices.items(
):
if innerop_node.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time
) / innerop_node.general_gpu_time
if len(device_node_name
) + 4 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 7]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name),
device_node.call, '- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(device_node.gpu_time,
unit=time_unit),
format_time(device_node.avg_gpu_time,
unit=time_unit),
format_time(device_node.max_gpu_time,
unit=time_unit),
format_time(device_node.min_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items(
):
if innerop_node.general_gpu_time == 0:
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time
) / innerop_node.general_gpu_time
if len(device_node_name) + 4 > name_column_width:
gpu_ratio = float(device_node.gpu_time
) / item.general_gpu_time
if len(device_node_name) + 2 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 7]
- 5]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name),
' {}'.format(device_node_name),
device_node.call, '- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(device_node.gpu_time,
@@ -1228,280 +1281,149 @@
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
for device_node_name, device_node in item.devices.items():
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
device_node.gpu_time) / item.general_gpu_time
if len(device_node_name) + 2 > name_column_width:
device_node_name = device_node_name[:
name_column_width
- 5]
device_node_name += "..."
row_values = [
' {}'.format(device_node_name), device_node.call,
'- / - / - / - / -',
'{} / {} / {} / {} / {}'.format(
format_time(device_node.gpu_time,
unit=time_unit),
format_time(device_node.avg_gpu_time,
unit=time_unit),
format_time(device_node.max_gpu_time,
unit=time_unit),
format_time(device_node.min_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio))
]
all_row_values.append(row_values)
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Operator Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Kernel Summary Report ######
if statistic_data.event_summary.kernel_items:
all_row_values = []
kernel_items = statistic_data.event_summary.kernel_items
if sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].min_gpu_time)
else:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
total_kernel_gpu_time = 0
for name, item in sorted_items:
total_kernel_gpu_time += item.gpu_time
for name, item in sorted_items:
if total_kernel_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
# Calculate the column width
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
all_row_values.append(row_values)
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)']
# Calculate the column width
name_column_width = 90
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(gpu_data_description_width)
# construct table string
append(add_title(line_length, "Operator Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
if views is None or SummaryView.KernelView in views:
# construct table string
append(add_title(line_length, "Kernel Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
kernel_name_pattern = re.compile('(.+?)(<.*>)(\(.*\))')
for row_values in all_row_values:
match = kernel_name_pattern.match(row_values[0])
if match:
name = match.group(1) + match.group(2)
else:
name = row_values[0]
if len(name) > name_column_width:
row_values[0] = name[:name_column_width - 3] + '...'
###### Print Kernel Summary Report ######
if statistic_data.event_summary.kernel_items:
all_row_values = []
kernel_items = statistic_data.event_summary.kernel_items
if sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].min_gpu_time)
else:
row_values[0] = name
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
sorted_items = sorted(kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
all_row_values = []
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name, item in memory_manipulation_items.items():
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.general_gpu_time) / gpu_total_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time, unit=time_unit),
format_time(item.max_general_gpu_time, unit=time_unit),
format_time(item.min_general_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
total_kernel_gpu_time = 0
for name, item in sorted_items:
total_kernel_gpu_time += item.gpu_time
for name, item in sorted_items:
if total_kernel_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.gpu_time, unit=time_unit),
format_time(item.avg_gpu_time, unit=time_unit),
format_time(item.max_gpu_time, unit=time_unit),
format_time(item.min_gpu_time, unit=time_unit),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
headers = [
'Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)'
]
all_row_values.append(row_values)
# Calculate the column width
name_column_width = 90
calltime_width = 6
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[2])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
# Calculate the column width
name_column_width = 0
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(gpu_data_description_width)
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Kernel Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
kernel_name_pattern = re.compile('(.+?)(<.*>)(\(.*\))')
for row_values in all_row_values:
match = kernel_name_pattern.match(row_values[0])
if match:
name = match.group(1) + match.group(2)
else:
name = row_values[0]
if len(name) > name_column_width:
row_values[0] = name[:name_column_width - 3] + '...'
else:
row_values[0] = name
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
# construct table string
append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
all_row_values = []
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
userdefined_thread_items = {
'All threads merged':
statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_general_gpu_time)
if views is None or SummaryView.MemoryManipulationView in views:
for name, item in sorted_items:
###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
all_row_values = []
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
for name, item in memory_manipulation_items.items():
if gpu_total_time == 0:
gpu_ratio = 0
else:
@@ -1524,116 +1446,164 @@
]
all_row_values.append(row_values)
# Calculate the column width
name_column_width = 0
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
# Calculate the column width
name_column_width = 0
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
add_column(name_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "UserDefined Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
# construct table string
append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
append(row_format.format(*row_values))
append('')
append('')
###### Print Memory Summary Report ######
if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
):
all_row_values = []
sorted_items = sorted(memory_events.items(),
key=lambda x: x[1].increase_size,
reverse=True)
append(header_sep)
append('')
append('')
for event_name, item in sorted_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
if views is None or SummaryView.UDFView in views:
sorted_reserved_items = sorted(statistic_data.memory_summary.
reserved_items[device_type].items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_reserved_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
all_row_values = []
gpu_total_time = statistic_data.event_summary.model_perspective_items[
'ProfileStep'].general_gpu_time
if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
userdefined_thread_items = {
'All threads merged':
statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
all_row_values.append("Thread: {}".format(thread_id))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(items.items(),
key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_general_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_general_gpu_time)
for name, item in sorted_items:
if gpu_total_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
item.general_gpu_time) / gpu_total_time
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(item.cpu_time, unit=time_unit),
format_time(item.avg_cpu_time, unit=time_unit),
format_time(item.max_cpu_time, unit=time_unit),
format_time(item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(item.general_gpu_time, unit=time_unit),
format_time(item.avg_general_gpu_time,
unit=time_unit),
format_time(item.max_general_gpu_time,
unit=time_unit),
format_time(item.min_general_gpu_time,
unit=time_unit),
format_ratio(gpu_ratio)),
]
all_row_values.append(row_values)
# Calculate the column width
name_column_width = 0
calltime_width = 6
cpu_data_description_width = 40
gpu_data_description_width = 40
for row_values in all_row_values:
if isinstance(row_values, str):
continue
if len(row_values[0]) > name_column_width:
name_column_width = len(row_values[0])
if isinstance(row_values[1],
int) and len(str(row_values[1])) > calltime_width:
calltime_width = len(str(row_values[1]))
if len(row_values[2]) > cpu_data_description_width:
cpu_data_description_width = len(row_values[2])
if len(row_values[3]) > gpu_data_description_width:
gpu_data_description_width = len(row_values[3])
headers = [
'Name', 'Type', 'Allocation Count', 'Free Count',
'Allocation Size', 'Free Size', 'Increased Size'
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 50
number_column_width = 15
add_column(name_column_width)
add_column(12)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(calltime_width)
add_column(cpu_data_description_width)
add_column(gpu_data_description_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(
add_title(line_length,
"Memory Summary - {}".format(device_type)))
append('Peak Allocated Memory: {}'.format(
statistic_data.memory_summary.
peak_allocation_values[device_type]))
append('Peak Reserved Memory: {}'.format(
statistic_data.memory_summary.peak_reserved_values[device_type])
)
append(add_title(line_length, "UserDefined Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
@@ -1645,4 +1615,79 @@ def _build_table(statistic_data,
append('')
append('')
if views is None or SummaryView.MemoryView in views:
###### Print Memory Summary Report ######
if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items:
for device_type, memory_events in statistic_data.memory_summary.allocated_items.items(
):
all_row_values = []
sorted_items = sorted(memory_events.items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
sorted_reserved_items = sorted(
statistic_data.memory_summary.reserved_items[device_type].
items(),
key=lambda x: x[1].increase_size,
reverse=True)
for event_name, item in sorted_reserved_items:
row_values = [
event_name, item.memory_type, item.allocation_count,
item.free_count, item.allocation_size, item.free_size,
item.increase_size
]
all_row_values.append(row_values)
# Calculate the column width
headers = [
'Name', 'Type', 'Allocation Count', 'Free Count',
'Allocation Size', 'Free Size', 'Increased Size'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 50
number_column_width = 15
add_column(name_column_width)
add_column(12)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
add_column(number_column_width)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(
add_title(line_length,
"Memory Summary - {}".format(device_type)))
append('Peak Allocated Memory: {}'.format(
statistic_data.memory_summary.
peak_allocation_values[device_type]))
append('Peak Reserved Memory: {}'.format(
statistic_data.memory_summary.
peak_reserved_values[device_type]))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
for row_values in all_row_values:
if isinstance(row_values, str):
append(add_title(line_length, row_values))
else:
append(row_format.format(*row_values))
append('')
append('')
return ''.join(result)