diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py
index ac2b205e61128eb984b1235660aa04668dc047cc..ae804f82b90f70f08833cb3306f7b02531c051a9 100755
--- a/python/paddle/fluid/tests/unittests/test_newprofiler.py
+++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py
@@ -20,6 +20,7 @@ import tempfile
 
 import paddle
 import paddle.profiler as profiler
+import paddle.profiler.utils as utils
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddle.io import Dataset, DataLoader
@@ -40,11 +41,17 @@ class TestProfiler(unittest.TestCase):
         with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof:
             y = x / 2.0
         prof = None
+        self.assertEqual(utils._is_profiler_used, False)
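+        # With no Profiler running, the RecordEvent below should be a no-op
+        # (see the _is_profiler_used gate added in paddle/profiler/utils.py).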
+        with profiler.RecordEvent(name='test'):
+            y = x / 2.0
+
         with profiler.Profiler(
                 targets=[profiler.ProfilerTarget.CPU],
                 scheduler=(1, 2)) as prof:
+            self.assertEqual(utils._is_profiler_used, True)
             with profiler.RecordEvent(name='test'):
                 y = x / 2.0
+
         prof = None
         with profiler.Profiler(
                 targets=[profiler.ProfilerTarget.CPU],
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index dc944e68c7f555e9feca9e4c4861b60b90de6b60..7079d9678b2fde023e566ca22b18f06cfc0c4c2d 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase):
         profilerstep_node = HostPythonNode('ProfileStep#1',
                                            profiler.TracerEventType.ProfileStep,
                                            0, 400, 1000, 1001)
-        dataloader_node = HostPythonNode(
-            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+        dataloader_node = HostPythonNode('Dataloader',
+                                         profiler.TracerEventType.Dataloader, 5,
+                                         15, 1000, 1001)
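+        # Typed as Dataloader (previously Forward), so its [5, 15) span no
+        # longer counts toward Forward time: expectations below drop 100 -> 90.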
         mobilenet_node = HostPythonNode(
             'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
         yolonet_node = HostPythonNode(
@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.TracerEventType.ProfileStep), 400)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.Forward), 100)
+                profiler.TracerEventType.Forward), 90)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Backward), 80)
@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.TracerEventType.Communication), 5)
         self.assertEqual(len(event_summary.items), 2)
         self.assertEqual(len(event_summary.userdefined_items), 1)
-        self.assertEqual(len(event_summary.model_perspective_items), 4)
+        self.assertEqual(len(event_summary.model_perspective_items), 5)
         self.assertEqual(len(event_summary.memory_manipulation_items), 1)
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
         self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
         self.assertEqual(
-            event_summary.model_perspective_items['Forward'].cpu_time, 100)
+            event_summary.model_perspective_items['Forward'].cpu_time, 90)
         self.assertEqual(
             event_summary.model_perspective_items['Forward'].general_gpu_time,
             135)
@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase):
                                            profiler.TracerEventType.ProfileStep,
                                            0, 400, 1000, 1001)
-        dataloader_node = HostPythonNode(
-            'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+        dataloader_node = HostPythonNode('Dataloader',
+                                         profiler.TracerEventType.Dataloader, 5,
+                                         15, 1000, 1001)
         mobilenet_node = HostPythonNode(
             'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.TracerEventType.ProfileStep), 400)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.Forward), 100)
+                profiler.TracerEventType.Forward), 90)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Backward), 80)
@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase):
                 distributed_summary.overlap_range), 85)
         self.assertEqual(len(event_summary.items), 4)
         self.assertEqual(len(event_summary.userdefined_items), 1)
-        self.assertEqual(len(event_summary.model_perspective_items), 4)
+        self.assertEqual(len(event_summary.model_perspective_items), 5)
         self.assertEqual(len(event_summary.memory_manipulation_items), 1)
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
         self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
         self.assertEqual(
-            event_summary.model_perspective_items['Forward'].cpu_time, 100)
+            event_summary.model_perspective_items['Forward'].cpu_time, 90)
         self.assertEqual(
             event_summary.model_perspective_items['Forward'].general_gpu_time,
             315)
@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase):
                     thread_sep=False,
                     time_unit='ms'))
 
+    def test_statistic_case3(self):
+        # for coverage, test all time is 0
+        root_node = HostPythonNode('Root Node',
+                                   profiler.TracerEventType.UserDefined, 0,
+                                   float('inf'), 1000, 1001)
+        profilerstep_node = HostPythonNode('ProfileStep#1',
+                                           profiler.TracerEventType.ProfileStep,
+                                           0, 400, 1000, 1001)
+        dataloader_node = HostPythonNode('Dataloader',
+                                         profiler.TracerEventType.Dataloader, 5,
+                                         15, 1000, 1001)
+        mobilenet_node = HostPythonNode(
+            'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+
+        backward_node = HostPythonNode('Gradient Backward',
+                                       profiler.TracerEventType.Backward, 120,
+                                       200, 1000, 1001)
+        optimization_node = HostPythonNode(
+            'Optimization', profiler.TracerEventType.Optimization, 220, 300,
+            1000, 1001)
+        userdefined_node = HostPythonNode('Communication Time',
+                                          profiler.TracerEventType.UserDefined,
+                                          60, 70, 1000, 1001)
+
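+        # The operator and kernel nodes below all use start == end, so every
+        # duration is 0; this exercises the zero-division guards in
+        # _build_table.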
+        conv2d_node = HostPythonNode(
+            'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+
+        conv2d_infer_shape = HostPythonNode(
+            'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25,
+            25, 1000, 1001)
+        conv2d_compute = HostPythonNode('conv2d::compute',
+                                        profiler.TracerEventType.OperatorInner,
+                                        25, 25, 1000, 1001)
+        conv2d_launchkernel = HostPythonNode(
+            'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25,
+            1000, 1001)
+
+        conv2d_kernel = DevicePythonNode(
+            'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        another_kernel = DevicePythonNode(
+            'void phi::funcs::VectorizedBroadcastKernel, phi::funcs::AddFunctor>()',
+            profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        root_node.children_node.append(profilerstep_node)
+        profilerstep_node.children_node.extend([
+            dataloader_node, mobilenet_node, userdefined_node, backward_node,
+            optimization_node
+        ])
+        mobilenet_node.children_node.append(conv2d_node)
+        conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute])
+        conv2d_compute.runtime_node.append(conv2d_launchkernel)
+        conv2d_launchkernel.device_node.append(conv2d_kernel)
+        conv2d_launchkernel.device_node.append(another_kernel)
+        thread_tree = {'thread1001': root_node}
+        extra_info = {
+            'Process Cpu Utilization': '1.02',
+            'System Cpu Utilization': '0.68'
+        }
+        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree,
+                                                                   extra_info)
+        time_range_summary = statistic_data.time_range_summary
+        event_summary = statistic_data.event_summary
+
+        self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
+        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
+        self.assertEqual(event_summary.userdefined_items['Communication Time']
+                         .general_gpu_time, 0)
+        for sort_key in [
+                profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
+                profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
+                profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax,
+                profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
+        ]:
+            print(
+                profiler.profiler_statistic._build_table(
+                    statistic_data,
+                    sorted_by=sort_key,
+                    op_detail=True,
+                    thread_sep=False,
+                    time_unit='ms'))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
index 2fae583397a8e904562b38dc727fba32aa5ca525..ca35348e4cd090e9d37a10ccc3c028fe526141ee 100644
--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -27,6 +27,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
 
 from .utils import RecordEvent, wrap_optimizers
 from .profiler_statistic import StatisticData, _build_table, SortedKeys
+from paddle.profiler import utils
 from .timer import benchmark
 
@@ -475,6 +476,7 @@ class Profiler:
         if self.timer_only:
             return
         # CLOSED -> self.current_state
+        utils._is_profiler_used = True
         if self.current_state == ProfilerState.READY:
             self.profiler.prepare()
         elif self.current_state == ProfilerState.RECORD:
@@ -527,6 +529,7 @@ class Profiler:
             self.profiler_result = self.profiler.stop()
         if self.on_trace_ready:
             self.on_trace_ready(self)
+        utils._is_profiler_used = False
 
     def step(self, num_samples: Optional[int]=None):
         r"""
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index e4d4ff8c183bca9e5399b3cb378aefcd00bb9d67..422dbe4ce359f4d2e0e42672ab683602d84e2e08 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -78,15 +78,19 @@ class HostStatisticNode:
         self.self_gpu_time = 0
         self.general_gpu_time = 0  # besides kernel, include time of gpu events like memcpy and memset
         self.self_general_gpu_time = 0
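+        # True while no Operator event is nested beneath this node; only
+        # terminal operator nodes are reported as operator items, so nested
+        # operators are not double-counted.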
+        self.is_terminal_operator_node = True
 
     def cal_statistic(self):
         for child in self.children_node:
             child.cal_statistic()
+            if child.is_terminal_operator_node == False:
+                self.is_terminal_operator_node = False
         for rt in self.runtime_node:
             rt.cal_statistic()
-
         self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
         for child in self.children_node:
+            if child.type == TracerEventType.Operator:
+                self.is_terminal_operator_node = False
             self.gpu_time += child.gpu_time
             self.general_gpu_time += child.general_gpu_time
             self.self_cpu_time -= (child.end_ns - child.start_ns)
@@ -419,10 +423,10 @@ class EventSummary:
 
             for runtimenode in node.runtime_node:
                 for devicenode in runtimenode.device_node:
-                    if devicenode.name not in self.devices:
-                        self.devices[devicenode.name] = EventSummary.DeviceItem(
-                            devicenode.name)
-                    self.devices[devicenode.name].add_item(devicenode)
+                    name = devicenode.name
+                    if name not in self.devices:
+                        self.devices[name] = EventSummary.DeviceItem(name)
+                    self.devices[name].add_item(devicenode)
 
     class GeneralItem:
         def __init__(self, name):
@@ -489,6 +493,7 @@ class EventSummary:
             dict)  # for userdefined summary
         self.model_perspective_items = {}  # for model summary
         self.memory_manipulation_items = {}  # for memory manipulation summary
+        self.kernel_items = {}  # for kernel summary
 
     def parse(self, nodetrees):
         r"""
@@ -508,6 +513,7 @@ class EventSummary:
                         self.add_memory_manipulation_item(host_statistic_node)
                     else:
                         self.add_userdefined_item(host_statistic_node)
+            self.add_kernel_item(host_statistic_nodes[0])
 
         for threadid, root_statistic_node in node_statistic_trees.items():
             deque = collections.deque()
@@ -525,11 +531,7 @@ class EventSummary:
                     deque.append(child)
 
     def add_operator_item(self, operator_node):
-        have_inner = False
-        for child in operator_node.children_node:
-            if child.type == TracerEventType.OperatorInner:
-                have_inner = True
-        if have_inner == False:
+        if operator_node.is_terminal_operator_node == False:
             return
         if operator_node.name not in self.items:
             self.items[operator_node.name] = EventSummary.OperatorItem(
@@ -585,6 +587,15 @@ class EventSummary:
             self.model_perspective_items[name] = EventSummary.GeneralItem(name)
         self.model_perspective_items[name].add_item(model_perspective_node)
 
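+    # Collect device-side Kernel events from the whole tree; they feed the
+    # new Kernel Summary table in _build_table.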
\n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -1040,8 +1058,22 @@ def _build_table(statistic_data, elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( items.items(), key=lambda x: x[1].min_general_gpu_time) + total_op_cpu_time = 0 + total_op_gpu_time = 0 + + for name, item in sorted_items: + total_op_cpu_time += item.cpu_time + total_op_gpu_time += item.general_gpu_time for name, item in sorted_items: + if total_op_cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float(item.cpu_time) / total_op_cpu_time + if total_op_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time row_values = [ name, item.call, '{} / {} / {} / {} / {}'.format( format_time( @@ -1052,7 +1084,7 @@ def _build_table(statistic_data, item.max_cpu_time, unit=time_unit), format_time( item.min_cpu_time, unit=time_unit), - format_ratio(float(item.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( item.general_gpu_time, unit=time_unit), @@ -1062,13 +1094,22 @@ def _build_table(statistic_data, item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio( - float(item.general_gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) if op_detail: for innerop_name, innerop_node in item.operator_inners.items( ): + if item.cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float( + innerop_node.cpu_time) / item.cpu_time + if item.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(innerop_node.general_gpu_time + ) / item.general_gpu_time if len(innerop_name) + 2 > name_column_width: innerop_name = innerop_name[:name_column_width - 5] innerop_name += "..." @@ -1083,8 +1124,7 @@ def _build_table(statistic_data, innerop_node.max_cpu_time, unit=time_unit), format_time( innerop_node.min_cpu_time, unit=time_unit), - format_ratio( - float(innerop_node.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( innerop_node.general_gpu_time, @@ -1098,13 +1138,17 @@ def _build_table(statistic_data, format_time( innerop_node.min_general_gpu_time, unit=time_unit), - format_ratio( - float(innerop_node.general_gpu_time) / - total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) for device_node_name, device_node in innerop_node.devices.items( ): + if innerop_node.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float( + device_node. 
     for name in [
             'ProfileStep', 'Dataloader', 'Forward', 'Backward', 'Optimization'
     ]:
         if name in model_perspective_items:
             item = model_perspective_items[name]
+            if gpu_total_time == 0:
+                gpu_ratio = 0
+            else:
+                gpu_ratio = float(item.general_gpu_time) / gpu_total_time
             name = '{}'.format(
                 name) if 'ProfileStep' in name else '  {}'.format(name)
             row_values = [
@@ -850,17 +868,19 @@ def _build_table(statistic_data,
                     item.max_gpu_time, unit=time_unit),
                 format_time(
                     item.min_gpu_time, unit=time_unit),
-                format_ratio(float(item.gpu_time) / total_time))
+                format_ratio(gpu_ratio))
             ]
             all_row_values.append(row_values)
             if 'ProfileStep' not in name:
                 accmulation_time += item.cpu_time
-                gpu_accmulation_time += item.gpu_time
-            else:
-                gpu_total_time = item.gpu_time
+                gpu_accmulation_time += item.general_gpu_time
 
     other_time = total_time - accmulation_time
     other_gpu_time = gpu_total_time - gpu_accmulation_time
+    if gpu_total_time == 0:
+        gpu_ratio = 0
+    else:
+        gpu_ratio = float(other_gpu_time) / gpu_total_time
     row_values = [
         '  Others', '-', '{} / - / - / - / {}'.format(
             format_time(
@@ -869,7 +889,7 @@ def _build_table(statistic_data,
         '{} / - / - / - / {}'.format(
             format_time(
                 other_gpu_time, unit=time_unit),
-            format_ratio(float(other_gpu_time) / gpu_total_time))
+            format_ratio(gpu_ratio))
     ]
     all_row_values.append(row_values)
     # Calculate the column width
@@ -913,7 +933,6 @@ def _build_table(statistic_data,
         append(
             "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
             "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
-            "The time with ratio 100% is the base time for calculating ratio. \n"
         )
         append('-' * line_length)
         append('')
@@ -981,7 +1000,6 @@ def _build_table(statistic_data,
             "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
            "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
            "Overlap time: Communication time intersects with computation time.\n"
-            "The time with ratio 100% is the base time for calculating ratio. \n"
             "Example:\n"
             "Communication:\n"
             "  CPU:              |_________________|\n"
@@ -1040,8 +1058,22 @@ def _build_table(statistic_data,
         elif sorted_by == SortedKeys.GPUMin:
             sorted_items = sorted(
                 items.items(), key=lambda x: x[1].min_general_gpu_time)
+        total_op_cpu_time = 0
+        total_op_gpu_time = 0
+
+        for name, item in sorted_items:
+            total_op_cpu_time += item.cpu_time
+            total_op_gpu_time += item.general_gpu_time
 
         for name, item in sorted_items:
+            if total_op_cpu_time == 0:
+                cpu_ratio = 0
+            else:
+                cpu_ratio = float(item.cpu_time) / total_op_cpu_time
+            if total_op_gpu_time == 0:
+                gpu_ratio = 0
+            else:
+                gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
             row_values = [
                 name, item.call, '{} / {} / {} / {} / {}'.format(
                     format_time(
@@ -1052,7 +1084,7 @@ def _build_table(statistic_data,
                         item.max_cpu_time, unit=time_unit),
                     format_time(
                         item.min_cpu_time, unit=time_unit),
-                    format_ratio(float(item.cpu_time) / total_time)),
+                    format_ratio(cpu_ratio)),
                 '{} / {} / {} / {} / {}'.format(
                     format_time(
                         item.general_gpu_time, unit=time_unit),
@@ -1062,13 +1094,22 @@ def _build_table(statistic_data,
                         item.max_general_gpu_time, unit=time_unit),
                     format_time(
                         item.min_general_gpu_time, unit=time_unit),
-                    format_ratio(
-                        float(item.general_gpu_time) / total_time))
+                    format_ratio(gpu_ratio))
             ]
             all_row_values.append(row_values)
             if op_detail:
                 for innerop_name, innerop_node in item.operator_inners.items(
                 ):
+                    if item.cpu_time == 0:
+                        cpu_ratio = 0
+                    else:
+                        cpu_ratio = float(
+                            innerop_node.cpu_time) / item.cpu_time
+                    if item.general_gpu_time == 0:
+                        gpu_ratio = 0
+                    else:
+                        gpu_ratio = float(innerop_node.general_gpu_time
+                                          ) / item.general_gpu_time
                     if len(innerop_name) + 2 > name_column_width:
                         innerop_name = innerop_name[:name_column_width - 5]
                         innerop_name += "..."
@@ -1083,8 +1124,7 @@ def _build_table(statistic_data,
                                 innerop_node.max_cpu_time, unit=time_unit),
                             format_time(
                                 innerop_node.min_cpu_time, unit=time_unit),
-                            format_ratio(
-                                float(innerop_node.cpu_time) / total_time)),
+                            format_ratio(cpu_ratio)),
                         '{} / {} / {} / {} / {}'.format(
                             format_time(
                                 innerop_node.general_gpu_time,
@@ -1098,13 +1138,17 @@ def _build_table(statistic_data,
                             format_time(
                                 innerop_node.min_general_gpu_time,
                                 unit=time_unit),
-                            format_ratio(
-                                float(innerop_node.general_gpu_time) /
-                                total_time))
+                            format_ratio(gpu_ratio))
                     ]
                     all_row_values.append(row_values)
                     for device_node_name, device_node in innerop_node.devices.items(
                     ):
+                        if innerop_node.general_gpu_time == 0:
+                            gpu_ratio = 0
+                        else:
+                            gpu_ratio = float(
+                                device_node.
+                                gpu_time) / innerop_node.general_gpu_time
                         if len(device_node_name) + 4 > name_column_width:
                             device_node_name = device_node_name[:
                                                                 name_column_width
@@ -1125,12 +1169,15 @@ def _build_table(statistic_data,
                                     format_time(
                                         device_node.min_gpu_time,
                                         unit=time_unit),
-                                    format_ratio(
-                                        float(device_node.gpu_time) /
-                                        total_time))
+                                    format_ratio(gpu_ratio))
                             ]
                             all_row_values.append(row_values)
                 for device_node_name, device_node in item.devices.items():
+                    if item.general_gpu_time == 0:
+                        gpu_ratio = 0
+                    else:
+                        gpu_ratio = float(
+                            device_node.gpu_time) / item.general_gpu_time
                     if len(device_node_name) + 2 > name_column_width:
                         device_node_name = device_node_name[:
                                                             name_column_width
@@ -1148,8 +1195,7 @@ def _build_table(statistic_data,
                             device_node.max_gpu_time, unit=time_unit),
                         format_time(
                             device_node.min_gpu_time, unit=time_unit),
-                        format_ratio(
-                            float(device_node.gpu_time) / total_time))
+                        format_ratio(gpu_ratio))
                     ]
                     all_row_values.append(row_values)
         # Calculate the column width
@@ -1197,11 +1243,106 @@ def _build_table(statistic_data,
         append('')
         append('')
 
+    ###### Print Kernel Summary Report ######
+    if statistic_data.event_summary.kernel_items:
+        all_row_values = []
+        kernel_items = statistic_data.event_summary.kernel_items
+        if sorted_by == SortedKeys.GPUAvg:
+            sorted_items = sorted(
+                kernel_items.items(),
+                key=lambda x: x[1].avg_gpu_time,
+                reverse=True)
+        elif sorted_by == SortedKeys.GPUMax:
+            sorted_items = sorted(
+                kernel_items.items(),
+                key=lambda x: x[1].max_gpu_time,
+                reverse=True)
+        elif sorted_by == SortedKeys.GPUMin:
+            sorted_items = sorted(
+                kernel_items.items(), key=lambda x: x[1].min_gpu_time)
+        else:
+            sorted_items = sorted(
+                kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+
+        total_kernel_gpu_time = 0
+        for name, item in sorted_items:
+            total_kernel_gpu_time += item.gpu_time
+        for name, item in sorted_items:
+            if total_kernel_gpu_time == 0:
+                gpu_ratio = 0
+            else:
+                gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
+            row_values = [
+                name,
+                item.call,
+                '{} / {} / {} / {} / {}'.format(
+                    format_time(
+                        item.gpu_time, unit=time_unit),
+                    format_time(
+                        item.avg_gpu_time, unit=time_unit),
+                    format_time(
+                        item.max_gpu_time, unit=time_unit),
+                    format_time(
+                        item.min_gpu_time, unit=time_unit),
+                    format_ratio(gpu_ratio)),
+            ]
+            all_row_values.append(row_values)
+
+        headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)']
+        # Calculate the column width
+        name_column_width = 90
+        calltime_width = 6
+        gpu_data_description_width = 40
+        for row_values in all_row_values:
+            if isinstance(row_values[1],
+                          int) and len(str(row_values[1])) > calltime_width:
+                calltime_width = len(str(row_values[1]))
+            if len(row_values[2]) > gpu_data_description_width:
+                gpu_data_description_width = len(row_values[2])
+
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        add_column(name_column_width)
+        add_column(calltime_width)
+        add_column(gpu_data_description_width)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Kernel Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        for row_values in all_row_values:
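+            # Kernel names may carry a template/signature suffix; keep only
+            # the part before the first '(' and truncate to the column width.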
+            indx = row_values[0].find('(')
+            if indx != -1:
+                name = row_values[0][:indx]
+            else:
+                name = row_values[0]
+            if len(name) > name_column_width:
+                row_values[0] = name[:name_column_width - 3] + '...'
+            else:
+                row_values[0] = name
+            append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+
     ###### Print Memory Manipulation Summary Report ######
     if statistic_data.event_summary.memory_manipulation_items:
         all_row_values = []
         memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
+        gpu_total_time = statistic_data.event_summary.model_perspective_items[
+            'ProfileStep'].general_gpu_time
         for name, item in memory_manipulation_items.items():
+            if gpu_total_time == 0:
+                gpu_ratio = 0
+            else:
+                gpu_ratio = float(item.general_gpu_time) / gpu_total_time
             row_values = [
                 name,
                 item.call,
@@ -1224,7 +1365,7 @@ def _build_table(statistic_data,
                         item.max_general_gpu_time, unit=time_unit),
                     format_time(
                         item.min_general_gpu_time, unit=time_unit),
-                    format_ratio(float(item.general_gpu_time) / total_time)),
+                    format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
 
@@ -1274,6 +1415,8 @@ def _build_table(statistic_data,
     ###### Print UserDefined Summary Report ######
     if statistic_data.event_summary.userdefined_items:
         all_row_values = []
+        gpu_total_time = statistic_data.event_summary.model_perspective_items[
+            'ProfileStep'].general_gpu_time
         if thread_sep == True:
             userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
         else:
@@ -1319,6 +1462,10 @@ def _build_table(statistic_data,
                 items.items(), key=lambda x: x[1].min_general_gpu_time)
 
         for name, item in sorted_items:
+            if gpu_total_time == 0:
+                gpu_ratio = 0
+            else:
+                gpu_ratio = float(item.general_gpu_time) / gpu_total_time
             row_values = [
                 name,
                 item.call,
@@ -1341,8 +1488,7 @@ def _build_table(statistic_data,
                         item.max_general_gpu_time, unit=time_unit),
                     format_time(
                         item.min_general_gpu_time, unit=time_unit),
-                    format_ratio(
-                        float(item.general_gpu_time) / total_time)),
+                    format_ratio(gpu_ratio)),
             ]
             all_row_values.append(row_values)
 
diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py
index 291326478e91bcf6ccce4bee42dc5b77f853cc15..6ae3fe4e60b927803e8b4df5affaac6f6d5f3801 100644
--- a/python/paddle/profiler/utils.py
+++ b/python/paddle/profiler/utils.py
@@ -20,6 +20,8 @@ from contextlib import ContextDecorator
 from paddle.fluid import core
 from paddle.fluid.core import (_RecordEvent, TracerEventType)
 
+_is_profiler_used = False
+
 _AllowedEventTypeList = [
     TracerEventType.Dataloader, TracerEventType.ProfileStep,
     TracerEventType.UserDefined, TracerEventType.Forward,
@@ -91,6 +93,8 @@ class RecordEvent(ContextDecorator):
             result = data1 - data2
             record_event.end()
         """
+        if not _is_profiler_used:
+            return
         if self.event_type not in _AllowedEventTypeList:
             warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
                 can be recorded.".format(*_AllowedEventTypeList))
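
A minimal usage sketch of the behavior this patch introduces (assuming a
GPU build; the Kernel Summary section only appears when device kernel
events were captured):

    import paddle
    import paddle.profiler as profiler

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    with profiler.RecordEvent(name='warmup'):  # no Profiler running: no-op
        y = x / 2.0

    with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
                                    profiler.ProfilerTarget.GPU]) as prof:
        with profiler.RecordEvent(name='scale'):  # recorded normally
            y = x / 2.0
    prof.summary(sorted_by=profiler.SortedKeys.GPUTotal, time_unit='ms')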