Commit d251028d (unverified)
Add profiler backend

Authored by chenjian on Aug 23, 2022; committed via GitHub on Aug 23, 2022.
Parent: 8b1815f3
Showing 19 changed files with 4848 additions and 54 deletions (+4848 −54).
visualdl/component/base_component.py                        +55    −51
visualdl/component/profiler/__init__.py                     +14    −0
visualdl/component/profiler/parser/__init__.py              +14    −0
visualdl/component/profiler/parser/const_description.py     +159   −0
visualdl/component/profiler/parser/distributed_parser.py    +163   −0
visualdl/component/profiler/parser/event_node.py            +434   −0
visualdl/component/profiler/parser/kernel_parser.py         +175   −0
visualdl/component/profiler/parser/memory_parser.py         +180   −0
visualdl/component/profiler/parser/operator_parser.py       +133   −0
visualdl/component/profiler/parser/overview_parser.py       +456   −0
visualdl/component/profiler/parser/trace_parser.py          +20    −0
visualdl/component/profiler/parser/utils.py                 +494   −0
visualdl/component/profiler/profiler_data.py                +1802  −0
visualdl/component/profiler/profiler_reader.py              +199   −0
visualdl/component/profiler/profiler_server.py              +397   −0
visualdl/component/profiler/run_manager.py                  +134   −0
visualdl/server/api.py                                      +9     −0
visualdl/server/app.py                                      +9     −2
visualdl/version.py                                         +1     −1
visualdl/component/base_component.py
@@ -12,15 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import numpy as np
from PIL import Image
from visualdl.proto.record_pb2 import Record


def scalar(tag, value, step, walltime=None):
    """Package data to one scalar.

    Args:
        tag (string): Data identifier
        value (float): Value of scalar
@@ -52,8 +51,7 @@ def meta_data(tag='meta_data_tag', display_name="", step=0, walltime=None):
    """
    meta = Record.MetaData(display_name=display_name)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, meta_data=meta)
    ])
@@ -82,8 +80,8 @@ def imgarray2bytes(np_array):


def make_grid(I, ncols=8):  # noqa: E741
    assert isinstance(
        I, np.ndarray), 'plugin error, should pass numpy array here'
    if I.shape[1] == 1:
        I = np.concatenate([I, I, I], 1)  # noqa: E741
    assert I.ndim == 4 and I.shape[1] == 3 or I.shape[1] == 4
@@ -113,9 +111,11 @@ def convert_to_HWC(tensor, input_format):
    Return:
        Image of format `HWC`.
    """
    assert (len(set(input_format)) == len(input_format)
            ), "You can not use the same dimension shordhand twice. \
                input_format: {}".format(input_format)
    assert (len(tensor.shape) == len(input_format)
            ), "size of input tensor and input format are different. \
                tensor shape: {}, input_format: {}".format(
                    tensor.shape, input_format)
    input_format = input_format.upper()
@@ -129,7 +129,8 @@ def convert_to_HWC(tensor, input_format):
        index = [input_format.find(c) for c in 'HWC']
        tensor_HWC = tensor.transpose(index)
        if tensor_HWC.shape[2] == 1:
            tensor_HWC = np.concatenate(
                [tensor_HWC, tensor_HWC, tensor_HWC], 2)
        return tensor_HWC

    if len(input_format) == 2:
@@ -202,7 +203,8 @@ def embedding(tag, labels, hot_vectors, step, labels_meta=None, walltime=None):
    for label, hot_vector in zip(labels, hot_vectors):
        if not isinstance(label, list):
            label = [label]
        embeddings.embeddings.append(
            Record.Embedding(label=label, vectors=hot_vector))
    return Record(values=[
        Record.Value(
@@ -325,7 +327,8 @@ def hparam(name, hparam_dict, metric_list, walltime):
            hparamInfo.string_value = v
            hm.hparamInfos.append(hparamInfo)
        else:
            print("The value of %s must be int, float or str, not %s" %
                  (k, str(type(v))))
    for metric in metric_list:
        metricInfo = Record.HParam.HparamInfo()
        metricInfo.name = metric
@@ -333,8 +336,7 @@ def hparam(name, hparam_dict, metric_list, walltime):
        hm.metricInfos.append(metricInfo)

    return Record(values=[
        Record.Value(id=1, tag="hparam", timestamp=walltime, hparam=hm)
    ])
@@ -389,7 +391,12 @@ def compute_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def pr_curve(tag,
             labels,
             predictions,
             step,
             walltime,
             num_thresholds=127,
             weights=None):
    """Package data to one pr_curve.
@@ -409,7 +416,8 @@ def pr_curve(tag, labels, predictions, step, walltime, num_thresholds=127,
    num_thresholds = min(num_thresholds, 127)
    prcurve_map = compute_curve(labels, predictions, num_thresholds, weights)
    return pr_curve_raw(
        tag=tag,
        tp=prcurve_map['tp'],
        fp=prcurve_map['fp'],
        tn=prcurve_map['tn'],
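A usage sketch, not part of this diff: pr_curve packages raw labels and scores into a protobuf Record. The tag and values below are illustrative.

import time
# Illustrative call: package a PR curve for step 10 with 33 thresholds.
record = pr_curve(tag='train/pr', labels=[0, 1, 1, 0],
                  predictions=[0.1, 0.9, 0.4, 0.3],
                  step=10, walltime=int(time.time() * 1000), num_thresholds=33)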
@@ -441,7 +449,6 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    Return:
        Package with format of record_pb2.Record

    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -456,15 +463,10 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    if isinstance(recall, np.ndarray):
        recall = recall.astype(int).tolist()
    """
    prcurve = Record.PRCurve(
        TP=tp, FP=fp, TN=tn, FN=fn, precision=precision, recall=recall)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, pr_curve=prcurve)
    ])
@@ -518,7 +520,13 @@ def compute_roc_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def roc_curve(tag,
              labels,
              predictions,
              step,
              walltime,
              num_thresholds=127,
              weights=None):
    """Package data to one roc_curve.

    Args:
        tag (string): Data identifier
@@ -533,9 +541,11 @@ def roc_curve(tag, labels, predictions, step, walltime, num_thresholds=127, weights=None):
        Package with format of record_pb2.Record
    """
    num_thresholds = min(num_thresholds, 127)
    roc_curve_map = compute_roc_curve(labels, predictions, num_thresholds,
                                      weights)
    return roc_curve_raw(
        tag=tag,
        tp=roc_curve_map['tp'],
        fp=roc_curve_map['fp'],
        tn=roc_curve_map['tn'],
@@ -563,7 +573,6 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    Return:
        Package with format of record_pb2.Record

    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -578,12 +587,7 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    if isinstance(fpr, np.ndarray):
        fpr = fpr.astype(int).tolist()
    """
    roc_curve = Record.ROC_Curve(
        TP=tp, FP=fp, TN=tn, FN=fn, tpr=tpr, fpr=fpr)
    return Record(values=[
        Record.Value(
            id=step, tag=tag, timestamp=walltime, roc_curve=roc_curve)
visualdl/component/profiler/__init__.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
visualdl/component/profiler/parser/__init__.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
visualdl/component/profiler/parser/const_description.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
__ALL__ = [
    'TOOLTIP_DEVICE_INFO_CN', 'TOOLTIP_MODEL_PERSPECTIVE_CN',
    'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN', 'TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN',
    'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN', 'TOOLTIP_DEVICE_INFO_EN',
    'TOOLTIP_MODEL_PERSPECTIVE_EN', 'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN',
    'TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN', 'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN'
]

TOOLTIP_DEVICE_INFO_CN = \
    '<b class="bold">CPU进程利用率:</b><br>' \
    '进程所利用到的CPU的时间 / ProfileStep的时间(即性能分析的时间跨度)<br>' \
    '<b class="bold">CPU系统利用率:</b><br>' \
    '整个系统所有进程利用到的CPU时间 / CPU总时间(ProfileStep的时间*CPU核心数)<br>' \
    '<b class="bold">GPU利用率:</b><br>' \
    '进程利用GPU计算的时间 / ProfileStep的时间,进程利用GPU计算的时间即是GPU Kernel计算的时间,越高越好<br>' \
    '<b class="bold">流处理器效率:</b><br>' \
    '对于流处理器处理某个GPU Kernel, 其效率为SM_Eff_i = min(Kernel所用的Blocks数量 / GPU的流处理器数量, 100%)。' \
    '流处理器效率为SM_Eff_i关于每个Kernel的执行时间加权和 / ProfileStep的时间<br>' \
    '<b class="bold">流处理器占用率:</b><br>' \
    '对于流处理器处理某个GPU Kernel, 其占用率Occu_i = 为活跃的warp数 / 能支持的最大warp数。流处理器占用率为Occu_i关于每个Kernel执行时间的加权平均<br>' \
    '<b class="bold">Tensor cores使用时间占比:</b><br>' \
    '使用Tensor Cores的GPU Kernel的计算时间 / 所有Kernel的计算时间<br>'

TOOLTIP_MODEL_PERSPECTIVE_CN = \
    '展示模型各阶段DataLoader, Forward, Backward, Optimization以及Other的总CPU和GPU时间。<br>' \
    'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>' \
    '<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>' \
    '<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>' \
    '<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>' \
    '<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>' \
    '<b class="bold">Other:</b> 其它时间<br>'

TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN = \
    '展示每一个ProfileStep内模型各阶段DataLoader, Forward, Backward, Optimization以及Other的CPU和GPU时间。<br>' \
    'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>' \
    '<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>' \
    '<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>' \
    '<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>' \
    '<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>' \
    '<b class="bold">Other:</b> 其它时间<br>'

TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN = \
    '展示不同类型的事件在模型各阶段DataLoader, Forward, Backward, Optimization以及Other的分布。<br>' \
    '<b class="bold">Operator:</b> 表示框架内的算子执行<br>' \
    '<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>' \
    '<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>' \
    '<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>' \
    '<b class="bold">Memset:</b> 表示GPU的显存值设置<br>' \
    '<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>' \
    '<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>' \
    '<b class="bold">Communication:</b> 表示分布式通信有关的事件<br>'

TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN = \
    '展示在模型各阶段DataLoader, Forward, Backward, Optimization以及Other所包含的各种事件的时间。<br>' \
    '<b class="bold">Operator:</b> 表示框架内的算子执行<br>' \
    '<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>' \
    '<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>' \
    '<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>' \
    '<b class="bold">Memset:</b> 表示GPU的显存值设置<br>' \
    '<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>' \
    '<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>' \
    '<b class="bold">Communication:</b> 表示分布式通信有关的数据通信和计算事件<br>'

TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN = \
    '展示模型在每个迭代过程中通信、计算以及两者重叠部分的时间。<br>' \
    '<b class="bold">ProfileStep:</b> 表示某一步迭代的总时间<br>' \
    '<b class="bold">Communication:</b> 表示和通信相关的时间,包括框架内打的Communication事件、和通信有关的算子和Kernel(nccl)执行的时间<br>' \
    '<b class="bold">Computation:</b> 表示GPU Kernel计算的时间,但是去除了和通信有关的Kernel(nccl)<br>' \
    '<b class="bold">Overlap:</b> 表示通信和计算过程并行执行时候时间相互重叠的部分<br>' \
    '<b class="bold">Others:</b> 表示通信和计算之外的时间<br>'

TOOLTIP_DEVICE_INFO_EN = \
    '<b class="bold">CPU Process Utilization:</b><br>' \
    'Process CPU time / ProfileStep time(total time of profiling)<br>' \
    '<b class="bold">CPU System Utilization:</b><br>' \
    'Sum of system\'s all processes CPU time/ CPU total time(ProfileStep time* #CPU Core)<br>' \
    '<b class="bold">GPU Utilization:</b><br>' \
    'GPU busy time / ProfileStep time,GPU busy time is the time during in which at least one GPU kernel is running on it.<br>' \
    '<b class="bold">Est. SM Efficiency:</b><br>' \
    'The SM efficiency for one kernel can be denoted as SM_Eff_i = min(blocks of this kernel / SM number of this GPU, 100%).' \
    'Est. SM efficiency of GPU is the weighted sum of SM_Eff_i across all kernels / ProfileStep time<br>' \
    '<b class="bold">Est. Achieved Occupancy:</b><br>' \
    'The SM occupancy for one kernel can be denoted as Occu_i = active warps on an SM / maximum number of active warps supported by the SM. ' \
    'Est. SM occupancy of GPU is the weighted average of Occu_i across all kernels<br>' \
    '<b class="bold">Tensor cores ratio:</b><br>' \
    'Sum of kernel time using Tensor Cores / Sum of total kernel time<br>'

TOOLTIP_MODEL_PERSPECTIVE_EN = \
    'Present CPU and GPU time for each stage of a model, i.e. DataLoader, Forward, Backward, Optimization and Other.<br>' \
    'CPU time is the execution time for code,GPU time is the calculation time of kernels launched in the stage.<br>' \
    '<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>' \
    '<b class="bold">Forward:</b> denote model forward<br>' \
    '<b class="bold">Backward:</b> denote gradient back-propagate<br>' \
    '<b class="bold">Optimization:</b> denote parameters update<br>' \
    '<b class="bold">Other:</b> other time out of above range'

TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN = \
    'Present CPU and GPU time in each ProfileStep for each stage of a model, i.e. DataLoader, Forward, Backward, Optimization and Other.<br>' \
    'CPU time is the execution time for code,GPU time is the calculation time of kernels launched in the stage.<br>' \
    '<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>' \
    '<b class="bold">Forward:</b> denote model forward<br>' \
    '<b class="bold">Backward:</b> denote gradient back-propagate<br>' \
    '<b class="bold">Optimization:</b> denote parameters update<br>' \
    '<b class="bold">Other:</b> other time out of above range'

TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN = \
    'Present the distribution of each kind of events across DataLoader, Forward, Backward, Optimization and Other stage.<br>' \
    '<b class="bold">Operator:</b> denote operator execution<br>' \
    '<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>' \
    '<b class="bold">Kernel:</b> denote kernel execution on GPU<br>' \
    '<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>' \
    '<b class="bold">Memset:</b> denote memory data set on GPU<br>' \
    '<b class="bold">UserDefined:</b> denote events defined by users in python script<br>' \
    '<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>' \
    '<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'

TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN = \
    'Present the time of each kind of events included in DataLoader, Forward, Backward, Optimization and Other stage.<br>' \
    '<b class="bold">Operator:</b> denote operator execution<br>' \
    '<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>' \
    '<b class="bold">Kernel:</b> denote kernel execution on GPU<br>' \
    '<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>' \
    '<b class="bold">Memset:</b> denote memory data set on GPU<br>' \
    '<b class="bold">UserDefined:</b> denote events defined by users in python script<br>' \
    '<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>' \
    '<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'

TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN = \
    'Present the time of communication, computation and their overlap in program.<br>' \
    '<b class="bold">ProfileStep:</b> denote an iteration step of training process<br>' \
    '<b class="bold">Communication:</b> denote the time related to communication, including events of communication type in paddle framework、communication-related operators and GPU Kernels(nccl)<br>' \
    '<b class="bold">Computation:</b> denote the computation time of GPU Kernels,except communication-related Kernels(nccl)<br>' \
    '<b class="bold">Overlap:</b> denote the overlap time between Communication and Computation when they are executed parallelly.<br>' \
    '<b class="bold">Others:</b> denote the time out of Communication and Computation<br>'
visualdl/component/profiler/parser/distributed_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict

from .utils import get_device_nodes
from .utils import intersection_ranges
from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree

_CommunicationOpName = ['allreduce', 'broadcast', 'rpc']


class DistributedParser:
    r"""
    Analysis communication and computation time range, and their overlap.
    The computation time is all kernel except kernels for communication like nccl.
    """

    def __init__(self):
        self.steps_data = defaultdict(lambda: defaultdict(list))
        self.calls = defaultdict(lambda: defaultdict(int))
        self.steps_time = defaultdict(lambda: defaultdict(float))
        self.profile_steps_time = {}

    def parse(self, nodetrees):
        '''
        Collect all communication and computation time ranges.
        '''
        total_time = 0.0
        nodetrees = rebuild_node_trees(nodetrees)
        thread2hostnodes = traverse_tree(nodetrees)
        thread_count = 0
        for threadid, hostnodes in thread2hostnodes.items():
            for hostnode in hostnodes[1:]:  # skip root node
                # case 1: TracerEventType is Communication
                if hostnode.type == 'ProfileStep':
                    if thread_count == 0:
                        total_time += (hostnode.end_ns - hostnode.start_ns)
                    self._parse_step(hostnode)
                    continue
            thread_count += 1
        new_steps_data = defaultdict(lambda: defaultdict(list))
        self.profile_steps_time['All'] = total_time
        for step, step_data in self.steps_data.items():
            self.calls[step]['cpu_communication_range'] = len(
                step_data['cpu_communication_range'])
            self.calls[step]['gpu_communication_range'] = len(
                step_data['gpu_communication_range'])
            new_steps_data[step]['cpu_communication_range'] = merge_self_ranges(
                step_data['cpu_communication_range'], is_sorted=False)
            new_steps_data[step]['gpu_communication_range'] = merge_self_ranges(
                step_data['gpu_communication_range'], is_sorted=False)
            new_steps_data[step]['communication_range'] = merge_ranges(
                new_steps_data[step]['cpu_communication_range'],
                new_steps_data[step]['gpu_communication_range'],
                is_sorted=True)
            new_steps_data[step]['computation_range'] = merge_self_ranges(
                step_data['computation_range'], is_sorted=False)
            new_steps_data[step]['overlap_range'] = intersection_ranges(
                new_steps_data[step]['communication_range'],
                new_steps_data[step]['computation_range'],
                is_sorted=True)
            self.steps_time[step]['communication_time'] = sum_ranges(
                new_steps_data[step]['communication_range'])
            self.steps_time[step]['computation_time'] = sum_ranges(
                new_steps_data[step]['computation_range'])
            self.steps_time[step]['overlap_time'] = sum_ranges(
                new_steps_data[step]['overlap_range'])
            self.steps_time[step]['others_time'] = \
                self.profile_steps_time[step] - \
                self.steps_time[step]['communication_time'] - \
                self.steps_time[step]['computation_time'] + \
                self.steps_time[step]['overlap_time']
        self.steps_data = new_steps_data

    def _parse_step(self, profile_step_node):
        step = profile_step_node.name.split('#')[1]
        self.profile_steps_time[step] = \
            profile_step_node.end_ns - profile_step_node.start_ns
        nodes = []
        stack = []
        stack.append(profile_step_node)
        while stack:
            current_node = stack.pop()
            nodes.append(current_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
        for hostnode in nodes:
            if hostnode.type == 'Communication':
                self.steps_data[step]['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                self.steps_data['All']['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                device_nodes = get_device_nodes(hostnode)
                for device_node in device_nodes:
                    if device_node.type == 'Kernel':
                        self.steps_data[step]['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
                        self.steps_data['All']['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
            # case 2: TracerEventType is Operator but is communication op
            elif hostnode.type == 'Operator' and any([
                    name in hostnode.name.lower()
                    for name in _CommunicationOpName
            ]):
                self.steps_data[step]['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                self.steps_data['All']['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                device_nodes = get_device_nodes(hostnode)
                for device_node in device_nodes:
                    if device_node.type == 'Kernel':
                        self.steps_data[step]['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
                        self.steps_data['All']['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
            # case 3: Others, filter kernels named with nccl
            else:
                for runtimenode in hostnode.runtime_node:
                    for devicenode in runtimenode.device_node:
                        if devicenode.type == 'Kernel':
                            if 'nccl' in devicenode.name.lower():
                                self.steps_data[step]['gpu_communication_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                                self.steps_data['All']['gpu_communication_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                            else:
                                self.steps_data[step]['computation_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                                self.steps_data['All']['computation_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
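DistributedParser leans on range helpers (merge_self_ranges, intersection_ranges, sum_ranges) from the utils module added elsewhere in this commit. A toy illustration of the overlap accounting, assuming half-open (start_ns, end_ns) ranges that are already merged and sorted; the intersect helper below is a stand-in, not the commit's implementation:

def intersect(a, b):
    # Intersection of two sorted, merged range lists.
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        lo, hi = max(a[i][0], b[j][0]), min(a[i][1], b[j][1])
        if lo < hi:
            out.append((lo, hi))
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out

comm = [(0, 40), (90, 120)]                 # merged communication ranges (ns)
comp = [(30, 100)]                          # merged computation ranges (ns)
overlap = intersect(comm, comp)             # [(30, 40), (90, 100)]
comm_t = sum(e - s for s, e in comm)        # 70
comp_t = sum(e - s for s, e in comp)        # 70
over_t = sum(e - s for s, e in overlap)     # 20
others_t = 120 - comm_t - comp_t + over_t   # same formula as others_time above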
visualdl/component/profiler/parser/event_node.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
import functools
import json
import re
import sys

_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')

host_node_type_map = {
    "Operator", "Dataloader", "ProfileStep", "CudaRuntime", "UserDefined",
    "OperatorInner", "Forward", "Backward", "Optimization", "Communication",
    "PythonOp", "PythonUserDefined"
}
device_node_type_map = {"Kernel", "Memcpy", "Memset"}
memory_node_event_map = {"Allocate", "Free", "ReservedAllocate", "ReservedFree"}


class HostNode:
    def __init__(self):
        self.name = None
        self.type = None
        self.start_ns = 0
        self.end_ns = 0
        self.process_id = 0
        self.thread_id = 0
        self.correlation_id = -1
        self.input_shapes = {}
        self.dtypes = {}
        self.callstack = ""
        self.children_node = []
        self.runtime_node = []
        self.device_node = []
        self.mem_node = []

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.name = json_obj['name'].replace(
            _show_name_pattern.match(json_obj['name']).group(2), "")
        self.type = json_obj['cat']
        self.start_ns = int(
            float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
        self.end_ns = int(
            float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
        self.process_id = json_obj['pid']
        self.thread_id = json_obj['tid'].replace(
            _show_tid_pattern.match(json_obj['tid']).group(1), "")
        self.correlation_id = json_obj['args'][
            'correlation id'] if 'correlation id' in json_obj['args'] else -1
        self.input_shapes = json_obj['args'][
            'input_shapes'] if 'input_shapes' in json_obj['args'] else {}
        self.dtypes = json_obj['args'][
            'input_dtypes'] if 'input_dtypes' in json_obj['args'] else {}
        self.callstack = json_obj['args'][
            'callstack'] if 'callstack' in json_obj['args'] else ""
        self.children_node = []
        self.runtime_node = []
        self.device_node = []
        self.mem_node = []
        return self
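The two regular expressions above strip display decorations from trace strings: _show_name_pattern removes a trailing bracketed block from event names, and _show_tid_pattern removes a parenthesized suffix from thread ids. A quick illustration with made-up strings:

import re

_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')

name = 'conv2d[64]'      # hypothetical event name with a bracketed suffix
tid = 'thread(140224)'   # hypothetical thread id with a parenthesized suffix
print(name.replace(_show_name_pattern.match(name).group(2), ''))  # conv2d
print(tid.replace(_show_tid_pattern.match(tid).group(1), ''))     # thread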
class MemNode:
    def __init__(self):
        self.type = None
        self.timestamp_ns = 0
        self.addr = 0
        self.process_id = 0
        self.thread_id = 0
        self.increase_bytes = 0
        self.place = None
        self.current_allocated = 0
        self.current_reserved = 0
        self.peak_allocated = 0
        self.peak_reserved = 0

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.type = json_obj['cat']
        self.timestamp_ns = json_obj['ts'] * 1000
        self.addr = hex(int(
            json_obj['args']['addr'])) if 'addr' in json_obj['args'] else 0
        self.process_id = json_obj['pid']
        self.thread_id = json_obj['tid'].replace(
            _show_tid_pattern.match(json_obj['tid']).group(1), "")
        self.increase_bytes = json_obj['args'][
            'increase_bytes'] if 'increase_bytes' in json_obj['args'] else 0
        self.place = json_obj['args'][
            'place'] if 'place' in json_obj['args'] else "Place(cpu)"
        self.current_allocated = json_obj['args'][
            'current_allocated'] if 'current_allocated' in json_obj['args'] else 0
        self.current_reserved = json_obj['args'][
            'current_reserved'] if 'current_reserved' in json_obj['args'] else 0
        self.peak_allocated = json_obj['args'][
            'peak_allocated'] if 'peak_allocated' in json_obj['args'] else 0
        self.peak_reserved = json_obj['args'][
            'peak_reserved'] if 'peak_reserved' in json_obj['args'] else 0
        return self


class DeviceNode:
    def __init__(self):
        self.name = None
        self.type = None
        self.start_ns = 0
        self.end_ns = 0
        self.device_id = 0
        self.stream_id = 0
        self.context_id = 0
        self.correlation_id = 0
        self.block_x, self.block_y, self.block_z = [0, 0, 0]
        self.grid_x, self.grid_y, self.grid_z = [0, 0, 0]
        self.shared_memory = 0
        self.registers_per_thread = 0
        self.num_bytes = 0
        self.value = 0
        self.occupancy = 0
        self.blocks_per_sm = 0
        self.warps_per_sm = 0

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.name = json_obj['name'].replace(
            _show_name_pattern.match(json_obj['name']).group(2), "")
        self.type = json_obj['cat']
        self.start_ns = int(
            float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
        self.end_ns = int(
            float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
        self.device_id = json_obj['pid']
        self.stream_id = json_obj['tid']
        self.context_id = json_obj['args'][
            'context'] if 'context' in json_obj['args'] else 0
        self.correlation_id = json_obj['args']['correlation id']
        self.block_x, self.block_y, self.block_z = json_obj['args'][
            'block'] if 'block' in json_obj['args'] else [0, 0, 0]
        self.grid_x, self.grid_y, self.grid_z = json_obj['args'][
            'grid'] if 'grid' in json_obj['args'] else [0, 0, 0]
        self.shared_memory = json_obj['args'][
            'shared memory'] if 'shared memory' in json_obj['args'] else 0
        self.registers_per_thread = json_obj['args'][
            'registers per thread'] if 'registers per thread' in json_obj['args'] else 0
        self.num_bytes = json_obj['args'][
            'bytes'] if 'bytes' in json_obj['args'] else 0
        self.value = json_obj['args'][
            'value'] if 'value' in json_obj['args'] else 0
        self.occupancy = json_obj['args'][
            'theoretical achieved occupancy %'] \
            if 'theoretical achieved occupancy %' in json_obj['args'] else 0
        self.blocks_per_sm = json_obj['args'][
            "blocks per SM"] if "blocks per SM" in json_obj['args'] else 0
        self.warps_per_sm = json_obj['args'][
            "warps per SM"] if "warps per SM" in json_obj['args'] else 0
        return self


class ProfilerResult:
    def __init__(self, json_data):
        self.device_infos = None
        self.span_idx = None
        self.data = None
        self.extra_info = None
        self.schema_version = None
        self.has_hostnodes = True
        self.has_devicenodes = True
        self.has_memnodes = True
        self.parse(json_data)
        self.content = json_data
        self.start_in_timeline_ns = None

    def parse(self, json_data):
        self.schema_version = json_data['schemaVersion']
        self.span_idx = json_data['span_indx']
        self.device_infos = {
            device_info['id']: device_info
            for device_info in json_data['deviceProperties']
        }
        hostnodes = []
        runtimenodes = []
        devicenodes = []
        memnodes = []
        for event in json_data['traceEvents']:
            if not event or (event['ph'] != 'X' and event['ph'] != 'i'):
                continue
            if event['cat'] in host_node_type_map:
                if event['cat'] == 'CudaRuntime' or event['cat'] == 'MluRuntime':
                    runtimenodes.append(HostNode.from_json(event))
                else:
                    hostnodes.append(HostNode.from_json(event))
                    if hostnodes[-1].start_ns == 0:
                        self.start_in_timeline_ns = int(event['ts']) * 1000
            elif event['cat'] in device_node_type_map:
                devicenodes.append(DeviceNode.from_json(event))
            elif event['cat'] in memory_node_event_map:
                memnodes.append(MemNode.from_json(event))
        if memnodes:
            for memnode in memnodes:
                assert self.start_in_timeline_ns is not None
                memnode.timestamp_ns = memnode.timestamp_ns - self.start_in_timeline_ns
        if not hostnodes:
            self.has_hostnodes = False
        if not devicenodes:
            self.has_devicenodes = False
        if not memnodes:
            self.has_memnodes = False
        self.data = self.build_tree(hostnodes, runtimenodes, devicenodes,
                                    memnodes)
        self.extra_info = json_data['ExtraInfo']

    def build_tree(  # noqa: C901
            self, hostnodes, runtimenodes, devicenodes, memnodes):
        thread2host_event_nodes = collections.defaultdict(list)
        thread2runtime_event_nodes = collections.defaultdict(list)
        thread2mem_event_nodes = collections.defaultdict(list)
        correlation_id2runtime_event_node = {}
        thread_event_trees = {}
        thread_ids = set()
        for hostnode in hostnodes:
            thread2host_event_nodes[hostnode.thread_id].append(hostnode)
            thread_ids.add(hostnode.thread_id)
        # construct thread2runtime_event_nodes and correlation_id2runtime_event_node
        for runtimenode in runtimenodes:
            thread2runtime_event_nodes[runtimenode.thread_id].append(runtimenode)
            thread_ids.add(runtimenode.thread_id)
            correlation_id2runtime_event_node[
                runtimenode.correlation_id] = runtimenode
        # associate CudaRuntimeTraceEventNode and DeviceTraceEventNode
        # construct correlation_id2device_event_nodes
        for devicenode in devicenodes:
            if devicenode.correlation_id not in correlation_id2runtime_event_node:
                continue
            runtimenode = correlation_id2runtime_event_node[
                devicenode.correlation_id]
            runtimenode.device_node.append(devicenode)
        # construct thread2mem_event_nodes
        for memnode in memnodes:
            thread2mem_event_nodes[memnode.thread_id].append(memnode)

        # sort host event nodes and runtime event nodes according to start_ns and
        # end_ns
        # the smaller start_ns is, the further ahead position is.
        # when start_ns of two nodes are equal, the one with bigger end_ns should be
        # ahead.
        def compare_hostnode_func(hostnode1, hostnode2):
            if hostnode1.start_ns < hostnode2.start_ns:
                return -1
            if hostnode1.start_ns == hostnode2.start_ns:
                if hostnode1.end_ns > hostnode2.end_ns:
                    return -1
            return 1

        def compare_memnode_func(memnode1, memnode2):
            if memnode1.timestamp_ns <= memnode2.timestamp_ns:
                return -1
            return 1

        for threadid, hostnodes in thread2host_event_nodes.items():
            thread2host_event_nodes[threadid] = sorted(
                hostnodes, key=functools.cmp_to_key(compare_hostnode_func))
        for threadid, runtimenodes in thread2runtime_event_nodes.items():
            thread2runtime_event_nodes[threadid] = sorted(
                runtimenodes, key=functools.cmp_to_key(compare_hostnode_func))
        for threadid, memnodes in thread2mem_event_nodes.items():
            thread2mem_event_nodes[threadid] = sorted(
                memnodes, key=functools.cmp_to_key(compare_memnode_func))
        # construct trees
        for threadid in thread_ids:
            thread_event_trees[threadid] = self._build_tree_relationship(
                thread2host_event_nodes[threadid],
                thread2runtime_event_nodes[threadid],
                thread2mem_event_nodes[threadid])
        return thread_event_trees

    def _build_tree_relationship(  # noqa: C901
            self, host_event_nodes, runtime_event_nodes, mem_event_nodes):
        # root node
        root_node = HostNode()
        root_node.name, root_node.type, root_node.start_ns, root_node.end_ns = \
            "root node", "UserDefined", 0, sys.maxsize
        # push root node into node_stack
        node_stack = []
        node_stack.append(root_node)
        # handle host_event_nodes
        for host_node in host_event_nodes:
            while True:
                stack_top_node = node_stack[-1]
                if host_node.start_ns < stack_top_node.end_ns:
                    stack_top_node.children_node.append(host_node)
                    node_stack.append(host_node)
                    break
                else:
                    node_stack.pop()
                    # insert runtime node
                    # select runtime nodes which time range within stack_top_node
                    hasenter = False
                    firstposition = 0
                    lastposition = len(runtime_event_nodes)
                    for i, runtimenode in enumerate(runtime_event_nodes):
                        if runtimenode.start_ns >= stack_top_node.start_ns and \
                                runtimenode.end_ns <= stack_top_node.end_ns:
                            if not hasenter:
                                firstposition = i
                                hasenter = True
                            stack_top_node.runtime_node.append(runtimenode)
                        else:
                            # from this runtime node, not within stack_top_node, erase the
                            # nodes from runtime_event_nodes
                            if runtimenode.start_ns > stack_top_node.end_ns:
                                lastposition = i
                                break
                    if hasenter:
                        del runtime_event_nodes[firstposition:lastposition]
        # to insert left runtimenode into host_event_nodes
        while node_stack:
            stack_top_node = node_stack.pop()
            # insert runtime node
            # select runtime nodes which time range within stack_top_node
            firstposition = 0
            lastposition = len(runtime_event_nodes)
            hasenter = False
            for i, runtimenode in enumerate(runtime_event_nodes):
                if runtimenode.start_ns >= stack_top_node.start_ns and \
                        runtimenode.end_ns <= stack_top_node.end_ns:
                    if not hasenter:
                        firstposition = i
                        hasenter = True
                    stack_top_node.runtime_node.append(runtimenode)
                else:
                    # from this runtime node, not within stack_top_node, erase the
                    # nodes from runtime_event_nodes
                    if runtimenode.start_ns > stack_top_node.end_ns:
                        lastposition = i
                        break
            if hasenter:
                del runtime_event_nodes[firstposition:lastposition]

        # build relationship between host event node and mem event node
        # First, post-order traverse the tree. Then, insert the memory and op
        # supplement node into correct host nodes.
        stack = []
        flag_stack = []
        post_order_nodes = []
        stack.append(root_node)
        flag_stack.append(0)
        while stack:
            current_node = stack.pop()
            flag = flag_stack.pop()
            if flag == 0:
                stack.append(current_node)
                flag_stack.append(1)
                for child in current_node.children_node[::-1]:
                    stack.append(child)
                    flag_stack.append(0)
            else:
                post_order_nodes.append(current_node)
        for node in post_order_nodes:
            hasenter = False
            firstposition = 0
            lastposition = len(mem_event_nodes)
            for i, mem_node in enumerate(mem_event_nodes):
                if mem_node.timestamp_ns >= node.start_ns and \
                        mem_node.timestamp_ns <= node.end_ns:
                    node.mem_node.append(mem_node)
                    if not hasenter:
                        firstposition = i
                        hasenter = True
                else:
                    if mem_node.timestamp_ns > node.end_ns:
                        lastposition = i
                        break
            if hasenter:
                del mem_event_nodes[firstposition:lastposition]
        return root_node
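The node-stack loop above nests host events by interval containment: events arrive sorted by (start ascending, end descending), and each event becomes a child of the deepest interval that is still open. A toy version of that invariant, with made-up events:

# Toy events: (name, start_ns, end_ns), pre-sorted like compare_hostnode_func orders them.
events = [('step', 0, 100), ('forward', 10, 60), ('conv', 20, 30), ('backward', 60, 90)]
stack, children = [], {name: [] for name, _, _ in events}
for name, start, end in events:
    while stack and start >= stack[-1][2]:  # pop intervals that already closed
        stack.pop()
    if stack:
        children[stack[-1][0]].append(name)
    stack.append((name, start, end))
print(children)  # {'step': ['forward', 'backward'], 'forward': ['conv'], ...}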
    def get_data(self):
        return self.data

    def get_extra_info(self):
        return self.extra_info

    def get_schema_version(self):
        return self.schema_version

    def get_device_infos(self):
        return self.device_infos

    def get_span_idx(self):
        return self.span_idx

    def has_device(self):
        return self.has_devicenodes

    def has_host(self):
        return self.has_hostnodes

    def has_memory(self):
        return self.has_memnodes

    def save(self, path, format):
        pass


def load_profiler_json(file_name):
    content = json.load(open(file_name, 'r'))
    return ProfilerResult(content)
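A hypothetical usage sketch (the file path is made up; profiler logs are Chrome-trace style JSON files):

result = load_profiler_json('./profiler_log/worker.1661223000.json')  # illustrative path
trees = result.get_data()  # thread id -> root HostNode of that thread's event tree
print(result.get_schema_version(), result.has_device(), result.has_memory())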
visualdl/component/profiler/parser/kernel_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections


class TC_Allowlist(dict):
    # Refer to https://github.com/NVIDIA/PyProf/blob/fd1b2902e3306119eee40ba6b6e8b2f816920c29/pyprof/prof/tc.py#L19
    allowlist = [
        'h884', 's884', 'h1688', 's1688', 'hmma', 'i8816', '16816',
        'dgrad_1x1_stride_2x2', 'first_layer_wgrad_kernel', 'conv1x1',
        'conv2d_c1_k1', 'direct_group', 'xmma_implicit_gemm',
        'xmma_sparse_conv', 'xmma_warp_specialized_implicit_gemm', 'xmma_gemm',
        'xmma_sparse_gemm', 'c1688'
    ]

    def __init__(self):
        pass

    def __contains__(self, item):
        # If kernel name contains substring equal to any one in allowlist,
        # then it uses tensor core.
        for pattern in self.allowlist:
            if pattern in item:
                return True
        return False


_allow_list = TC_Allowlist()
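Membership is substring-based, so any kernel whose mangled name embeds an allowlisted token counts as tensor-core work. For example (kernel names are illustrative):

print('volta_h884gemm_128x128_ldg8_nn' in _allow_list)  # True: contains 'h884'
print('elementwise_add_kernel' in _allow_list)          # False: no allowlisted token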
class DeviceItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.tensorcore_used = True if name in _allow_list else False
        self.sum_blocks_per_sm = 0.0
        self.sum_occupancy = 0.0

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_item(self, node):
        self.call += 1
        self.add_gpu_time(node.end_ns - node.start_ns)
        self.sum_blocks_per_sm += node.blocks_per_sm
        self.sum_occupancy += node.occupancy


class KernelParser:
    def __init__(self):
        self.kernel_items = {}  # for kernel summary
        self.kernel_items_with_op_name_attributes = collections.defaultdict(dict)
        self.gpu_ids = set()
        self.occupancy = 0.0
        self.sm_efficiency = 0.0
        self.tensor_core_ratio = 0.0

    def parse(self, nodelists):  # noqa: C901
        total_duration = 0.0
        weighted_occupancy = 0.0
        weighted_sm_efficiency = 0.0
        for threadid, nodes in nodelists.items():
            for node in nodes:
                if node.type == 'Operator':
                    op_name = node.name
                    for children in node.children_node:
                        if children.type == 'OperatorInner':
                            for runtime_node in children.runtime_node:
                                for device_node in runtime_node.device_node:
                                    if device_node.type == 'Kernel':
                                        op_attribute_name = \
                                            self._translate_op_name_attributes_to_string(
                                                op_name, device_node)
                                        if op_attribute_name not in \
                                                self.kernel_items_with_op_name_attributes[
                                                    device_node.name]:
                                            self.kernel_items_with_op_name_attributes[
                                                device_node.name][
                                                    op_attribute_name] = DeviceItem(
                                                        device_node.name)
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name][
                                                op_attribute_name].add_item(device_node)
                    for runtime_node in node.runtime_node:
                        for device_node in runtime_node.device_node:
                            if device_node.type == 'Kernel':
                                op_attribute_name = \
                                    self._translate_op_name_attributes_to_string(
                                        op_name, device_node)
                                if op_attribute_name not in \
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name]:
                                    self.kernel_items_with_op_name_attributes[
                                        device_node.name][
                                            op_attribute_name] = DeviceItem(
                                                device_node.name)
                                self.kernel_items_with_op_name_attributes[
                                    device_node.name][
                                        op_attribute_name].add_item(device_node)
                elif node.type == 'OperatorInner':
                    continue
                else:
                    op_name = node.name
                    for runtime_node in node.runtime_node:
                        for device_node in runtime_node.device_node:
                            if device_node.type == 'Kernel':
                                op_attribute_name = \
                                    self._translate_op_name_attributes_to_string(
                                        op_name, device_node)
                                if op_attribute_name not in \
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name]:
                                    self.kernel_items_with_op_name_attributes[
                                        device_node.name][
                                            op_attribute_name] = DeviceItem(
                                                device_node.name)
                                self.kernel_items_with_op_name_attributes[
                                    device_node.name][
                                        op_attribute_name].add_item(device_node)
        for threadid, nodes in nodelists.items():
            for node in nodes:
                for runtime_node in node.runtime_node:
                    for device_node in runtime_node.device_node:
                        if device_node.type == 'Kernel':
                            name = device_node.name
                            if name not in self.kernel_items:
                                self.kernel_items[name] = DeviceItem(name)
                            self.kernel_items[name].add_item(device_node)
                            weighted_occupancy += (device_node.occupancy / 100) * (
                                device_node.end_ns - device_node.start_ns)
                            if device_node.blocks_per_sm > 1:
                                sm_efficiency = 1
                            else:
                                sm_efficiency = device_node.blocks_per_sm
                            weighted_sm_efficiency += sm_efficiency * (
                                device_node.end_ns - device_node.start_ns)
                            total_duration += (
                                device_node.end_ns - device_node.start_ns)
                            self.gpu_ids.add(device_node.device_id)
        self.occupancy = weighted_occupancy / total_duration \
            if total_duration != 0 else 0.0
        self.sm_efficiency = weighted_sm_efficiency
        # to divide ProfileStep time in ProfileData

        total_time = 0
        total_tensorcore_time = 0
        for name, node in self.kernel_items.items():
            if node.tensorcore_used:
                total_tensorcore_time += node.gpu_time
            total_time += node.gpu_time
        self.tensor_core_ratio = total_tensorcore_time / total_time \
            if total_time != 0 else 0.0

    def _translate_op_name_attributes_to_string(self, op_name, event):
        result = '{}-[{},{},{}]-[{},{},{}]-{}-{}'.format(
            op_name, event.grid_x, event.grid_y, event.grid_z, event.block_x,
            event.block_y, event.block_z, event.registers_per_thread,
            event.shared_memory)
        return result
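The attribute key groups kernels by launching op plus launch configuration. A sketch with made-up launch parameters (DeviceNode comes from event_node.py, added above in this commit):

ev = DeviceNode()
ev.grid_x, ev.grid_y, ev.grid_z = 128, 1, 1
ev.block_x, ev.block_y, ev.block_z = 256, 1, 1
ev.registers_per_thread, ev.shared_memory = 32, 4096
print(KernelParser()._translate_op_name_attributes_to_string('matmul', ev))
# matmul-[128,1,1]-[256,1,1]-32-4096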
visualdl/component/profiler/parser/memory_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .utils import traverse_tree


class MemoryItem:
    def __init__(self, event_name, place, memory_type='Allocated'):
        self.event_name = event_name
        self.place = place
        self.allocation_count = 0
        self.free_count = 0
        self.allocation_size = 0
        self.free_size = 0
        self.increase_size = 0
        self.memory_type = memory_type

    def add_memory_record(self, size, allocation_type):
        if allocation_type == 'Allocate' or allocation_type == 'ReservedAllocate':
            self.allocation_count += 1
            self.allocation_size += size
        elif allocation_type == 'Free' or allocation_type == 'ReservedFree':
            self.free_count += 1
            self.free_size -= size  # size is sign(-) when free.
        else:
            print("No corresponding type.")
        self.increase_size = self.allocation_size - self.free_size
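A small check of the accounting: free events carry a negative size, so free_size accumulates positively through the subtraction, and increase_size nets out. The event name and place below are illustrative:

item = MemoryItem('conv2d', 'Place(gpu:0)')
item.add_memory_record(1024, 'Allocate')
item.add_memory_record(-1024, 'Free')   # frees arrive with negative sizes
print(item.allocation_size, item.free_size, item.increase_size)  # 1024 1024 0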
class MemoryParser:
    def __init__(self):
        self.allocated_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        self.reserved_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        self.peak_allocation_values = collections.defaultdict(int)
        self.peak_reserved_values = collections.defaultdict(int)
        self.memory_events = collections.defaultdict(
            lambda: collections.defaultdict(list))
        self.memory_curve = collections.defaultdict(
            lambda: collections.defaultdict(list))
        self.paired_events = collections.defaultdict(list)
        self.size_ranges = {}

    def parse(self, nodetrees):  # noqa: C901
        r"""
        Analyse memory event in the nodetress.
        """
        thread2hostnodes = traverse_tree(nodetrees)
        for threadid, host_nodes in thread2hostnodes.items():
            for host_node in host_nodes[1:]:  # skip root node
                if host_node.type == 'OperatorInner':
                    continue
                if host_node.type == 'Operator':
                    for child in host_node.children_node:
                        self._analyse_node_memory(host_node.name, child)
                self._analyse_node_memory(host_node.name, host_node)
        # pair for memory events
        for device_type, memory_events in self.memory_events.items():
            max_size = 0
            for (addr, memory_type), memory_lists in memory_events.items():
                memory_lists = sorted(memory_lists, key=lambda x: x[0])
                paired_results = []
                for memory_list in memory_lists:
                    timestamp, memory_type, hostnodename, size = memory_list
                    if memory_type == 'Allocate' or memory_type == 'ReservedAllocate':
                        if size > max_size:
                            max_size = size
                        if memory_type == 'Allocate':
                            paired_results.append([
                                addr, 'Allocated', hostnodename, timestamp,
                                None, None, size
                            ])
                        else:
                            paired_results.append([
                                addr, 'ReservedAllocate', hostnodename,
                                timestamp, None, None, size
                            ])
                    elif memory_type == 'Free' or memory_type == 'ReservedFree':
                        if -size > max_size:
                            max_size = -size
                        if paired_results:
                            if paired_results[-1][-3] is None:
                                paired_results[-1][-3] = hostnodename
                                paired_results[-1][-2] = timestamp
                                self.paired_events[device_type].append(
                                    paired_results.pop())
                            else:
                                if memory_type == 'Free':
                                    paired_results.append([
                                        addr, 'Allocated', None, None,
                                        hostnodename, timestamp, -size
                                    ])
                                else:
                                    paired_results.append([
                                        addr, 'ReservedAllocate', None, None,
                                        hostnodename, timestamp, -size
                                    ])
                                self.paired_events[device_type].append(
                                    paired_results.pop())
                        else:
                            if memory_type == 'Free':
                                paired_results.append([
                                    addr, 'Allocated', None, None,
                                    hostnodename, timestamp, -size
                                ])
                            else:
                                paired_results.append([
                                    addr, 'ReservedAllocate', None, None,
                                    hostnodename, timestamp, -size
                                ])
                            self.paired_events[device_type].append(
                                paired_results.pop())
                self.paired_events[device_type].extend(paired_results)
            self.size_ranges[device_type] = (0, max_size)

    def _analyse_node_memory(self, event_name, node):
        for memnode in node.mem_node:  # self mem node
            if memnode.type == 'Allocate' or memnode.type == 'Free':
                if event_name not in self.allocated_items[memnode.place]:
                    self.allocated_items[memnode.place][event_name] = \
                        MemoryItem(event_name, memnode.place, 'Allocated')
                self.allocated_items[memnode.place][event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
                self.memory_events[memnode.place][(memnode.addr,
                                                   'Allocated')].append([
                    memnode.timestamp_ns, memnode.type, event_name,
                    memnode.increase_bytes
                ])
            elif memnode.type == 'ReservedAllocate' or memnode.type == 'ReservedFree':
                if event_name not in self.reserved_items[memnode.place]:
                    self.reserved_items[memnode.place][event_name] = \
                        MemoryItem(event_name, memnode.place, 'Reserved')
                self.reserved_items[memnode.place][event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
                self.memory_events[memnode.place][(memnode.addr,
                                                   "Reserved")].append([
                    memnode.timestamp_ns, memnode.type, event_name,
                    memnode.increase_bytes
                ])
            self.memory_curve[memnode.place]['Allocated'].append(
                (memnode.timestamp_ns, memnode.current_allocated, event_name))
            self.memory_curve[memnode.place]['Reserved'].append(
                (memnode.timestamp_ns, memnode.current_reserved, event_name))
            self.memory_curve[memnode.place]['PeakAllocated'].append(
                (memnode.timestamp_ns, memnode.peak_allocated, event_name))
            self.memory_curve[memnode.place]['PeakReserved'].append(
                (memnode.timestamp_ns, memnode.peak_reserved, event_name))
            self.peak_allocation_values[memnode.place] = max(
                self.peak_allocation_values[memnode.place],
                memnode.peak_allocated)
            self.peak_reserved_values[memnode.place] = max(
                self.peak_reserved_values[memnode.place],
                memnode.peak_reserved)
visualdl/component/profiler/parser/operator_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .kernel_parser import DeviceItem
from .utils import wrap_tree


class OperatorItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.gpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.devices = {}
        self.operator_inners = {}
        self.general_gpu_time = 0
        self.min_general_gpu_time = float('inf')
        self.max_general_gpu_time = 0

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    @property
    def avg_general_gpu_time(self):
        return self.general_gpu_time / self.call

    def add_cpu_time(self, time):
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_general_gpu_time(self, time):
        if time > self.max_general_gpu_time:
            self.max_general_gpu_time = time
        if time < self.min_general_gpu_time:
            self.min_general_gpu_time = time
        self.general_gpu_time += time

    def add_call(self):
        self.call += 1

    def add_item(self, node):
        self.add_call()
        self.add_cpu_time(node.cpu_time)
        self.add_gpu_time(node.gpu_time)
        self.add_general_gpu_time(node.general_gpu_time)
        for child in node.children_node:
            if child.type != 'Operator':
                if child.name not in self.operator_inners:
                    self.operator_inners[child.name] = OperatorItem(child.name)
                self.operator_inners[child.name].add_item(child)
        for runtimenode in node.runtime_node:
            for devicenode in runtimenode.device_node:
                name = devicenode.name
                if name not in self.devices:
                    self.devices[name] = DeviceItem(name)
                self.devices[name].add_item(devicenode)


class OperatorParser:
    r"""
    Analyse operator event in profiling data, correlate with its device event.
    """

    def __init__(self):
        self.items = {}  # for operator summary
        self.items_with_input_shape = collections.defaultdict(dict)

    def parse(self, nodetrees):
        r"""
        Analyse operator events in the node trees.
        """
        node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
        for threadid, host_statistic_nodes in thread2host_statistic_nodes.items():
            for host_statistic_node in host_statistic_nodes[1:]:  # skip root node
                if host_statistic_node.type == 'Operator':
                    self.add_operator_item(host_statistic_node)

    def add_operator_item(self, operator_node):
        if operator_node.name not in self.items:
            self.items[operator_node.name] = OperatorItem(operator_node.name)
        input_shape_str = self._translate_op_input_shape_to_string(
            operator_node.input_shapes)
        if input_shape_str not in self.items_with_input_shape[operator_node.name]:
            self.items_with_input_shape[operator_node.name][
                input_shape_str] = OperatorItem(operator_node.name)
        self.items[operator_node.name].add_item(operator_node)
        self.items_with_input_shape[operator_node.name][
            input_shape_str].add_item(operator_node)

    def _translate_op_input_shape_to_string(self, input_shape):
        result = ''
        for arg, shape in input_shape.items():
            result += '{}-{}\t'.format(arg, shape)
        return result
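To see what these counters do in practice, here is a minimal, standalone sketch (not part of this commit). FakeNode is a hypothetical stand-in carrying only the fields that add_item() reads; real callers pass HostStatisticNode-wrapped profiler events.

from visualdl.component.profiler.parser.operator_parser import OperatorItem


class FakeNode:
    """Hypothetical stand-in with just the attributes add_item() touches."""

    def __init__(self, cpu_time, gpu_time):
        self.cpu_time = cpu_time
        self.gpu_time = gpu_time
        self.general_gpu_time = gpu_time  # no memcpy/memset in this sketch
        self.children_node = []  # no inner operator events
        self.runtime_node = []  # no runtime/device events


item = OperatorItem('conv2d')
item.add_item(FakeNode(cpu_time=120, gpu_time=80))
item.add_item(FakeNode(cpu_time=200, gpu_time=90))
print(item.call, item.avg_cpu_time, item.min_cpu_time, item.max_gpu_time)
# prints: 2 160.0 120 90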
visualdl/component/profiler/parser/overview_parser.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree

StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']
CPUType = [
    'Operator', 'CudaRuntime', 'UserDefined', 'OperatorInner',
    'Communication', 'PythonOp', 'PythonUserDefined', 'MluRuntime'
]
GPUType = ['Kernel', 'Memcpy', 'Memset']


class GeneralItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.general_gpu_time = 0
        self.min_general_gpu_time = float('inf')
        self.max_general_gpu_time = 0

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    @property
    def avg_general_gpu_time(self):
        return self.general_gpu_time / self.call

    def add_cpu_time(self, time):
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_general_gpu_time(self, time):
        if time > self.max_general_gpu_time:
            self.max_general_gpu_time = time
        if time < self.min_general_gpu_time:
            self.min_general_gpu_time = time
        self.general_gpu_time += time

    def add_call(self):
        self.call += 1

    def add_item(self, node):
        self.add_call()
        self.add_cpu_time(node.cpu_time)
        self.add_gpu_time(node.gpu_time)
        self.add_general_gpu_time(node.general_gpu_time)


class ModelPerspectiveItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.cpu_times = {}
        self.gpu_times = {}

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    def add_call(self):
        self.call += 1

    def add_cpu_time(self, time):
        self.add_call()
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time

    def set_gpu_time(self, time):
        '''
        Use this to set the total gpu time, in case the gpu time accumulated
        by add_gpu_time includes overlap.
        '''
        self.gpu_time = time


class OverviewParser:
    r"""
    Analyse time ranges for each TracerEventType, and summarize the time.
    """

    def __init__(self):
        # event name: GeneralItem
        self.memory_manipulation_items = {}  # for memory manipulation summary
        self.userdefined_items = {}  # for userdefined summary
        self.model_perspective_items = {}
        # phase name:
        #   device name:
        #     stage idx:
        #       thread name:
        #         event type:
        #           {"events": [], "times": []}
        self.events_per_stage = collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(
                        lambda: collections.defaultdict(
                            lambda: collections.defaultdict(list))))))
        # phase name:
        #   device name:
        #     stage idx:
        #       event type:
        #         {"calls": [], "times": [], "total_time": 0}
        self.merged_events_per_stage = collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(
                        lambda: collections.defaultdict(list)))))
        self.stage_nums = 0
        self.gpu_ulitization = 0.0
        self.has_forward = False
        self.has_device = False

    def parse(self, nodetrees):  # noqa: C901
        r"""
        Analyse node trees in the profiler result, and get time ranges for
        different tracer event types.
        """
        self._parse_events(nodetrees)
        # statistic calling times
        # merge time, get time summarization
        for stage_name, stage_data in self.events_per_stage.items():
            for device_name, steps_data in stage_data.items():
                for step_idx, thread_data in steps_data.items():
                    for thread_id, events in thread_data.items():
                        for event_type, events_data in events.items():
                            if 'calls' not in self.merged_events_per_stage[
                                    stage_name][device_name][step_idx][event_type]:
                                self.merged_events_per_stage[stage_name][
                                    device_name][step_idx][event_type]['calls'] = 0
                            if 'total_time' not in self.merged_events_per_stage[
                                    stage_name][device_name][step_idx][event_type]:
                                self.merged_events_per_stage[stage_name][
                                    device_name][step_idx][event_type]['total_time'] = 0
                            events_data['times'] = merge_self_ranges(
                                events_data['times'], is_sorted=False)
                            self.merged_events_per_stage[stage_name][device_name][
                                step_idx][event_type]['calls'] += len(events_data['events'])
                            self.merged_events_per_stage[stage_name][device_name][
                                step_idx][event_type]['times'] = \
                                merge_ranges(
                                    self.merged_events_per_stage[stage_name][
                                        device_name][step_idx][event_type]['times'],
                                    events_data['times'], is_sorted=True)
        # merge different stages into profile step
        stage_names = list(self.merged_events_per_stage.keys())
        self.merged_events_per_stage['ProfileStep']
        for stage_name in stage_names:
            stage_data = self.merged_events_per_stage[stage_name]
            for device_name, steps_data in stage_data.items():
                for step_idx, events in steps_data.items():
                    for event_type, events_data in events.items():
                        events_data['total_time'] = sum_ranges(events_data['times'])
                        if 'calls' not in self.merged_events_per_stage[
                                'ProfileStep'][device_name][step_idx][event_type]:
                            self.merged_events_per_stage['ProfileStep'][
                                device_name][step_idx][event_type]['calls'] = 0
                        if 'total_time' not in self.merged_events_per_stage[
                                'ProfileStep'][device_name][step_idx][event_type]:
                            self.merged_events_per_stage['ProfileStep'][
                                device_name][step_idx][event_type]['total_time'] = 0
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['calls'] += events_data['calls']
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['total_time'] += events_data['total_time']
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['times'] = merge_ranges(
                                self.merged_events_per_stage['ProfileStep']
                                [device_name][step_idx][event_type]['times'],
                                events_data['times'], is_sorted=True)
        # add gpu time for model perspective summary
        for stage_name, stage_data in self.merged_events_per_stage.items():
            for device_name, steps_data in stage_data.items():
                for step_idx, events in steps_data.items():
                    if 'Kernel' in events:
                        if step_idx == 'ALL':
                            self.model_perspective_items[stage_name].set_gpu_time(
                                events['Kernel']['total_time'])
                            continue
                        self.model_perspective_items[stage_name].add_gpu_time(
                            events['Kernel']['total_time'])
                        self.model_perspective_items[stage_name].gpu_times[
                            step_idx] = events['Kernel']['total_time']
        if self.has_device:
            self.gpu_ulitization = self.merged_events_per_stage['ProfileStep'][
                'GPU']['ALL']['Kernel']['total_time'] / \
                self.model_perspective_items['ProfileStep'].cpu_time

    def _fill_stage_events(  # noqa: C901
            self, node, stage_idx, should_recursive=True):
        if node.type == 'Forward':
            stage_name = 'Forward'
            self.has_forward = True
        elif node.type == 'Backward':
            stage_name = 'Backward'
        elif node.type == 'Optimization':
            stage_name = 'Optimization'
        elif node.type == 'Dataloader':
            stage_name = 'Dataloader'
        else:
            stage_name = 'Other'
        if should_recursive:
            stack = []
            if node.type in StageType:
                for children in node.children_node:
                    stack.append(children)
            else:
                stack.append(node)
            while stack:
                current_node = stack.pop()
                for childnode in current_node.children_node:
                    stack.append(childnode)
                for runtimenode in current_node.runtime_node:
                    self.events_per_stage[stage_name]["CPU"][stage_idx][
                        runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                    self.events_per_stage[stage_name]["CPU"][stage_idx][
                        runtimenode.thread_id][runtimenode.type]['times'].append(
                            (runtimenode.start_ns, runtimenode.end_ns))
                    self.events_per_stage[stage_name]["CPU"]['ALL'][
                        runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                    self.events_per_stage[stage_name]["CPU"]['ALL'][
                        runtimenode.thread_id][runtimenode.type]['times'].append(
                            (runtimenode.start_ns, runtimenode.end_ns))
                    for devicenode in runtimenode.device_node:
                        self.has_device = True
                        self.events_per_stage[stage_name]["GPU"][stage_idx][
                            devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                        self.events_per_stage[stage_name]["GPU"][stage_idx][
                            devicenode.stream_id][devicenode.type]['times'].append(
                                (devicenode.start_ns, devicenode.end_ns))
                        self.events_per_stage[stage_name]["GPU"]['ALL'][
                            devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                        self.events_per_stage[stage_name]["GPU"]['ALL'][
                            devicenode.stream_id][devicenode.type]['times'].append(
                                (devicenode.start_ns, devicenode.end_ns))
                if current_node.type == 'Forward' or current_node.type == 'UserDefined':
                    continue
                node_type = current_node.type
                if node_type == 'PythonUserDefined':
                    node_type = 'UserDefined'
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    current_node.thread_id][node_type]['events'].append(current_node)
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    current_node.thread_id][node_type]['times'].append(
                        (current_node.start_ns, current_node.end_ns))
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    current_node.thread_id][node_type]['events'].append(current_node)
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    current_node.thread_id][node_type]['times'].append(
                        (current_node.start_ns, current_node.end_ns))
        else:
            for runtimenode in node.runtime_node:
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    runtimenode.thread_id][runtimenode.type]['times'].append(
                        (runtimenode.start_ns, runtimenode.end_ns))
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    runtimenode.thread_id][runtimenode.type]['times'].append(
                        (runtimenode.start_ns, runtimenode.end_ns))
                for devicenode in runtimenode.device_node:
                    self.has_device = True
                    self.events_per_stage[stage_name]["GPU"][stage_idx][
                        devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                    self.events_per_stage[stage_name]["GPU"][stage_idx][
                        devicenode.stream_id][devicenode.type]['times'].append(
                            (devicenode.start_ns, devicenode.end_ns))
                    self.events_per_stage[stage_name]["GPU"]['ALL'][
                        devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                    self.events_per_stage[stage_name]["GPU"]['ALL'][
                        devicenode.stream_id][devicenode.type]['times'].append(
                            (devicenode.start_ns, devicenode.end_ns))

    def _parse_events(self, nodetrees):
        node_wrapped_trees = rebuild_node_trees(nodetrees)
        node_wrapped_threadlist = traverse_tree(node_wrapped_trees)
        # analyse user-defined summary
        for threadid, wrapped_nodes in node_wrapped_threadlist.items():
            for wrapped_node in wrapped_nodes[1:]:  # skip root node
                if wrapped_node.type == 'PythonUserDefined':
                    self.add_userdefined_item(wrapped_node)
        # analyse all events in per stage
        thread_count = 0
        for threadid, root_wrapped_node in node_wrapped_trees.items():
            thread_count += 1
            wrapped_profiler_step_nodes = []
            for wrapped_node in root_wrapped_node.children_node:
                wrapped_profiler_step_nodes.append(wrapped_node)
            self.stage_nums = 0
            current_stage_idx = None
            for wrapped_profiler_step_node in wrapped_profiler_step_nodes:
                if wrapped_profiler_step_node.type == 'ProfileStep':
                    self.process_id = wrapped_profiler_step_node.process_id
                    stage_idx = wrapped_profiler_step_node.name.split('#')[1]
                    total_time = 0
                    accumulated_stage_time = 0
                    if thread_count == 1:
                        self.add_model_perspective_item(wrapped_profiler_step_node)
                        self.model_perspective_items['ProfileStep'].cpu_times[
                            stage_idx] = wrapped_profiler_step_node.cpu_time
                        total_time = wrapped_profiler_step_node.cpu_time
                    self.stage_nums += 1
                    for stage_wrapped_node in wrapped_profiler_step_node.children_node:
                        if thread_count == 1:
                            self.add_model_perspective_item(stage_wrapped_node)
                            if stage_wrapped_node.type in StageType:
                                self.model_perspective_items[
                                    stage_wrapped_node.type].cpu_times[
                                        stage_idx] = stage_wrapped_node.cpu_time
                        if stage_wrapped_node.type in StageType:
                            accumulated_stage_time += stage_wrapped_node.cpu_time
                        self._fill_stage_events(stage_wrapped_node, stage_idx)
                    if 'Other' not in self.model_perspective_items:
                        self.model_perspective_items['Other'] = ModelPerspectiveItem('Other')
                    if thread_count == 1:
                        self.model_perspective_items['Other'].add_cpu_time(
                            total_time - accumulated_stage_time)
                        self.model_perspective_items['Other'].cpu_times[
                            stage_idx] = total_time - accumulated_stage_time
                    self._fill_stage_events(
                        wrapped_profiler_step_node, stage_idx, should_recursive=False)
                else:
                    self._fill_stage_events(
                        wrapped_profiler_step_node, current_stage_idx)
            self._fill_stage_events(
                root_wrapped_node, current_stage_idx, should_recursive=False)

    def add_userdefined_item(self, userdefined_node):
        if userdefined_node.name not in self.userdefined_items:
            self.userdefined_items[userdefined_node.name] = GeneralItem(
                userdefined_node.name)
        self.userdefined_items[userdefined_node.name].add_item(userdefined_node)

    def add_memory_manipulation_item(self, memory_manipulation_node):
        if memory_manipulation_node.name not in self.memory_manipulation_items:
            self.memory_manipulation_items[
                memory_manipulation_node.name] = GeneralItem(
                    memory_manipulation_node.name)
        self.memory_manipulation_items[
            memory_manipulation_node.name].add_item(memory_manipulation_node)

    def add_model_perspective_item(self, model_perspective_node):
        if model_perspective_node.type == 'Forward':
            name = 'Forward'
        elif model_perspective_node.type == 'Backward':
            name = 'Backward'
        elif model_perspective_node.type == 'Optimization':
            name = 'Optimization'
        elif model_perspective_node.type == 'Dataloader':
            name = 'Dataloader'
        elif model_perspective_node.type == 'ProfileStep':
            name = 'ProfileStep'
        else:
            return
        if name not in self.model_perspective_items:
            self.model_perspective_items[name] = ModelPerspectiveItem(name)
        self.model_perspective_items[name].add_cpu_time(
            model_perspective_node.cpu_time)
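The six-level defaultdict chain built in OverviewParser.__init__ is what lets _fill_stage_events append into events_per_stage without ever initializing intermediate keys. A standalone sketch of the same pattern (the numbers are made up) may make that clearer:

import collections

events_per_stage = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(list))))))

# phase -> device -> stage idx -> thread/stream -> event type -> field;
# every level springs into existence on first access.
events_per_stage['Forward']['GPU']['0'][2]['Kernel']['times'].append((100, 250))
print(events_per_stage['Forward']['GPU']['0'][2]['Kernel']['times'])
# prints: [(100, 250)]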
visualdl/component/profiler/parser/trace_parser.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
class TraceParser:
    def __init__(self):
        pass

    def parse(self, content):
        self.content = content
visualdl/component/profiler/parser/utils.py
0 → 100644
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections

StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']


def sum_ranges(ranges):
    result = 0
    for time_range in ranges:
        result += (time_range[1] - time_range[0])
    return result


def merge_self_ranges(src_ranges, is_sorted=False):
    merged_ranges = []
    if len(src_ranges) > 0:
        if not is_sorted:
            src_ranges.sort(key=lambda x: x[0])
        cur_indx = 0
        merged_ranges.append((src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
        for cur_indx in range(1, len(src_ranges)):
            if src_ranges[cur_indx][1] > merged_ranges[-1][1]:
                if src_ranges[cur_indx][0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0],
                                         src_ranges[cur_indx][1])
                else:
                    merged_ranges.append(
                        (src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
    return merged_ranges


def merge_ranges(range_list1, range_list2, is_sorted=False):  # noqa:C901
    merged_ranges = []
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    len1 = len(range_list1)
    len2 = len(range_list2)
    if len1 == 0 and len2 == 0:
        return merged_ranges
    elif len1 == 0:
        return range_list2
    elif len2 == 0:
        return range_list1
    else:
        indx1 = 0
        indx2 = 0
        range1 = range_list1[indx1]
        range2 = range_list2[indx2]
        if range1[0] < range2[0]:
            merged_ranges.append(range1)
            indx1 += 1
        else:
            merged_ranges.append(range2)
            indx2 += 1
        while indx1 < len1 and indx2 < len2:
            range1 = range_list1[indx1]
            range2 = range_list2[indx2]
            if range1[0] < range2[0]:
                if range1[1] > merged_ranges[-1][1]:
                    if range1[0] <= merged_ranges[-1][1]:
                        merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
                    else:
                        merged_ranges.append((range1[0], range1[1]))
                    indx1 += 1
                else:
                    indx1 += 1
            else:
                if range2[1] > merged_ranges[-1][1]:
                    if range2[0] <= merged_ranges[-1][1]:
                        merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
                    else:
                        merged_ranges.append((range2[0], range2[1]))
                    indx2 += 1
                else:
                    indx2 += 1
        while indx1 < len1:
            range1 = range_list1[indx1]
            if range1[1] > merged_ranges[-1][1]:
                if range1[0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
                else:
                    merged_ranges.append((range1[0], range1[1]))
                indx1 += 1
            else:
                indx1 += 1
        while indx2 < len2:
            range2 = range_list2[indx2]
            if range2[1] > merged_ranges[-1][1]:
                if range2[0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
                else:
                    merged_ranges.append((range2[0], range2[1]))
                indx2 += 1
            else:
                indx2 += 1
    return merged_ranges


def intersection_ranges(range_list1, range_list2, is_sorted=False):
    result_range = []
    if len(range_list1) == 0 or len(range_list2) == 0:
        return result_range
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    len1 = len(range_list1)
    len2 = len(range_list2)
    indx1 = 0
    indx2 = 0
    range1 = range_list1[indx1]
    range2 = range_list2[indx2]
    while indx1 < len1 and indx2 < len2:
        if range2[1] <= range1[0]:
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] <= range1[0] and range2[1] < range1[1]:
            assert (range2[1] > range1[0])
            result_range.append((range1[0], range2[1]))
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] <= range1[0]:
            assert (range2[1] >= range1[1])
            result_range.append(range1)
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        elif range2[1] < range1[1]:
            assert (range2[0] > range1[0])
            result_range.append(range2)
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] < range1[1]:
            assert (range2[1] >= range1[1])
            result_range.append((range2[0], range1[1]))
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        else:
            assert (range2[0] >= range1[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
    return result_range


def subtract_ranges(range_list1, range_list2, is_sorted=False):
    result_range = []
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    if len(range_list1) == 0:
        return result_range
    if len(range_list2) == 0:
        return range_list1
    len1 = len(range_list1)
    len2 = len(range_list2)
    indx1 = 0
    indx2 = 0
    range1 = range_list1[indx1]
    range2 = range_list2[indx2]
    while indx1 < len(range_list1):
        if indx2 == len(range_list2):
            result_range.append(range1)
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        elif range2[1] <= range1[0]:
            indx2 += 1
            if indx2 != len2:
                range2 = range_list2[indx2]
        elif range2[0] <= range1[0] and range2[1] < range1[1]:
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 != len2:
                range2 = range_list2[indx2]
        elif range2[0] <= range1[0]:
            assert (range2[1] >= range1[1])
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 != len1:
                range1 = range_list1[indx1]
        elif range2[0] < range1[1]:
            assert (range2[0] > range1[0])
            result_range.append((range1[0], range2[0]))
            range1 = (range2[0], range1[1])
        else:
            assert (range2[0] >= range1[1])
            result_range.append(range1)
            indx1 += 1
            if indx1 != len1:
                range1 = range_list1[indx1]
    return result_range


class HostStatisticNode:
    r'''
    Wrap original node for calculating statistic metrics.
    '''

    def __init__(self, hostnode):
        self.hostnode = hostnode
        self.children_node = []
        self.runtime_node = []
        self.cpu_time = 0
        self.self_cpu_time = 0
        self.gpu_time = 0  # kernel time
        self.self_gpu_time = 0
        # besides kernel, include time of gpu events like memcpy and memset
        self.general_gpu_time = 0
        self.self_general_gpu_time = 0
        self.is_terminal_operator_node = True

    def cal_statistic(self):
        for child in self.children_node:
            child.cal_statistic()
            if child.is_terminal_operator_node is False:
                self.is_terminal_operator_node = False
        for rt in self.runtime_node:
            rt.cal_statistic()
        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
        self.self_cpu_time = self.cpu_time
        for child in self.children_node:
            if child.type == 'Operator':
                self.is_terminal_operator_node = False
            self.gpu_time += child.gpu_time
            self.general_gpu_time += child.general_gpu_time
            self.self_cpu_time -= (child.end_ns - child.start_ns)
        for rt in self.runtime_node:
            self.self_cpu_time -= (rt.end_ns - rt.start_ns)
            self.gpu_time += rt.gpu_time
            self.self_gpu_time += rt.gpu_time
            self.general_gpu_time += rt.general_gpu_time
            self.self_general_gpu_time += rt.general_gpu_time
        for device in self.hostnode.device_node:
            if device.type == 'Kernel':
                self.gpu_time += (device.end_ns - device.start_ns)
                self.self_gpu_time += (device.end_ns - device.start_ns)
            self.general_gpu_time += (device.end_ns - device.start_ns)
            self.self_general_gpu_time += (device.end_ns - device.start_ns)

    @property
    def end_ns(self):
        return self.hostnode.end_ns

    @property
    def start_ns(self):
        return self.hostnode.start_ns

    def __getattr__(self, name):
        return getattr(self.hostnode, name)


def traverse_tree(nodetrees):
    results = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        stack = []
        stack.append(rootnode)
        threadlist = results[thread_id]
        while stack:
            current_node = stack.pop()
            threadlist.append(current_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
    return results


def get_device_nodes(hostnode):
    '''
    Get all device nodes called in the time range of hostnode.
    '''
    stack = []
    device_nodes = []
    stack.append(hostnode)
    while stack:
        current_node = stack.pop()
        for childnode in current_node.children_node:
            stack.append(childnode)
        for runtimenode in current_node.runtime_node:
            for devicenode in runtimenode.device_node:
                device_nodes.append(devicenode)
    return device_nodes


def wrap_tree(nodetrees):
    '''
    Using HostStatisticNode to wrap original profiler result tree, and
    calculate node statistic metrics.
    '''
    node_statistic_tree = {}
    results = collections.defaultdict(list)
    newresults = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        stack = []
        stack.append(rootnode)
        root_statistic_node = HostStatisticNode(rootnode)
        newstack = []
        newstack.append(root_statistic_node)
        node_statistic_tree[thread_id] = root_statistic_node
        threadlist = results[thread_id]
        newthreadlist = newresults[thread_id]
        while stack:
            current_node = stack.pop()
            threadlist.append(current_node)
            current_statistic_node = newstack.pop()
            newthreadlist.append(current_statistic_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
                child_statistic_node = HostStatisticNode(childnode)
                current_statistic_node.children_node.append(child_statistic_node)
                newstack.append(child_statistic_node)
            for runtimenode in current_node.runtime_node:
                runtime_statistic_node = HostStatisticNode(runtimenode)
                current_statistic_node.runtime_node.append(runtime_statistic_node)
    # recursive calculate node statistic values
    for thread_id, root_statistic_node in node_statistic_tree.items():
        root_statistic_node.cal_statistic()
    return node_statistic_tree, newresults


def rebuild_node_trees(nodetrees):  # noqa:C901
    template_root = None
    # First, we find the tree which includes Forward event.
    for threadid, root in nodetrees.items():
        has_find_template_root = False
        template_root = HostStatisticNode(root)
        for children in root.children_node:
            if children.type == 'ProfileStep':
                profiler_step_node = HostStatisticNode(children)
                template_root.children_node.append(profiler_step_node)
                has_find_template_root = True
                for stage_node in children.children_node:
                    if stage_node.type in StageType:
                        profiler_step_node.children_node.append(
                            HostStatisticNode(stage_node))
            else:
                break
        if has_find_template_root is True:
            break
    if template_root is None:
        print('No profiler steps found, overview page will have no data.')
    wrapped_tree = {}
    for thread_id, rootnode in nodetrees.items():
        has_find_template_root = False
        for children in rootnode.children_node:
            if children.type == 'ProfileStep':
                has_find_template_root = True
                break
        unwrapped_stack = []
        warpped_stack = []
        root_statistic_node = HostStatisticNode(rootnode)
        wrapped_tree[thread_id] = root_statistic_node
        if has_find_template_root is False:
            for profiler_step_node in template_root.children_node:
                profiler_step_wrap_node = HostStatisticNode(
                    profiler_step_node.hostnode)
                root_statistic_node.children_node.append(profiler_step_wrap_node)
                for stage_node in profiler_step_node.children_node:
                    stage_wrap_node = HostStatisticNode(stage_node.hostnode)
                    profiler_step_wrap_node.children_node.append(stage_wrap_node)
            # insert nodes in original root into new stage nodes
            # algorithm: post order traversal the tree
            stack = []
            flag_stack = []
            post_order_nodes = []
            stack.append(root_statistic_node)
            flag_stack.append(0)
            while stack:
                current_node = stack.pop()
                flag = flag_stack.pop()
                if flag == 0:
                    stack.append(current_node)
                    flag_stack.append(1)
                    for children_node in reversed(current_node.children_node):
                        stack.append(children_node)
                        flag_stack.append(0)
                else:
                    post_order_nodes.append(current_node)
            # traverse post_order_nodes and insert right position
            for runtimenode in rootnode.runtime_node:
                runtime_wrapped_node = HostStatisticNode(runtimenode)
                root_statistic_node.runtime_node.append(runtime_wrapped_node)
            for node in rootnode.children_node:
                unwrapped_stack.append(node)
                for wrapped_node in post_order_nodes:
                    if node.start_ns >= wrapped_node.start_ns and \
                            node.end_ns <= wrapped_node.end_ns:
                        child_wrapped_node = HostStatisticNode(node)
                        warpped_stack.append(child_wrapped_node)
                        wrapped_node.children_node.append(child_wrapped_node)
                        break
        else:
            unwrapped_stack.append(rootnode)
            warpped_stack.append(root_statistic_node)
        while unwrapped_stack:
            current_node = unwrapped_stack.pop()
            current_wrapped_node = warpped_stack.pop()
            for childnode in current_node.children_node:
                unwrapped_stack.append(childnode)
                child_wrapped_node = HostStatisticNode(childnode)
                current_wrapped_node.children_node.append(child_wrapped_node)
                warpped_stack.append(child_wrapped_node)
            for runtimenode in current_node.runtime_node:
                runtime_wrapped_node = HostStatisticNode(runtimenode)
                current_wrapped_node.runtime_node.append(runtime_wrapped_node)
    # recursive calculate node statistic values
    for thread_id, root_wrapped_node in wrapped_tree.items():
        root_wrapped_node.cal_statistic()
    return wrapped_tree


def format_time(time, unit='ms', inf_subs='-'):
    r"""
    Transform time in ns to time in unit.
    """
    if time == float('inf'):
        return inf_subs
    else:
        result = float(time)
        if unit == 's':
            result /= 1e9
        elif unit == 'ms':
            result /= 1e6
        elif unit == 'us':
            result /= 1e3
        # return '{:.2f}'.format(result)
        return round(result, 2)


def format_ratio(ratio):
    r"""
    Transform ratio within [0, 1] to percentage presentation.
    """
    # return '{:.2f}'.format(ratio * 100)
    return round(ratio * 100, 2)


def format_float(float_data):
    return round(float_data, 2)


def format_memory(memory, memory_unit='KB'):
    result = float(memory)
    if memory_unit == 'GB':
        result /= (1024 * 1024 * 1024)
    elif memory_unit == 'MB':
        result /= (1024 * 1024)
    elif memory_unit == 'KB':
        result /= 1024
    # return '{:.2f}'.format(result)
    return round(result, 2)
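Since this interval arithmetic is behind every "total_time" figure in the overview page, a short standalone sketch (not part of this commit, with made-up nanosecond ranges) shows how the helpers compose; expected outputs are in the comments.

from visualdl.component.profiler.parser.utils import (
    intersection_ranges, merge_ranges, merge_self_ranges, subtract_ranges,
    sum_ranges)

print(merge_self_ranges([(5, 8), (1, 3), (2, 6)]))  # [(1, 8)]
merged = merge_ranges([(1, 3), (6, 9)], [(2, 4)])
print(merged)                                       # [(1, 4), (6, 9)]
print(sum_ranges(merged))                           # 6
print(intersection_ranges([(1, 5)], [(3, 8)]))      # [(3, 5)]
print(subtract_ranges([(1, 10)], [(3, 5)]))         # [(1, 3), (5, 10)]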
visualdl/component/profiler/profiler_data.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from
collections
import
defaultdict
from
collections
import
OrderedDict
from
.parser.distributed_parser
import
DistributedParser
from
.parser.kernel_parser
import
KernelParser
from
.parser.memory_parser
import
MemoryParser
from
.parser.operator_parser
import
OperatorParser
from
.parser.overview_parser
import
CPUType
from
.parser.overview_parser
import
GPUType
from
.parser.overview_parser
import
OverviewParser
from
.parser.trace_parser
import
TraceParser
from
.parser.utils
import
format_float
from
.parser.utils
import
format_memory
from
.parser.utils
import
format_ratio
from
.parser.utils
import
format_time
from
.parser.utils
import
traverse_tree
def
filter_type
(
node_trees
):
nodelists
=
traverse_tree
(
node_trees
)
for
thread_id
,
nodelist
in
nodelists
.
items
():
for
node
in
nodelist
:
if
not
isinstance
(
node
.
type
,
str
):
node
.
type
=
str
(
node
.
type
).
split
(
'.'
)[
1
]
class
ProfilerData
:
'''
Hold all parsed data to serve for user requests.
'''
def
__init__
(
self
,
run
,
worker_name
,
span_indx
,
profiler_result
):
self
.
run
=
run
self
.
worker_name
=
worker_name
self
.
span_indx
=
span_indx
self
.
node_trees
=
profiler_result
.
get_data
()
filter_type
(
self
.
node_trees
)
self
.
extra_infos
=
profiler_result
.
get_extra_info
()
self
.
span_idx
=
profiler_result
.
get_span_idx
()
self
.
device_infos
=
profiler_result
.
get_device_infos
()
self
.
overview_parser
=
None
self
.
operator_parser
=
None
self
.
distributed_parser
=
None
self
.
memory_parser
=
None
self
.
kernel_parser
=
None
self
.
trace_parser
=
None
self
.
has_gpu
=
profiler_result
.
has_device
()
if
profiler_result
.
has_host
():
# overview parser
self
.
overview_parser
=
OverviewParser
()
self
.
overview_parser
.
parse
(
self
.
node_trees
)
self
.
merged_events_per_stage
=
self
.
overview_parser
.
merged_events_per_stage
self
.
model_perspective_items
=
self
.
overview_parser
.
model_perspective_items
self
.
userdefined_items
=
self
.
overview_parser
.
userdefined_items
self
.
gpu_ulitization
=
self
.
overview_parser
.
gpu_ulitization
self
.
process_id
=
self
.
overview_parser
.
process_id
# operator parser
self
.
operator_parser
=
OperatorParser
()
self
.
operator_parser
.
parse
(
self
.
node_trees
)
self
.
operator_items
=
self
.
operator_parser
.
items
self
.
operator_items_with_input_shape
=
self
.
operator_parser
.
items_with_input_shape
# distributed parser
if
profiler_result
.
has_device
():
self
.
distributed_parser
=
DistributedParser
()
self
.
distributed_parser
.
parse
(
self
.
node_trees
)
self
.
distributed_time
=
self
.
distributed_parser
.
steps_time
if
profiler_result
.
has_memory
():
# memory parser
self
.
memory_parser
=
MemoryParser
()
self
.
memory_parser
.
parse
(
self
.
node_trees
)
self
.
memory_curve
=
self
.
memory_parser
.
memory_curve
self
.
allocated_items
=
self
.
memory_parser
.
allocated_items
self
.
reserved_items
=
self
.
memory_parser
.
reserved_items
self
.
paired_events
=
self
.
memory_parser
.
paired_events
self
.
size_ranges
=
self
.
memory_parser
.
size_ranges
self
.
peak_allocation_values
=
self
.
memory_parser
.
peak_allocation_values
if
profiler_result
.
has_device
():
# kernel parser
self
.
kernel_parser
=
KernelParser
()
self
.
kernel_parser
.
parse
(
traverse_tree
(
self
.
node_trees
))
self
.
kernel_items
=
self
.
kernel_parser
.
kernel_items
self
.
kernel_items_with_op_name_attributes
=
self
.
kernel_parser
.
kernel_items_with_op_name_attributes
self
.
occupancy
=
self
.
kernel_parser
.
occupancy
self
.
sm_efficiency
=
self
.
kernel_parser
.
sm_efficiency
self
.
tensorcore_ratio
=
self
.
kernel_parser
.
tensor_core_ratio
self
.
gpu_ids
=
self
.
kernel_parser
.
gpu_ids
# trace parser
self
.
trace_parser
=
TraceParser
()
self
.
trace_parser
.
parse
(
profiler_result
.
content
)
def
get_views
(
self
):
'''
Return available views this profile data can provide.
'''
views
=
[]
if
self
.
overview_parser
:
if
self
.
overview_parser
.
has_forward
:
views
.
append
(
'Overview'
)
if
self
.
operator_parser
:
if
self
.
operator_items
:
views
.
append
(
'Operator'
)
if
self
.
kernel_parser
:
if
self
.
kernel_items
:
views
.
append
(
'GPU Kernel'
)
if
self
.
memory_parser
:
if
self
.
memory_curve
:
views
.
append
(
'Memory'
)
if
self
.
distributed_parser
:
if
self
.
distributed_time
:
views
.
append
(
'Distributed'
)
views
.
append
(
'Trace'
)
return
views
def
get_device_infos
(
self
):
if
not
self
.
overview_parser
.
has_device
:
device_type
=
'CPU'
return
{
"device_type"
:
device_type
,
"CPU"
:
{
"process_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"Process Cpu Utilization"
])),
"system_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"System Cpu Utilization"
]))
}
}
else
:
device_type
=
'GPU'
gpu_id
=
int
(
next
(
iter
(
self
.
gpu_ids
)))
return
{
"device_type"
:
device_type
,
"CPU"
:
{
"process_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"Process Cpu Utilization"
])),
"system_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"System Cpu Utilization"
]))
},
"GPU"
:
{
"name"
:
self
.
device_infos
[
gpu_id
][
'name'
],
"memory"
:
"{} GB"
.
format
(
format_memory
(
self
.
device_infos
[
gpu_id
][
'totalGlobalMem'
],
'GB'
)),
"compute_capability"
:
'{}.{}'
.
format
(
self
.
device_infos
[
gpu_id
][
'computeMajor'
],
self
.
device_infos
[
gpu_id
][
'computeMinor'
]),
"utilization"
:
format_ratio
(
self
.
gpu_ulitization
),
"sm_efficiency"
:
format_ratio
(
self
.
sm_efficiency
/
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_time
),
"achieved_occupancy"
:
format_ratio
(
self
.
occupancy
),
"tensor_core_percentage"
:
format_ratio
(
self
.
tensorcore_ratio
)
}
}
def
get_model_perspective
(
self
,
time_unit
):
'''
Get total cpu and gpu statistics for model perspective of each profiler step.
'''
data
=
OrderedDict
()
data
[
'column_name'
]
=
[
"name"
,
"calls"
,
"total_time"
,
"avg_time"
,
"max_time"
,
"min_time"
,
"ratio"
]
data
[
'cpu'
]
=
[]
if
self
.
overview_parser
.
has_device
:
data
[
'gpu'
]
=
[]
total_cpu_time
=
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_time
total_gpu_time
=
self
.
model_perspective_items
[
'ProfileStep'
].
gpu_time
for
stage_name
in
[
'ProfileStep'
,
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
model_perspective_items
:
cpu_stage_data
=
OrderedDict
()
cpu_stage_data
[
'name'
]
=
stage_name
cpu_stage_data
[
'calls'
]
=
self
.
model_perspective_items
[
stage_name
].
call
cpu_stage_data
[
'total_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
cpu_time
,
time_unit
)
cpu_stage_data
[
'avg_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
avg_cpu_time
,
time_unit
)
cpu_stage_data
[
'max_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
max_cpu_time
,
time_unit
)
cpu_stage_data
[
'min_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
min_cpu_time
,
time_unit
,
inf_subs
=
0
)
cpu_stage_data
[
'ratio'
]
=
format_ratio
(
self
.
model_perspective_items
[
stage_name
].
cpu_time
/
total_cpu_time
)
if
self
.
overview_parser
.
has_device
:
gpu_stage_data
=
OrderedDict
()
gpu_stage_data
[
'name'
]
=
stage_name
gpu_stage_data
[
'calls'
]
=
self
.
model_perspective_items
[
stage_name
].
call
gpu_stage_data
[
'total_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
gpu_time
,
time_unit
)
gpu_stage_data
[
'avg_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
avg_gpu_time
,
time_unit
)
gpu_stage_data
[
'max_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
max_gpu_time
,
time_unit
)
gpu_stage_data
[
'min_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
min_gpu_time
,
time_unit
,
inf_subs
=
0
)
gpu_stage_data
[
'ratio'
]
=
format_ratio
(
self
.
model_perspective_items
[
stage_name
].
gpu_time
/
total_gpu_time
)
data
[
'cpu'
].
append
(
cpu_stage_data
)
if
self
.
overview_parser
.
has_device
:
data
[
'gpu'
].
append
(
gpu_stage_data
)
return
data
def
get_model_perspective_perstep
(
self
,
device_type
,
time_unit
):
try
:
data
=
OrderedDict
()
data
[
'order'
]
=
[]
steps
=
[
int
(
step_id
)
for
step_id
in
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_times
.
keys
()
]
steps
=
sorted
(
steps
)
data
[
'steps'
]
=
steps
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
not
in
self
.
model_perspective_items
:
continue
data
[
'order'
].
append
(
stage_name
)
data
[
stage_name
]
=
[]
for
stage_idx
in
steps
:
stage_idx
=
str
(
stage_idx
)
if
device_type
==
'cpu'
:
if
stage_idx
in
self
.
model_perspective_items
[
stage_name
].
cpu_times
:
data
[
stage_name
].
append
(
format_time
(
self
.
model_perspective_items
[
stage_name
].
cpu_times
[
stage_idx
],
time_unit
))
else
:
data
[
stage_name
].
append
(
0
)
else
:
if
stage_idx
in
self
.
model_perspective_items
[
stage_name
].
gpu_times
:
data
[
stage_name
].
append
(
format_time
(
self
.
model_perspective_items
[
stage_name
].
gpu_times
[
stage_idx
],
time_unit
))
else
:
data
[
stage_name
].
append
(
0
)
except
Exception
as
e
:
print
(
'error in get_model_perspective_perstep'
,
e
)
new_data
=
{}
new_data
[
'order'
]
=
data
[
'order'
]
new_data
[
'steps'
]
=
data
[
'steps'
]
new_data
[
'data'
]
=
[]
for
name
in
new_data
[
'order'
]:
new_data
[
'data'
].
append
(
data
[
name
])
return
new_data
def
get_event_type_perspective
(
self
,
device_type
,
time_unit
):
data
=
OrderedDict
()
data
[
'order'
]
=
[]
if
device_type
==
'cpu'
:
for
event_type
in
CPUType
:
event_type_data
=
{}
event_type_data
[
'calling_times'
]
=
{}
event_type_data
[
'calling_times'
][
'key'
]
=
[]
event_type_data
[
'calling_times'
][
'value'
]
=
[]
event_type_data
[
'durations'
]
=
{}
event_type_data
[
'durations'
][
'key'
]
=
[]
event_type_data
[
'durations'
][
'value'
]
=
[]
event_type_data
[
'ratios'
]
=
{}
event_type_data
[
'ratios'
][
'key'
]
=
[]
event_type_data
[
'ratios'
][
'value'
]
=
[]
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]:
event_type_data
[
'calling_times'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'durations'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'ratios'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'calling_times'
][
'value'
].
append
(
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
]
[
'ALL'
][
event_type
][
'calls'
])
event_type_data
[
'durations'
][
'value'
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
],
time_unit
))
event_type_data
[
'ratios'
][
'value'
].
append
(
format_ratio
(
self
.
merged_events_per_stage
[
stage_name
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
]
/
self
.
merged_events_per_stage
[
'ProfileStep'
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
]))
if
event_type_data
[
'calling_times'
][
'key'
]:
data
[
event_type
]
=
event_type_data
data
[
'order'
].
append
(
event_type
)
else
:
for
event_type
in
GPUType
:
event_type_data
=
{}
event_type_data
[
'calling_times'
]
=
{}
event_type_data
[
'calling_times'
][
'key'
]
=
[]
event_type_data
[
'calling_times'
][
'value'
]
=
[]
event_type_data
[
'durations'
]
=
{}
event_type_data
[
'durations'
][
'key'
]
=
[]
event_type_data
[
'durations'
][
'value'
]
=
[]
event_type_data
[
'ratios'
]
=
{}
event_type_data
[
'ratios'
][
'key'
]
=
[]
event_type_data
[
'ratios'
][
'value'
]
=
[]
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]:
event_type_data
[
'calling_times'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'durations'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'ratios'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'calling_times'
][
'value'
].
append
(
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
]
[
'ALL'
][
event_type
][
'calls'
])
event_type_data
[
'durations'
][
'value'
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
],
time_unit
))
event_type_data
[
'ratios'
][
'value'
].
append
(
format_ratio
(
self
.
merged_events_per_stage
[
stage_name
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
]
/
self
.
merged_events_per_stage
[
'ProfileStep'
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
]))
if
event_type_data
[
'calling_times'
][
'key'
]:
data
[
event_type
]
=
event_type_data
data
[
'order'
].
append
(
event_type
)
return
data
def
get_event_type_model_perspective
(
self
,
time_unit
):
# noqa: C901
data
=
OrderedDict
()
data
[
'order'
]
=
[]
data
[
'phase_type'
]
=
[]
try
:
for
event_type
in
CPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
'ProfileStep'
][
'CPU'
][
'ALL'
]:
data
[
'order'
].
append
(
event_type
)
data
[
event_type
]
=
[]
if
self
.
overview_parser
.
has_device
:
for
event_type
in
GPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
'ProfileStep'
][
'GPU'
][
'ALL'
]:
data
[
'order'
].
append
(
event_type
)
data
[
event_type
]
=
[]
for
stage_name
in
[
'ProfileStep'
,
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
data
[
'phase_type'
].
append
(
stage_name
)
for
event_type
in
data
[
'order'
]:
if
event_type
in
CPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]:
data
[
event_type
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]
[
event_type
][
'total_time'
],
time_unit
))
else
:
data
[
event_type
].
append
(
0
)
elif
event_type
in
GPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]:
data
[
event_type
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]
[
event_type
][
'total_time'
],
time_unit
))
else
:
data
[
event_type
].
append
(
0
)
newdata
=
OrderedDict
()
newdata
[
'order'
]
=
data
[
'order'
]
newdata
[
'phase_type'
]
=
data
[
'phase_type'
]
newdata
[
'data'
]
=
[]
for
key
in
newdata
[
'order'
]:
newdata
[
'data'
].
append
(
data
[
key
])
except
Exception
as
e
:
print
(
'error in get_event_type_model_perspective'
,
e
)
return
newdata
def
get_userdefined_perspective
(
self
,
time_unit
):
data
=
OrderedDict
()
if
self
.
overview_parser
.
has_device
:
data
[
'column_name'
]
=
[
'name'
,
'calls'
,
'cpu_total_time'
,
'cpu_avg_time'
,
'cpu_max_time'
,
'cpu_min_time'
,
'cpu_ratio'
,
'gpu_total_time'
,
'gpu_avg_time'
,
'gpu_max_time'
,
'gpu_min_time'
,
'gpu_ratio'
]
data
[
'has_gpu'
]
=
True
else
:
data
[
'column_name'
]
=
[
'name'
,
'calls'
,
'cpu_total_time'
,
'cpu_avg_time'
,
'cpu_max_time'
,
'cpu_min_time'
,
'cpu_ratio'
]
data
[
'has_gpu'
]
=
False
data
[
'events'
]
=
[]
total_cpu_time
=
0
total_gpu_time
=
0
for
name
,
event
in
self
.
userdefined_items
.
items
():
total_cpu_time
+=
event
.
cpu_time
total_gpu_time
+=
event
.
general_gpu_time
for
name
,
event
in
self
.
userdefined_items
.
items
():
if
self
.
overview_parser
.
has_device
:
data
[
'events'
].
append
({
"name"
:
name
,
"calls"
:
event
.
call
,
"cpu_total_time"
:
format_time
(
event
.
cpu_time
,
time_unit
),
"cpu_avg_time"
:
format_time
(
event
.
avg_cpu_time
,
time_unit
),
"cpu_max_time"
:
format_time
(
event
.
max_cpu_time
,
time_unit
),
"cpu_min_time"
:
format_time
(
event
.
min_cpu_time
,
time_unit
),
"cpu_ratio"
:
format_ratio
(
event
.
cpu_time
/
total_cpu_time
if
total_cpu_time
!=
0
else
0.0
),
"gpu_total_time"
:
format_time
(
event
.
general_gpu_time
,
time_unit
),
"gpu_avg_time"
:
format_time
(
event
.
avg_general_gpu_time
,
time_unit
),
"gpu_max_time"
:
format_time
(
event
.
max_general_gpu_time
,
time_unit
),
"gpu_min_time"
:
format_time
(
event
.
min_general_gpu_time
,
time_unit
),
"gpu_ratio"
:
format_ratio
(
event
.
general_gpu_time
/
total_gpu_time
if
total_gpu_time
!=
0
else
0.0
)
})
else
:
data
[
'events'
].
append
({
"name"
:
name
,
"calls"
:
event
.
call
,
"cpu_total_time"
:
format_time
(
event
.
cpu_time
,
time_unit
),
"cpu_avg_time"
:
format_time
(
event
.
avg_cpu_time
,
time_unit
),
"cpu_max_time"
:
format_time
(
event
.
max_cpu_time
,
time_unit
),
"cpu_min_time"
:
format_time
(
event
.
min_cpu_time
,
time_unit
),
"cpu_ratio"
:
format_ratio
(
event
.
cpu_time
/
total_cpu_time
if
total_cpu_time
!=
0
else
0.0
),
})
return
data
def
get_operator_pie
(
self
,
topk
,
time_unit
=
'ms'
):
data
=
OrderedDict
()
data
[
'column_name'
]
=
[
"name"
,
"calls"
,
"total_time"
,
"avg_time"
,
"max_time"
,
"min_time"
,
"ratio"
]
data
[
'cpu'
]
=
[]
if
self
.
has_gpu
:
data
[
'gpu'
]
=
[]
gpu_sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
general_gpu_time
,
reverse
=
True
)
cpu_sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
cpu_time
,
reverse
=
True
)
if
topk
<=
0
:
cpu_items
=
cpu_sorted_items
if
self
.
has_gpu
:
gpu_items
=
gpu_sorted_items
else
:
cpu_items
=
cpu_sorted_items
[:
topk
]
if
self
.
has_gpu
:
gpu_items
=
gpu_sorted_items
[:
topk
]
total_cpu_time
=
0.0
total_gpu_time
=
0.0
for
op_name
,
item
in
cpu_items
:
total_cpu_time
+=
item
.
cpu_time
if
self
.
has_gpu
:
for
op_name
,
item
in
gpu_items
:
total_gpu_time
+=
item
.
general_gpu_time
for
op_name
,
item
in
cpu_items
:
cpu_stage_data
=
OrderedDict
()
cpu_stage_data
[
'name'
]
=
op_name
cpu_stage_data
[
'calls'
]
=
item
.
call
cpu_stage_data
[
'total_time'
]
=
format_time
(
item
.
cpu_time
,
time_unit
)
cpu_stage_data
[
'avg_time'
]
=
format_time
(
item
.
avg_cpu_time
,
time_unit
)
cpu_stage_data
[
'max_time'
]
=
format_time
(
item
.
max_cpu_time
,
time_unit
)
cpu_stage_data
[
'min_time'
]
=
format_time
(
item
.
min_cpu_time
,
time_unit
)
cpu_stage_data
[
'ratio'
]
=
format_ratio
(
item
.
cpu_time
/
total_cpu_time
)
data
[
'cpu'
].
append
(
cpu_stage_data
)
if
self
.
has_gpu
:
for
op_name
,
item
in
gpu_items
:
gpu_stage_data
=
OrderedDict
()
gpu_stage_data
[
'name'
]
=
op_name
gpu_stage_data
[
'calls'
]
=
item
.
call
gpu_stage_data
[
'total_time'
]
=
format_time
(
item
.
general_gpu_time
,
time_unit
)
gpu_stage_data
[
'avg_time'
]
=
format_time
(
item
.
avg_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'max_time'
]
=
format_time
(
item
.
max_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'min_time'
]
=
format_time
(
item
.
min_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'ratio'
]
=
format_ratio
(
item
.
general_gpu_time
/
total_gpu_time
)
data
[
'gpu'
].
append
(
gpu_stage_data
)
return
data
def
get_operator_pie_expand
(
# noqa: C901
self
,
topk
,
device_type
,
time_unit
):
data
=
OrderedDict
()
data
[
'order'
]
=
[]
data
[
'phase_type'
]
=
[]
data
[
'data'
]
=
[]
if
device_type
==
'cpu'
:
sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
cpu_time
,
reverse
=
True
)
else
:
sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
general_gpu_time
,
reverse
=
True
)
if
topk
<=
0
or
topk
>=
20
:
items
=
sorted_items
[:
20
]
other_items
=
sorted_items
[
20
:]
else
:
items
=
sorted_items
[:
topk
]
other_items
=
[]
data
[
'order'
].
extend
(
[
'infer_shape'
,
'compute'
,
'node_creation'
,
'others'
])
inner_op_data
=
defaultdict
(
list
)
for
op_name
,
event
in
items
:
data
[
'phase_type'
].
append
(
op_name
)
innerop_knownsub_times
=
0
have_innerop_name
=
set
()
for
innerop_name
,
item
in
event
.
operator_inners
.
items
():
if
'infer_shape'
in
innerop_name
or
'infer_meta'
in
innerop_name
:
innerop_name
=
'infer_shape'
elif
'compute'
in
innerop_name
:
innerop_name
=
'compute'
elif
'node_creation'
in
innerop_name
:
innerop_name
=
'node_creation'
else
:
continue
have_innerop_name
.
add
(
innerop_name
)
if
device_type
==
'cpu'
:
inner_op_data
[
innerop_name
].
append
(
format_time
(
item
.
cpu_time
,
time_unit
))
innerop_knownsub_times
+=
item
.
cpu_time
else
:
inner_op_data
[
innerop_name
].
append
(
format_time
(
item
.
general_gpu_time
,
time_unit
))
innerop_knownsub_times
+=
item
.
general_gpu_time
if
device_type
==
'cpu'
:
inner_op_data
[
'others'
].
append
(
format_time
(
event
.
cpu_time
-
innerop_knownsub_times
,
time_unit
))
else
:
inner_op_data
[
'others'
].
append
(
format_time
(
event
.
general_gpu_time
-
innerop_knownsub_times
,
time_unit
))
have_innerop_name
.
add
(
'others'
)
for
innerop_name
in
data
[
'order'
]:
if
innerop_name
in
have_innerop_name
:
continue
else
:
inner_op_data
[
innerop_name
].
append
(
0
)
if
other_items
:
innerop_knownsub_times
=
0
total_event_times
=
0
data
[
'phase_type'
].
append
(
'others'
)
others_time
=
defaultdict
(
float
)
for
op_name
,
event
in
other_items
:
for
innerop_name
,
item
in
event
.
operator_inners
.
items
():
if
'infer_shape'
in
innerop_name
:
innerop_name
=
'infer_shape'
elif
'compute'
in
innerop_name
:
innerop_name
=
'compute'
elif
'node_creation'
in
innerop_name
:
innerop_name
=
'node_creation'
else
:
continue
if
device_type
==
'cpu'
:
others_time
[
innerop_name
]
+=
item
.
cpu_time
innerop_knownsub_times
+=
item
.
cpu_time
else
:
others_time
[
innerop_name
]
+=
item
.
general_gpu_time
innerop_knownsub_times
+=
item
.
general_gpu_time
if
device_type
==
'cpu'
:
total_event_times
+=
event
.
cpu_time
else
:
total_event_times
+=
event
.
general_gpu_time
others_time
[
'others'
]
=
total_event_times
-
innerop_knownsub_times
for
innerop_name
in
data
[
'order'
]:
if
innerop_name
not
in
others_time
:
others_time
[
innerop_name
]
=
0.0
inner_op_data
[
innerop_name
].
append
(
format_time
(
others_time
[
innerop_name
],
time_unit
))
for
innerop_name
in
data
[
'order'
]:
data
[
'data'
].
append
(
inner_op_data
[
innerop_name
])
return
data
    def get_operator_table(  # noqa: C901 Todo: Optimize code
            self,
            group_by='op_name',
            search_name=None,
            time_unit='ms'):
        def get_children_data(event):
            datas = []
            for innerop_name, item in event.operator_inners.items():
                if item.cpu_time == 0:
                    cpu_ratio = 0
                else:
                    cpu_ratio = float(item.cpu_time) / event.cpu_time
                if item.general_gpu_time == 0:
                    gpu_ratio = 0
                else:
                    gpu_ratio = float(
                        item.general_gpu_time) / event.general_gpu_time
                data = {
                    "name": innerop_name,
                    "calls": item.call,
                    "cpu_total_time": format_time(item.cpu_time, time_unit),
                    "cpu_avg_time": format_time(item.avg_cpu_time, time_unit),
                    "cpu_max_time": format_time(item.max_cpu_time, time_unit),
                    "cpu_min_time": format_time(item.min_cpu_time, time_unit),
                    "cpu_ratio": format_ratio(cpu_ratio),
                    "gpu_total_time": format_time(item.general_gpu_time,
                                                  time_unit),
                    "gpu_avg_time": format_time(item.avg_general_gpu_time,
                                                time_unit),
                    "gpu_max_time": format_time(item.max_general_gpu_time,
                                                time_unit),
                    "gpu_min_time": format_time(item.min_general_gpu_time,
                                                time_unit),
                    "gpu_ratio": format_ratio(gpu_ratio)
                }
                datas.append(data)
            return datas

        data = OrderedDict()
        data['events'] = []
        total_cpu_time = 0
        total_gpu_time = 0
        for name, event in self.operator_items.items():
            total_cpu_time += event.cpu_time
            total_gpu_time += event.general_gpu_time

        # The table layout is identical across the search/grouping branches,
        # so the column list and the per-row dict are built in one place.
        column_name = ['name', 'calls']
        if group_by != 'op_name':
            column_name.append('input_shape')
        column_name += [
            'cpu_total_time', 'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
            'cpu_ratio'
        ]
        if self.has_gpu:
            column_name += [
                'gpu_total_time', 'gpu_avg_time', 'gpu_max_time',
                'gpu_min_time', 'gpu_ratio'
            ]
        data['column_name'] = column_name
        data['has_gpu'] = self.has_gpu

        def make_row(name, event, shape_string=None):
            row = OrderedDict()
            row["name"] = name
            row["calls"] = event.call
            children_events = get_children_data(event)
            if children_events:
                row["children"] = children_events
            if shape_string is not None:
                row["input_shape"] = shape_string
            row["cpu_total_time"] = format_time(event.cpu_time, time_unit)
            row["cpu_avg_time"] = format_time(event.avg_cpu_time, time_unit)
            row["cpu_max_time"] = format_time(event.max_cpu_time, time_unit)
            row["cpu_min_time"] = format_time(event.min_cpu_time, time_unit)
            row["cpu_ratio"] = format_ratio(
                event.cpu_time / total_cpu_time
                if total_cpu_time != 0 else 0.0)
            if self.has_gpu:
                row["gpu_total_time"] = format_time(event.general_gpu_time,
                                                    time_unit)
                row["gpu_avg_time"] = format_time(event.avg_general_gpu_time,
                                                  time_unit)
                row["gpu_max_time"] = format_time(event.max_general_gpu_time,
                                                  time_unit)
                row["gpu_min_time"] = format_time(event.min_general_gpu_time,
                                                  time_unit)
                row["gpu_ratio"] = format_ratio(
                    event.general_gpu_time / total_gpu_time
                    if total_gpu_time != 0 else 0.0)
            return row

        def shape_to_strings(input_shape):
            # input_shape keys are tab-separated 'name-shape' pairs with a
            # trailing tab, e.g. 'X-[4, 8]\tY-[4]\t'.
            if not input_shape:
                return []
            shapes = input_shape.split('\t')[:-1]
            return ['{}:{}'.format(*shape.split('-')) for shape in shapes]

        sort_key = (lambda x: x[1].general_gpu_time) if self.has_gpu \
            else (lambda x: x[1].cpu_time)
        if group_by == 'op_name':
            sorted_items = sorted(
                self.operator_items.items(), key=sort_key, reverse=True)
            for name, event in sorted_items:
                if search_name and search_name not in name:
                    continue
                data['events'].append(make_row(name, event))
        elif not search_name:
            # group by (op_name, input_shape), globally sorted by time
            new_arrange_data = {}
            for op_name, items_with_input_shape in \
                    self.operator_items_with_input_shape.items():
                for input_shape, item in items_with_input_shape.items():
                    new_arrange_data[(op_name, input_shape)] = item
            sorted_items = sorted(
                new_arrange_data.items(), key=sort_key, reverse=True)
            for (name, input_shape), event in sorted_items:
                data['events'].append(
                    make_row(name, event, shape_to_strings(input_shape)))
        else:
            # group by (op_name, input_shape) restricted to matching ops;
            # ops are ordered by time, shapes kept in recorded order
            sorted_ops = sorted(
                self.operator_items.items(), key=sort_key, reverse=True)
            results = [
                op_name for op_name, item in sorted_ops
                if search_name in op_name
            ]
            for op_name in results:
                shape_items = self.operator_items_with_input_shape[op_name]
                for input_shape, event in shape_items.items():
                    data['events'].append(
                        make_row(op_name, event,
                                 shape_to_strings(input_shape)))
        return data
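The `input_shape` keys consumed above are tab-separated `name-shape` records with a trailing tab. A quick sketch of the parsing, using a hypothetical key (the real keys are produced by the event parser):

input_shape = 'X-[4, 8]\tFilter-[16, 4, 3, 3]\t'
shapes = input_shape.split('\t')[:-1]  # drop the empty trailing field
shape_string = ['{}:{}'.format(*shape.split('-')) for shape in shapes]
print(shape_string)  # ['X:[4, 8]', 'Filter:[16, 4, 3, 3]']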
    def get_kernel_pie(self, topk, time_unit='ms'):
        data = OrderedDict()
        data['column_name'] = [
            "name", "calls", "total_time", "avg_time", "max_time",
            "min_time", "mean blocks per sm", "mean est achieved occupancy",
            "tensor core used", "ratio"
        ]
        data['events'] = []
        sorted_items = sorted(
            self.kernel_items.items(),
            key=lambda x: x[1].gpu_time,
            reverse=True)
        if topk <= 0:
            items = sorted_items
        else:
            items = sorted_items[:topk]
        total_gpu_time = 0.0
        for kernel_name, item in items:
            total_gpu_time += item.gpu_time
        for kernel_name, item in items:
            gpu_stage_data = OrderedDict()
            gpu_stage_data['name'] = kernel_name
            gpu_stage_data['calls'] = item.call
            gpu_stage_data['total_time'] = format_time(item.gpu_time,
                                                       time_unit)
            gpu_stage_data['avg_time'] = format_time(item.avg_gpu_time,
                                                     time_unit)
            gpu_stage_data['max_time'] = format_time(item.max_gpu_time,
                                                     time_unit)
            gpu_stage_data['min_time'] = format_time(item.min_gpu_time,
                                                     time_unit)
            gpu_stage_data['mean blocks per sm'] = format_float(
                item.sum_blocks_per_sm / item.call)
            gpu_stage_data['mean est achieved occupancy'] = format_float(
                item.sum_occupancy / item.call)
            gpu_stage_data['tensor core used'] = item.tensorcore_used
            gpu_stage_data['ratio'] = format_ratio(item.gpu_time /
                                                   total_gpu_time)
            data['events'].append(gpu_stage_data)
        return data
    def get_kernel_table(self, group_by='', search_name=None,
                         time_unit='ms'):
        data = OrderedDict()
        data['events'] = []
        total_gpu_time = 0
        for name, event in self.kernel_items.items():
            total_gpu_time += event.gpu_time

        base_columns = [
            "name", "calls", "total_time", "avg_time", "max_time",
            "min_time", "mean blocks per sm", "mean est achieved occupancy",
            "tensor core used", "ratio"
        ]
        attribute_columns = [
            "operator", "grid", "block", "register per thread",
            "shared memory"
        ]

        def make_kernel_row(name, item, attributes=None):
            row = OrderedDict()
            row['name'] = name
            row['calls'] = item.call
            if attributes is not None:
                # attributes are serialized as
                # 'operator-grid-block-registers-shared_memory'
                (operator, grid, block, register_per_thread,
                 shared_memory) = attributes.split('-')
                row['operator'] = operator
                row['grid'] = grid
                row['block'] = block
                row['register per thread'] = register_per_thread
                row['shared memory'] = shared_memory
            row['total_time'] = format_time(item.gpu_time, time_unit)
            row['avg_time'] = format_time(item.avg_gpu_time, time_unit)
            row['max_time'] = format_time(item.max_gpu_time, time_unit)
            row['min_time'] = format_time(item.min_gpu_time, time_unit)
            row['mean blocks per sm'] = format_float(
                item.sum_blocks_per_sm / item.call)
            row['mean est achieved occupancy'] = format_float(
                item.sum_occupancy / item.call)
            row['tensor core used'] = item.tensorcore_used
            row['ratio'] = format_ratio(item.gpu_time / total_gpu_time)
            return row

        if group_by == 'kernel_name':
            data['column_name'] = base_columns
            sorted_items = sorted(
                self.kernel_items.items(),
                key=lambda x: x[1].gpu_time,
                reverse=True)
            for name, item in sorted_items:
                if search_name and search_name not in name:
                    continue
                data['events'].append(make_kernel_row(name, item))
        else:
            data['column_name'] = base_columns[:2] + attribute_columns + \
                base_columns[2:]
            if not search_name:
                new_arrange_data = {}
                for name, items_with_attributes in \
                        self.kernel_items_with_op_name_attributes.items():
                    for attributes, item in items_with_attributes.items():
                        new_arrange_data[(name, attributes)] = item
                sorted_items = sorted(
                    new_arrange_data.items(),
                    key=lambda x: x[1].gpu_time,
                    reverse=True)
                for (name, attributes), item in sorted_items:
                    data['events'].append(
                        make_kernel_row(name, item, attributes))
            else:
                sorted_kernels = sorted(
                    self.kernel_items.items(),
                    key=lambda x: x[1].gpu_time,
                    reverse=True)
                results = [
                    kernel_name for kernel_name, item in sorted_kernels
                    if search_name in kernel_name
                ]
                for kernel_name in results:
                    for attributes, item in \
                            self.kernel_items_with_op_name_attributes[
                                kernel_name].items():
                        data['events'].append(
                            make_kernel_row(kernel_name, item, attributes))
        return data
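The per-kernel `attributes` key packs five launch properties into one dash-separated string; a sketch with a hypothetical key:

attributes = 'conv2d-[64, 1, 1]-[256, 1, 1]-32-0'
operator, grid, block, registers, shared_memory = attributes.split('-')
print(operator, grid, block, registers, shared_memory)
# conv2d [64, 1, 1] [256, 1, 1] 32 0

Note that the unpacking relies on none of the five fields containing a dash of its own.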
    def get_kernel_tc_pie(self, topk, time_unit='ms'):
        data = OrderedDict()
        data['column_name'] = ["name", "calls", "ratio"]
        data['events'] = []
        sorted_items = sorted(
            self.kernel_items.items(),
            key=lambda x: x[1].gpu_time,
            reverse=True)
        if topk <= 0:
            items = sorted_items
        else:
            items = sorted_items[:topk]
        total_calls = 0.0
        tensorcore_calls = 0.0
        for kernel_name, item in items:
            if item.tensorcore_used:
                tensorcore_calls += item.call
            total_calls += item.call
        data['events'].append({
            "name": "Tensor core used",
            "calls": tensorcore_calls,
            "ratio": format_ratio(tensorcore_calls / total_calls)
        })
        data['events'].append({
            "name": "Tensor core unused",
            "calls": total_calls - tensorcore_calls,
            "ratio": format_ratio(
                (total_calls - tensorcore_calls) / total_calls)
        })
        return data
    def get_trace_data(self):
        return self.trace_parser.content

    def get_memory_devices(self):
        data = []
        for device in self.memory_curve.keys():
            data.append({
                "device": device,
                "min_size": format_memory(self.size_ranges[device][0],
                                          'KB'),
                "max_size": format_memory(self.size_ranges[device][1],
                                          'KB'),
                "max_allocation_size": format_memory(
                    self.peak_allocation_values[device], 'KB'),
            })
        return data
    def get_memory_curve(self, device_type, time_unit='ms'):
        curves = self.memory_curve[device_type]
        data = {}
        data['name'] = {
            'Allocated': '已分配',
            'Reserved': '已预留',
            'PeakAllocated': '最大已分配',
            'PeakReserved': '最大已预留'
        }
        for key, events in curves.items():
            data[key] = []
            sorted_events = sorted(events, key=lambda x: x[0])
            for item in sorted_events:
                timestamp = item[0]
                size = item[1]
                event_name = item[2]
                data[key].append([
                    format_time(timestamp, time_unit),
                    format_memory(size, 'KB'), event_name
                ])
        return data
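Each raw curve point is a `(timestamp, size, event_name)` triple and the formatted payload keeps that order. A hypothetical example of one formatted series (values invented; the exact units come from format_time and format_memory, which are defined elsewhere in this module):

# hypothetical formatted payload from get_memory_curve('gpu:0')
curve = {
    'name': {'Allocated': '已分配', 'Reserved': '已预留',
             'PeakAllocated': '最大已分配', 'PeakReserved': '最大已预留'},
    # each point: [timestamp, size, name of the event that changed the size]
    'Allocated': [[12.5, 4.0, 'conv2d'], [13.1, 8.0, 'matmul']],
}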
    def get_memory_events(self,
                          device_type,
                          min_size=0,
                          max_size=float('inf'),
                          search_name=None,
                          time_unit='ms'):
        data = {}
        data['column_name'] = [
            'MemoryAddr', 'MemoryType', 'AllocatedEvent',
            'AllocatedTimestamp', 'FreeEvent', 'FreeTimestamp', 'Duration',
            'Size'
        ]
        data['data'] = []
        paired_event_list = self.paired_events[device_type]

        def filter_func(item):
            nonlocal min_size
            nonlocal max_size
            nonlocal search_name
            size = format_memory(item[-1], 'KB')
            if not search_name:
                if size >= min_size and size <= max_size:
                    return True
            else:
                if size >= min_size and size <= max_size:
                    if item[2]:
                        if search_name in item[2]:
                            return True
                    if item[4]:
                        if search_name in item[4]:
                            return True
            return False

        paired_event_list = filter(filter_func, paired_event_list)
        paired_event_list = sorted(paired_event_list, key=lambda x: x[-1])
        if not paired_event_list:
            return data
        duration = None
        for item in paired_event_list:
            if item[3] and item[5]:
                duration = item[5] - item[3]
            else:
                duration = None
            data['data'].append({
                "MemoryAddr": item[0],
                "MemoryType": item[1],
                "AllocatedEvent": item[2],
                "AllocatedTimestamp": format_time(item[3], time_unit)
                if item[3] else None,
                "FreeEvent": item[4],
                "FreeTimestamp": format_time(item[5], time_unit)
                if item[5] else None,
                "Duration": format_time(duration, time_unit)
                if duration is not None else None,
                "Size": format_memory(item[6], 'KB')
            })
        return data
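Each paired event above is a 7-tuple whose indices line up with the table columns; a hypothetical example of the layout:

paired_event = (
    '0x7f3a4c000000',  # [0] MemoryAddr
    'Allocated',       # [1] MemoryType
    'conv2d',          # [2] AllocatedEvent
    1000.0,            # [3] AllocatedTimestamp
    'conv2d_grad',     # [4] FreeEvent
    1800.0,            # [5] FreeTimestamp
    4096.0,            # [6] Size, also reached as item[-1] in filter_func
)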
    def get_op_memory_events(self, device_type, search_name=None):
        data = {}
        data['column_name'] = [
            'EventName', 'MemoryType', 'AllocationCount', 'FreeCount',
            'AllocationSize', 'FreeSize', 'IncreasedSize'
        ]
        data['data'] = []
        allocated_events = self.allocated_items[device_type]

        def filter_func(item):
            nonlocal search_name
            if not search_name:
                return True
            else:
                if search_name in item[0]:
                    return True
            return False

        reserved_events = self.reserved_items[device_type]
        all_events = [(key, item) for key, item in allocated_events.items()]
        all_events.extend(
            [(key, item) for key, item in reserved_events.items()])
        if search_name:
            all_events = filter(filter_func, all_events)
        sorted_items = sorted(
            all_events, key=lambda x: x[1].increase_size, reverse=True)
        if not sorted_items:
            return data
        for event_name, item in sorted_items:
            data['data'].append({
                'EventName': event_name,
                'MemoryType': item.memory_type,
                'AllocationCount': item.allocation_count,
                'FreeCount': item.free_count,
                'AllocationSize': format_memory(item.allocation_size, 'KB'),
                'FreeSize': format_memory(item.free_size, 'KB'),
                'IncreasedSize': format_memory(item.increase_size, 'KB')
            })
        return data
class DistributedProfilerData:
    '''
    Hold data for distributed view.
    Aggregate all data for distributed in ProfileData object.
    '''

    def __init__(self, run, span, profile_datas):
        self.run = run
        self.span = span
        self.profile_datas = profile_datas

    def get_distributed_info(self):
        data = []
        for profile_data in self.profile_datas:
            device_infos = profile_data.device_infos
            gpu_id = int(next(iter(profile_data.gpu_ids)))
            data.append({
                'worker_name': profile_data.worker_name,
                'process_id': 'pid: {}'.format(profile_data.process_id),
                'device_id': 'GPU{}'.format(gpu_id),
                'name': device_infos[gpu_id]['name'],
                'memory': "{} GB".format(
                    format_memory(device_infos[gpu_id]['totalGlobalMem'],
                                  'GB')),
                'computeCapability': '{}.{}'.format(
                    device_infos[gpu_id]['computeMajor'],
                    device_infos[gpu_id]['computeMinor']),
                'utilization': '{}%'.format(
                    format_ratio(profile_data.gpu_ulitization))
            })
        return data

    def get_distributed_histogram(self, step, time_unit='ms'):
        data = {}
        data['order'] = [
            "ProfileStep", "Communication", "Computation", "Overlap",
            "Others"
        ]
        data['worker_name'] = []
        data['data'] = []
        new_data = defaultdict(list)
        for profile_data in self.profile_datas:
            data['worker_name'].append(profile_data.worker_name)
            if step != 'All':
                new_data['ProfileStep'].append(
                    format_time(
                        profile_data.model_perspective_items['ProfileStep']
                        .cpu_times[step], time_unit))
            else:
                new_data['ProfileStep'].append(
                    format_time(
                        profile_data.model_perspective_items['ProfileStep']
                        .cpu_time, time_unit))
            new_data['Communication'].append(
                format_time(
                    profile_data.distributed_time[step]
                    ['communication_time'], time_unit))
            new_data['Computation'].append(
                format_time(
                    profile_data.distributed_time[step]['computation_time'],
                    time_unit))
            new_data['Overlap'].append(
                format_time(
                    profile_data.distributed_time[step]['overlap_time'],
                    time_unit))
            new_data['Others'].append(
                format_time(
                    profile_data.distributed_time[step]['others_time'],
                    time_unit))
        for order in data['order']:
            data['data'].append(new_data[order])
        return data

    def get_distributed_steps(self):
        for profile_data in self.profile_datas:
            steps = list(profile_data.distributed_time.keys())
        final_steps = ['All'] + sorted(
            [int(step) for step in steps if step != 'All'])
        return final_steps
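A hypothetical example of the histogram payload for a two-worker run, matching the lists built above (all numbers invented):

histogram = {
    'order': ['ProfileStep', 'Communication', 'Computation', 'Overlap',
              'Others'],
    'worker_name': ['worker0', 'worker1'],
    # data[i] holds one value per worker for the category order[i]
    'data': [[120.0, 118.5], [30.2, 28.9], [80.1, 79.4], [12.0, 11.3],
             [9.7, 9.9]],
}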
visualdl/component/profiler/profiler_reader.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import os
import re
from threading import Thread

from multiprocess import Process
from multiprocess import Queue

from .parser.const_description import *  # noqa: F403
from .parser.event_node import load_profiler_json
from .run_manager import RunManager
from visualdl.io import bfile

_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")


def is_VDLProfiler_file(path):
    """Determine whether it is a paddle profile file that can be read by vdl according to the file name.

    File name of a paddle profile file must contain `paddle_trace`.

    Args:
        path: File name to determine.
    Returns:
        True if the file is a paddle profile file, otherwise false.
    """
    if "paddle_trace" not in path:
        return False
    return True


class ProfilerReader(object):
    """Profile reader to read paddle profile files, support for frontend api in lib.py.
    """

    def __init__(self, logdir=''):
        """Instance of ProfileReader

        Args:
            logdir: The dir including paddle profile files, multiple subfolders allowed.
        """
        if isinstance(logdir, str):
            self.dir = [logdir]
        else:
            self.dir = logdir

        self.walks = {}
        self.displayname2runs = {}
        self.runs2displayname = {}
        self.run_managers = {}
        self.profile_result_queue = Queue()
        self.tempfile = None
        self.runs()
        Thread(target=self._get_data_from_queue, args=()).start()

    @property
    def logdir(self):
        return self.dir

    def get_all_walk(self):
        flush_walks = {}
        for dir in self.dir:
            for root, dirs, files in bfile.walk(dir):
                flush_walks.update({root: files})
        return flush_walks

    def get_run_manager(self, run):
        if run in self.run_managers:
            self.run_managers[run].join()
            return self.run_managers[run]
        else:
            return None

    def profile_runs(self, update=False):
        """Get profile run files.

        Every dir (a `run` in vdl) may have more than one profiler file.

        Returns:
            walks: A dict like {"exp1": ["1587375595_paddle_trace.json", "1587375685_paddle_trace.json"],
                                "exp2": ["1587375686_paddle_trace.json"]}
        """
        if not self.walks or update is True:
            flush_walks = self.get_all_walk()
            walks_temp = {}
            for run, filenames in flush_walks.items():
                tags_temp = [
                    filename for filename in filenames
                    if is_VDLProfiler_file(filename)
                ]
                if len(tags_temp) > 0:
                    walks_temp.update({run: tags_temp})
            self.walks = walks_temp
        return self.walks

    def runs(self, update=True):
        self.profile_runs(update=update)
        for run, filenames in self.walks.items():
            if run not in self.run_managers:
                self.run_managers[run] = RunManager(run)
            self.run_managers[run].set_all_filenames(filenames)
            for filename in filenames:
                if self.run_managers[run].has_handled(filename):
                    continue
                self._read_data(run, filename)
        return list(self.walks.keys())

    def get_descriptions(self, lang):
        if lang == 'zh':
            return {
                "overview_environment": TOOLTIP_DEVICE_INFO_CN,  # noqa: F405
                "overview_model_perspective":
                    TOOLTIP_MODEL_PERSPECTIVE_CN,  # noqa: F405
                "overview_model_perspective_perstep":
                    TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN,  # noqa: F405
                "overview_event_type_perspective":
                    TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN,  # noqa: F405
                "overview_event_type_model_perspective":
                    TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN,  # noqa: F405
                "distributed_histogram":
                    TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN  # noqa: F405
            }
        else:
            return {
                "overview_environment": TOOLTIP_DEVICE_INFO_EN,  # noqa: F405
                "overview_model_perspective":
                    TOOLTIP_MODEL_PERSPECTIVE_EN,  # noqa: F405
                "overview_model_perspective_perstep":
                    TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN,  # noqa: F405
                "overview_event_type_perspective":
                    TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN,  # noqa: F405
                "overview_event_type_model_perspective":
                    TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN,  # noqa: F405
                "distributed_histogram":
                    TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN  # noqa: F405
            }

    def set_displayname(self, log_reader):
        self.displayname2runs = log_reader.name2tags
        self.runs2displayname = log_reader.tags2name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def _get_data_from_queue(self):
        while True:
            try:
                run, filename, worker_name, profile_result = \
                    self.profile_result_queue.get()
                self.run_managers[run].add_profile_result(
                    filename, worker_name, profile_result)
            except Exception as e:
                print('Read profiler data error in multiprocess, error: {}'.
                      format(e))

    def _read_data(self, run, filename):
        match = _name_pattern.match(filename)
        if match:
            worker_name = match.group(1)
            if '.pb' in filename:
                try:
                    from paddle.profiler import load_profiler_result
                except Exception:
                    print('Load paddle.profiler error. '
                          'Please check paddle >= 2.3.0')
                    exit(0)
                profile_result = load_profiler_result(
                    os.path.join(run, filename))
                self.run_managers[run].add_profile_result(
                    filename, worker_name, profile_result)
            else:

                def _load_profiler_json(run, filename, worker_name):
                    profile_result = load_profiler_json(
                        os.path.join(run, filename))
                    self.profile_result_queue.put(
                        (run, filename, worker_name, profile_result))

                Process(
                    target=_load_profiler_json,
                    args=(run, filename, worker_name)).start()
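For reference, a quick check of how `_name_pattern` extracts the worker name; the file name below is a hypothetical instance of the `{worker}_time_{timestamp}.paddle_trace.json` convention:

import re

_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")
match = _name_pattern.match('host0_pid1234_time_2022_08_23.paddle_trace.json')
if match:
    print(match.group(1))  # host0_pid1234 -> used as the worker name
    print(match.group(2))  # 2022_08_23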
visualdl/component/profiler/profiler_server.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json

from .profiler_reader import ProfilerReader
from visualdl.server.api import gen_result
from visualdl.server.api import result


class ProfilerApi(object):
    def __init__(self, logdir):
        self._reader = ProfilerReader(logdir)

    @result()
    def runs(self):
        return self._reader.runs()

    @result()
    def views(self, run):
        run_manager = self._reader.get_run_manager(run)
        if run_manager is None:
            return []
        return list(run_manager.get_views())

    @result()
    def workers(self, run, view):
        if view == 'Distributed':
            return ['All']
        run_manager = self._reader.get_run_manager(run)
        return run_manager.get_workers(view)

    @result()
    def spans(self, run, worker):
        run_manager = self._reader.get_run_manager(run)
        if worker == 'All':
            return run_manager.get_distributed_spans()
        return run_manager.get_spans(worker)

    @result()
    def timeunits(self):
        return ['ns', 'us', 'ms', 's']

    @result()
    def descriptions(self, lang):
        if lang == 'undefined' or lang is None:
            lang = 'zh'
        lang = lang.lower()
        return self._reader.get_descriptions(lang)

    @result()
    def overview_environment(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        span = str(span)
        profiler_data = run_manager.get_profiler_data(worker, span)
        result = profiler_data.get_device_infos()
        num_workers = len(run_manager.get_workers('Overview'))
        result['num_workers'] = num_workers
        return result

    @result()
    def model_perspective(self, run, worker, span, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_model_perspective(time_unit)

    @result()
    def model_perspective_perstep(self, run, worker, span, device_type,
                                  time_unit='ms'):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_model_perspective_perstep(
            device_type, time_unit)

    @result()
    def event_type_perspective(self, run, worker, span, device_type,
                               time_unit='ms'):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_event_type_perspective(
            device_type, time_unit)

    @result()
    def event_type_model_perspective(self, run, worker, span,
                                     time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_event_type_model_perspective(time_unit)

    @result()
    def userdefined_perspective(self, run, worker, span, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_userdefined_perspective(time_unit)

    @result()
    def operator_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_operator_pie(topk, time_unit)

    @result()
    def operator_pie_expand(self, run, worker, span, topk, device_type,
                            time_unit):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_operator_pie_expand(
            topk, device_type, time_unit)

    @result()
    def operator_table(self, run, worker, span, group_by, search_name,
                       time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_operator_table(group_by, search_name,
                                                time_unit)

    @result()
    def operator_stack_table(self, run, worker, span, op_name, group_by,
                             input_shape, time_unit='ms'):
        # placeholder, not implemented yet
        pass

    @result()
    def kernel_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_kernel_pie(topk, time_unit)

    @result()
    def kernel_table(self, run, worker, span, group_by, search_name,
                     time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_kernel_table(group_by, search_name,
                                              time_unit)

    @result()
    def kernel_tc_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_kernel_tc_pie(topk, time_unit)

    @result()
    def distributed_info(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        if distributed_profiler_data is None:
            return
        return distributed_profiler_data.get_distributed_info()

    @result()
    def distributed_steps(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        return distributed_profiler_data.get_distributed_steps()

    @result()
    def distributed_histogram(self, run, worker, span, step,
                              time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        return distributed_profiler_data.get_distributed_histogram(
            step, time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def trace(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_trace_data()

    @result()
    def memory_devices(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_devices()

    @result(headers={'content-encoding': 'gzip'})
    def memory_curve(self, run, worker, span, device_type, time_unit='ms'):
        if device_type == 'undefined':
            return
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_curve(device_type, time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def memory_events(self,
                      run,
                      worker,
                      span,
                      device_type,
                      min_size=0,
                      max_size=float('inf'),
                      search_name=None,
                      time_unit='ms'):
        if device_type == 'undefined':
            return
        try:
            min_size = float(min_size)
        except Exception:
            min_size = 0
        try:
            max_size = float(max_size)
        except Exception:
            max_size = float('inf')
        if search_name == 'undefined' or not search_name:
            search_name = None
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_events(device_type, min_size,
                                               max_size, search_name,
                                               time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def op_memory_events(self, run, worker, span, device_type,
                         search_name=None):
        if search_name == 'undefined' or not search_name:
            search_name = None
        if device_type == 'undefined':
            return
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_op_memory_events(device_type, search_name)

    # The comparison endpoints below are placeholders, not implemented yet.
    @result()
    def comparison_phase(self, base_run, base_worker, base_span, exp_run,
                         exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_diff(self, base_run, base_worker, base_span,
                              exp_run, exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_table(self, base_run, base_worker, base_span,
                               exp_run, exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_inner(self, base_run, base_worker, base_span,
                               exp_run, exp_worker, exp_span, phase_name):
        pass

    @result()
    def comparison_phase_diff_inner(self, base_run, base_worker, base_span,
                                    exp_run, exp_worker, exp_span,
                                    phase_name):
        pass

    @result()
    def comparison_phase_table_inner(self, base_run, base_worker, base_span,
                                     exp_run, exp_worker, exp_span,
                                     phase_name):
        pass


def create_profiler_api_call(logdir):
    api = ProfilerApi(logdir)
    routes = {
        'runs': (api.runs, []),
        'views': (api.views, ["run"]),
        'workers': (api.workers, ["run", "view"]),
        'spans': (api.spans, ["run", "worker"]),
        'timeunits': (api.timeunits, []),
        'descriptions': (api.descriptions, ["lang"]),
        'overview/environment': (api.overview_environment,
                                 ["run", "worker", "span"]),
        'overview/model_perspective':
            (api.model_perspective,
             ["run", "worker", "span", "time_unit"]),
        'overview/model_perspective_perstep':
            (api.model_perspective_perstep,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'overview/event_type_perspective':
            (api.event_type_perspective,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'overview/event_type_model_perspective':
            (api.event_type_model_perspective,
             ["run", "worker", "span", "time_unit"]),
        'overview/userdefined_perspective':
            (api.userdefined_perspective,
             ["run", "worker", "span", "time_unit"]),
        'operator/pie': (api.operator_pie,
                         ["run", "worker", "span", "topk", "time_unit"]),
        'operator/pie_expand':
            (api.operator_pie_expand,
             ["run", "worker", "span", "topk", "device_type", "time_unit"]),
        'operator/table':
            (api.operator_table,
             ["run", "worker", "span", "group_by", "search_name",
              "time_unit"]),
        'operator/stack_table':
            (api.operator_stack_table,
             ["run", "worker", "span", "op_name", "group_by", "input_shape",
              "time_unit"]),
        'kernel/pie': (api.kernel_pie,
                       ["run", "worker", "span", "topk", "time_unit"]),
        'kernel/tensorcore_pie':
            (api.kernel_tc_pie,
             ["run", "worker", "span", "topk", "time_unit"]),
        'kernel/table':
            (api.kernel_table,
             ["run", "worker", "span", "group_by", "search_name",
              "time_unit"]),
        'distributed/info': (api.distributed_info,
                             ["run", "worker", "span"]),
        'distributed/steps': (api.distributed_steps,
                              ["run", "worker", "span"]),
        'distributed/histogram':
            (api.distributed_histogram,
             ["run", "worker", "span", "step", "time_unit"]),
        'trace': (api.trace, ["run", "worker", "span"]),
        'memory/devices': (api.memory_devices, ["run", "worker", "span"]),
        'memory/curve':
            (api.memory_curve,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'memory/memory_events':
            (api.memory_events,
             ["run", "worker", "span", "device_type", "min_size",
              "max_size", "search_name", "time_unit"]),
        'memory/op_memory_events':
            (api.op_memory_events,
             ["run", "worker", "span", "device_type", "search_name"]),
        'comparison/phase':
            (api.comparison_phase,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_diff':
            (api.comparison_phase_diff,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_table':
            (api.comparison_phase_table,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_inner':
            (api.comparison_phase_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"]),
        'comparison/phase_diff_inner':
            (api.comparison_phase_diff_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"]),
        'comparison/phase_table_inner':
            (api.comparison_phase_table_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"])
    }

    def call(path: str, args):
        route = routes.get(path)
        if not route:
            return json.dumps(
                gen_result(status=1, msg='api not found')), \
                'application/json', None
        method, call_arg_names = route
        call_args = [args.get(name) for name in call_arg_names]
        return method(*call_args)

    return call
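A minimal sketch of driving the returned closure directly; the logdir below is hypothetical, and app.py passes `request.args` in exactly this position:

call = create_profiler_api_call('./log')
data, mimetype, headers = call('runs', {})           # JSON list of runs
data, mimetype, headers = call('no/such/route', {})  # status 1, 'api not found'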
visualdl/component/profiler/run_manager.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict
from threading import Thread

from .profiler_data import DistributedProfilerData
from .profiler_data import ProfilerData


class RunManager:
    '''
    Manage profile data for each run; each run may have multiple workers and
    spans. We should manage profile data of each (worker, span) unit.
    Besides, a special worker "all" is created to merge all profile data for
    the distributed view.
    '''

    def __init__(self, run):
        self.run = run
        # worker:
        #   span:
        #     ProfileData
        self.profiler_data = defaultdict(dict)
        self.all_filenames = set()
        self.handled_filenames = set()
        # span:
        #   DistributedProfileData
        self.distributed_data = {}
        self.threads = {}
        self.has_join = False

    def get_profiler_data(self, worker, span):
        if worker in self.profiler_data:
            if span in self.profiler_data[worker]:
                return self.profiler_data[worker][span]

    def get_distributed_profiler_data(self, span):
        if span in self.distributed_data:
            return self.distributed_data[span]

    def get_views(self):
        '''
        Return all views supported in current run data.
        '''
        all_views = set()
        for worker, span_data in self.profiler_data.items():
            for span, profiler_data in span_data.items():
                all_views.update(profiler_data.get_views())
        ordered_views = [
            'Overview', 'Operator', 'GPU Kernel', 'Distributed', 'Trace',
            'Memory'
        ]
        final_views = []
        for view in ordered_views:
            if view in all_views:
                final_views.append(view)
        return final_views

    def get_workers(self, view_name):
        '''
        Return all workers (processes) in current run data.
        '''
        workers = []
        for worker, span_data in self.profiler_data.items():
            for span, profiler_data in span_data.items():
                if view_name in profiler_data.get_views():
                    workers.append(worker)
                    break
        return workers

    def get_spans(self, worker_name):
        '''
        Return all spans in current run data.
        spans: Collecting profile data when training your model can be divided into several parts supported by \
          the paddle.profiler api; for example, you may profile steps 2-4 and 6-8. Each range is called a span here, \
          and we index each span by order.
        '''
        spans = list(self.profiler_data[worker_name].keys())
        spans = sorted([int(span) for span in spans])
        spans = [str(span) for span in spans]
        return spans

    def get_distributed_spans(self):
        spans = list(self.distributed_data.keys())
        spans = sorted([int(span) for span in spans])
        spans = [str(span) for span in spans]
        return spans

    def _parse_file(self, worker_name, result):
        span = result.get_span_idx()
        self.profiler_data[worker_name][span] = ProfilerData(
            self.run, worker_name, span, result)
        return

    def join(self):
        if self.has_join:
            return
        for thread in self.threads.values():
            thread.join()
        self.has_join = True
        distributed_profiler_data = defaultdict(list)
        for worker_name, span_data in self.profiler_data.items():
            for span_idx, profiler_data in span_data.items():
                distributed_profiler_data[span_idx].append(profiler_data)
        for span_idx, profiler_datas in distributed_profiler_data.items():
            self.distributed_data[span_idx] = DistributedProfilerData(
                self.run, span_idx, profiler_datas)

    def add_profile_result(self, filename, worker_name, profile_result):
        thread = Thread(
            target=self._parse_file, args=(worker_name, profile_result))
        thread.start()
        self.handled_filenames.add(filename)
        self.threads[filename] = thread

    def set_all_filenames(self, filenames):
        self.all_filenames.update(filenames)

    def has_handled(self, filename):
        if filename in self.handled_filenames:
            return True
        else:
            return False
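Spans are kept as strings but ordered numerically, which is why `get_spans` and `get_distributed_spans` round-trip through `int`; a small sketch of the effect:

spans = ['10', '2', '1']
spans = sorted([int(span) for span in spans])
spans = [str(span) for span in spans]
print(spans)  # ['1', '2', '10'], not the lexicographic ['1', '10', '2']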
visualdl/server/api.py
View file @ d251028d

@@ -14,8 +14,10 @@
 # limitations under the License.
 # =======================================================================
 import functools
+import gzip
 import json
 import os
+from io import BytesIO
 
 from flask import request
@@ -52,6 +54,13 @@ def result(mimetype='application/json', headers=None):
                 headers_output = headers(self)
             else:
                 headers_output = headers
+            if headers is not None:
+                if 'content-encoding' in headers:
+                    buf = BytesIO()
+                    with gzip.GzipFile(mode='wb', fileobj=buf) as fp:
+                        gzip_value = data.encode()
+                        fp.write(gzip_value)
+                    data = buf.getvalue()
             return data, mimetype, headers_output
     return wrapper
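Clients of the gzip-encoded endpoints can invert this step explicitly, although an HTTP client will normally do it transparently when the content-encoding header is set. A minimal sketch with a hypothetical payload:

import gzip
import json

compressed = gzip.compress(json.dumps({'status': 0}).encode())
payload = json.loads(gzip.decompress(compressed).decode())
print(payload)  # {'status': 0}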
visualdl/server/app.py
View file @ d251028d

@@ -32,6 +32,7 @@ from flask_babel import Babel
 
 import visualdl.server
 from visualdl import __version__
+from visualdl.component.profiler.profiler_server import create_profiler_api_call
 from visualdl.server.api import create_api_call
 from visualdl.server.args import parse_args
 from visualdl.server.args import ParseArgs
@@ -52,7 +53,7 @@ mock_data_path = os.path.join(SERVER_DIR, "./mock_data/")
 check_live_path = '/alive'
 
 
-def create_app(args):
+def create_app(args):  # noqa: C901
     # disable warning from flask
     cli = sys.modules['flask.cli']
     cli.show_server_banner = lambda *x: None
@@ -66,7 +67,7 @@ def create_app(args):
     app.config['BABEL_DEFAULT_LOCALE'] = default_language
     babel = Babel(app)
     api_call = create_api_call(args.logdir, args.model, args.cache_timeout)
+    profiler_api_call = create_profiler_api_call(args.logdir)
 
     if args.telemetry:
         update_util.PbUpdater(args.product).start()
@@ -134,6 +135,12 @@ def create_app(args):
         return make_response(
             Response(data, mimetype=mimetype, headers=headers))
 
+    @app.route(api_path + '/profiler/<path:method>', methods=["GET", "POST"])
+    def serve_profiler_api(method):
+        data, mimetype, headers = profiler_api_call(method, request.args)
+        return make_response(
+            Response(data, mimetype=mimetype, headers=headers))
+
     @app.route(check_live_path)
     def check_live():
         return '', 204
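With this route in place, every profiler endpoint hangs off `/profiler/`. A hypothetical request against a local server, assuming the default port 8040 and `/api` prefix:

import requests

resp = requests.get('http://localhost:8040/api/profiler/runs')
print(resp.json())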
visualdl/version.py
View file @ d251028d

@@ -13,4 +13,4 @@
 # limitations under the License.
 # =======================================================================
 
-vdl_version = '2.3.0'
+vdl_version = '2.4.0'