Unverified  Commit d251028d authored by chenjian, committed by GitHub

Add profiler backend

Parent 8b1815f3
@@ -12,15 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import numpy as np
from PIL import Image

from visualdl.proto.record_pb2 import Record


def scalar(tag, value, step, walltime=None):
    """Package data to one scalar.

    Args:
        tag (string): Data identifier
        value (float): Value of scalar
@@ -52,8 +51,7 @@ def meta_data(tag='meta_data_tag', display_name="", step=0, walltime=None):
    """
    meta = Record.MetaData(display_name=display_name)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, meta_data=meta)
    ])
@@ -82,8 +80,8 @@ def imgarray2bytes(np_array):
def make_grid(I, ncols=8):  # noqa: E741
    assert isinstance(I,
                      np.ndarray), 'plugin error, should pass numpy array here'
    if I.shape[1] == 1:
        I = np.concatenate([I, I, I], 1)  # noqa: E741
    assert I.ndim == 4 and I.shape[1] == 3 or I.shape[1] == 4
@@ -113,9 +111,11 @@ def convert_to_HWC(tensor, input_format):
    Return:
        Image of format `HWC`.
    """
    assert (len(set(input_format)) == len(input_format)
            ), "You can not use the same dimension shorthand twice. \
        input_format: {}".format(input_format)
    assert (len(tensor.shape) == len(input_format)
            ), "size of input tensor and input format are different. \
        tensor shape: {}, input_format: {}".format(tensor.shape, input_format)
    input_format = input_format.upper()
@@ -129,7 +129,8 @@ def convert_to_HWC(tensor, input_format):
        index = [input_format.find(c) for c in 'HWC']
        tensor_HWC = tensor.transpose(index)
        if tensor_HWC.shape[2] == 1:
            tensor_HWC = np.concatenate([tensor_HWC, tensor_HWC, tensor_HWC],
                                        2)
        return tensor_HWC

    if len(input_format) == 2:
@@ -202,7 +203,8 @@ def embedding(tag, labels, hot_vectors, step, labels_meta=None, walltime=None):
    for label, hot_vector in zip(labels, hot_vectors):
        if not isinstance(label, list):
            label = [label]
        embeddings.embeddings.append(
            Record.Embedding(label=label, vectors=hot_vector))

    return Record(values=[
        Record.Value(
@@ -325,7 +327,8 @@ def hparam(name, hparam_dict, metric_list, walltime):
            hparamInfo.string_value = v
            hm.hparamInfos.append(hparamInfo)
        else:
            print("The value of %s must be int, float or str, not %s" %
                  (k, str(type(v))))
    for metric in metric_list:
        metricInfo = Record.HParam.HparamInfo()
        metricInfo.name = metric
@@ -333,8 +336,7 @@ def hparam(name, hparam_dict, metric_list, walltime):
        hm.metricInfos.append(metricInfo)

    return Record(values=[
        Record.Value(id=1, tag="hparam", timestamp=walltime, hparam=hm)
    ])
@@ -389,7 +391,12 @@ def compute_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def pr_curve(tag,
             labels,
             predictions,
             step,
             walltime,
             num_thresholds=127,
             weights=None):
    """Package data to one pr_curve.
@@ -409,15 +416,16 @@ def pr_curve(tag, labels, predictions, step, walltime, num_thresholds=127,
    num_thresholds = min(num_thresholds, 127)
    prcurve_map = compute_curve(labels, predictions, num_thresholds, weights)
    return pr_curve_raw(
        tag=tag,
        tp=prcurve_map['tp'],
        fp=prcurve_map['fp'],
        tn=prcurve_map['tn'],
        fn=prcurve_map['fn'],
        precision=prcurve_map['precision'],
        recall=prcurve_map['recall'],
        step=step,
        walltime=walltime)


def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
@@ -441,7 +449,6 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    Return:
        Package with format of record_pb2.Record
    """
    """
    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -456,15 +463,10 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    if isinstance(recall, np.ndarray):
        recall = recall.astype(int).tolist()
    """
    prcurve = Record.PRCurve(
        TP=tp, FP=fp, TN=tn, FN=fn, precision=precision, recall=recall)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, pr_curve=prcurve)
    ])
@@ -518,7 +520,13 @@ def compute_roc_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def roc_curve(tag,
              labels,
              predictions,
              step,
              walltime,
              num_thresholds=127,
              weights=None):
    """Package data to one roc_curve.

    Args:
        tag (string): Data identifier
@@ -533,17 +541,19 @@ def roc_curve(tag, labels, predictions, step, walltime, num_thresholds=127, weights=None):
        Package with format of record_pb2.Record
    """
    num_thresholds = min(num_thresholds, 127)
    roc_curve_map = compute_roc_curve(labels, predictions, num_thresholds,
                                      weights)
    return roc_curve_raw(
        tag=tag,
        tp=roc_curve_map['tp'],
        fp=roc_curve_map['fp'],
        tn=roc_curve_map['tn'],
        fn=roc_curve_map['fn'],
        tpr=roc_curve_map['tpr'],
        fpr=roc_curve_map['fpr'],
        step=step,
        walltime=walltime)


def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
@@ -563,7 +573,6 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    Return:
        Package with format of record_pb2.Record
    """
    """
    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -578,12 +587,7 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    if isinstance(fpr, np.ndarray):
        fpr = fpr.astype(int).tolist()
    """
    roc_curve = Record.ROC_Curve(TP=tp, FP=fp, TN=tn, FN=fn, tpr=tpr, fpr=fpr)
    return Record(values=[
        Record.Value(
            id=step, tag=tag, timestamp=walltime, roc_curve=roc_curve)
    ])
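A minimal usage sketch of the packaging helpers above (tag names, values, and the walltime format are hypothetical; each helper returns a record_pb2.Record that the VisualDL writer can serialize):

    import time

    import numpy as np

    labels = np.random.randint(2, size=100)      # binary ground truth
    predictions = np.random.rand(100)            # predicted probabilities
    loss_record = scalar('train/loss', value=0.3, step=10,
                         walltime=int(time.time() * 1000))
    pr_record = pr_curve('train/pr', labels, predictions, step=10,
                         walltime=int(time.time() * 1000), num_thresholds=127)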
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
__all__ = [
'TOOLTIP_DEVICE_INFO_CN', 'TOOLTIP_MODEL_PERSPECTIVE_CN',
'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN',
'TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN',
'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN', 'TOOLTIP_DEVICE_INFO_EN',
'TOOLTIP_MODEL_PERSPECTIVE_EN', 'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN',
'TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN',
'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN'
]
TOOLTIP_DEVICE_INFO_CN = \
'<b class="bold">CPU进程利用率:</b><br>'\
'进程所利用到的CPU的时间 / ProfileStep的时间(即性能分析的时间跨度)<br>'\
'<b class="bold">CPU系统利用率:</b><br>'\
'整个系统所有进程利用到的CPU时间 / CPU总时间(ProfileStep的时间*CPU核心数)<br>'\
'<b class="bold">GPU利用率:</b><br>'\
'进程利用GPU计算的时间 / ProfileStep的时间,进程利用GPU计算的时间即是GPU Kernel计算的时间,越高越好<br>'\
'<b class="bold">流处理器效率:</b><br>'\
'对于流处理器处理某个GPU Kernel, 其效率为SM_Eff_i = min(Kernel所用的Blocks数量 / GPU的流处理器数量, 100%)。'\
'流处理器效率为SM_Eff_i关于每个Kernel的执行时间加权和 / ProfileStep的时间<br>'\
'<b class="bold">流处理器占用率:</b><br>'\
'对于流处理器处理某个GPU Kernel, 其占用率Occu_i = 为活跃的warp数 / 能支持的最大warp数。流处理器占用率为Occu_i关于每个Kernel执行时间的加权平均<br>'\
'<b class="bold">Tensor cores使用时间占比:</b><br>'\
'使用Tensor Cores的GPU Kernel的计算时间 / 所有Kernel的计算时间<br>'
TOOLTIP_MODEL_PERSPECTIVE_CN = \
'展示模型各阶段DataLoader, Forward, Backward, Optimization以及Other的总CPU和GPU时间。<br>'\
'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>'\
'<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>'\
'<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>'\
'<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>'\
'<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>'\
'<b class="bold">Other:</b> 其它时间<br>'
TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN = \
'展示每一个ProfileStep内模型各阶段DataLoader, Forward, Backward, Optimization以及Other的CPU和GPU时间。<br>'\
'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>'\
'<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>'\
'<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>'\
'<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>'\
'<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>'\
'<b class="bold">Other:</b> 其它时间<br>'
TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN = \
'展示不同类型的事件在模型各阶段DataLoader, Forward, Backward, Optimization以及Other的分布。<br>'\
'<b class="bold">Operator:</b> 表示框架内的算子执行<br>'\
'<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>'\
'<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>'\
'<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>'\
'<b class="bold">Memset:</b> 表示GPU的显存值设置<br>'\
'<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>'\
'<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>'\
'<b class="bold">Communication:</b> 表示分布式通信有关的事件<br>'
TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN = \
'展示在模型各阶段DataLoader, Forward, Backward, Optimization以及Other所包含的各种事件的时间。<br>'\
'<b class="bold">Operator:</b> 表示框架内的算子执行<br>'\
'<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>'\
'<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>'\
'<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>'\
'<b class="bold">Memset:</b> 表示GPU的显存值设置<br>'\
'<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>'\
'<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>'\
'<b class="bold">Communication:</b> 表示分布式通信有关的数据通信和计算事件<br>'
TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN = \
'展示模型在每个迭代过程中通信、计算以及两者重叠部分的时间。<br>'\
'<b class="bold">ProfileStep:</b> 表示某一步迭代的总时间<br>'\
'<b class="bold">Communication:</b> 表示和通信相关的时间,包括框架内打的Communication事件、和通信有关的算子和Kernel(nccl)执行的时间<br>'\
'<b class="bold">Computation:</b> 表示GPU Kernel计算的时间,但是去除了和通信有关的Kernel(nccl)<br>'\
'<b class="bold">Overlap:</b> 表示通信和计算过程并行执行时候时间相互重叠的部分<br>'\
'<b class="bold">Others:</b> 表示通信和计算之外的时间<br>'
TOOLTIP_DEVICE_INFO_EN = \
    '<b class="bold">CPU Process Utilization:</b><br>'\
    'Process CPU time / ProfileStep time (total time of profiling)<br>'\
    '<b class="bold">CPU System Utilization:</b><br>'\
    'Sum of all processes\' CPU time / CPU total time (ProfileStep time * #CPU cores)<br>'\
    '<b class="bold">GPU Utilization:</b><br>'\
    'GPU busy time / ProfileStep time. GPU busy time is the time during which at least one GPU kernel is\
 running on it.<br>'\
    '<b class="bold">Est. SM Efficiency:</b><br>'\
    'The SM efficiency for one kernel can be denoted as SM_Eff_i = min(blocks of this kernel / SM number \
of this GPU, 100%). '\
    'Est. SM efficiency of the GPU is the weighted sum of SM_Eff_i across all kernels / ProfileStep time<br>'\
    '<b class="bold">Est. Achieved Occupancy:</b><br>'\
    'The SM occupancy for one kernel can be denoted as Occu_i = active warps on an SM / maximum number \
of active warps supported by the SM. \
Est. SM occupancy of the GPU is the weighted average of Occu_i across all kernels<br>'\
    '<b class="bold">Tensor cores ratio:</b><br>'\
    'Sum of kernel time using Tensor Cores / Sum of total kernel time<br>'
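# A hedged sketch of the Est. SM Efficiency / Est. Achieved Occupancy /
# Tensor cores ratio formulas described above; `kernels` is a hypothetical
# list of (duration_ns, blocks_per_sm, occupancy_pct, uses_tensor_core)
# tuples and is not part of this module.
def _device_metrics_example(kernels, profile_step_ns):
    total = sum(d for d, _, _, _ in kernels)
    # weighted sum of SM_Eff_i = min(blocks_per_sm, 100%) over ProfileStep time
    sm_eff = sum(min(b, 1.0) * d for d, b, _, _ in kernels) / profile_step_ns
    # weighted average of Occu_i across all kernels
    occupancy = sum(o / 100.0 * d
                    for d, _, o, _ in kernels) / total if total else 0.0
    # tensor-core kernel time over total kernel time
    tc_ratio = sum(d for d, _, _, tc in kernels
                   if tc) / total if total else 0.0
    return sm_eff, occupancy, tc_ratio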
TOOLTIP_MODEL_PERSPECTIVE_EN = \
'Present CPU and GPU time for each stage of a model, i.e. DataLoader, Forward, Backward, Optimization and Other.<br>'\
    'CPU time is the execution time of the code; GPU time is the calculation time of the kernels launched in the stage.<br>'\
'<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>'\
'<b class="bold">Forward:</b> denote model forward<br>'\
'<b class="bold">Backward:</b> denote gradient back-propagate<br>'\
'<b class="bold">Optimization:</b> denote parameters update<br>'\
'<b class="bold">Other:</b> other time out of above range'
TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN = \
'Present CPU and GPU time in each ProfileStep for each stage of a model, \
i.e. DataLoader, Forward, Backward, Optimization and Other.<br>'\
    'CPU time is the execution time of the code; GPU time is the calculation time of the kernels launched in the stage.<br>'\
'<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>'\
'<b class="bold">Forward:</b> denote model forward<br>'\
'<b class="bold">Backward:</b> denote gradient back-propagate<br>'\
'<b class="bold">Optimization:</b> denote parameters update<br>'\
'<b class="bold">Other:</b> other time out of above range'
TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN = \
'Present the distribution of each kind of events across DataLoader,\
Forward, Backward, Optimization and Other stage.<br>'\
'<b class="bold">Operator:</b> denote operator execution<br>'\
'<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>'\
'<b class="bold">Kernel:</b> denote kernel execution on GPU<br>'\
'<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>'\
'<b class="bold">Memset:</b> denote memory data set on GPU<br>'\
'<b class="bold">UserDefined:</b> denote events defined by users in python script<br>'\
'<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>'\
'<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'
TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN = \
'Present the time of each kind of events included in DataLoader, Forward, Backward, Optimization \
and Other stage.<br>'\
'<b class="bold">Operator:</b> denote operator execution<br>'\
'<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>'\
'<b class="bold">Kernel:</b> denote kernel execution on GPU<br>'\
'<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>'\
'<b class="bold">Memset:</b> denote memory data set on GPU<br>'\
'<b class="bold">UserDefined:</b> denote events defined by users in python script<br>'\
'<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>'\
'<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'
TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN = \
    'Present the time of communication, computation and their overlap in the program.<br>'\
    '<b class="bold">ProfileStep:</b> denote an iteration step of the training process<br>'\
    '<b class="bold">Communication:</b> denote the time related to communication, including events of communication type\
 in the paddle framework, communication-related operators and GPU kernels (nccl)<br>'\
    '<b class="bold">Computation:</b> denote the computation \
time of GPU kernels, except communication-related kernels (nccl)<br>'\
    '<b class="bold">Overlap:</b> denote the overlap time between Communication and \
Computation when they are executed in parallel.<br>'\
    '<b class="bold">Others:</b> denote the time outside Communication and Computation<br>'
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict
from .utils import get_device_nodes
from .utils import intersection_ranges
from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree
_CommunicationOpName = ['allreduce', 'broadcast', 'rpc']
class DistributedParser:
r"""
    Analyse communication and computation time ranges, and their overlap.
    Computation time covers all kernels except communication kernels such as nccl.
"""
def __init__(self):
self.steps_data = defaultdict(lambda: defaultdict(list))
self.calls = defaultdict(lambda: defaultdict(int))
self.steps_time = defaultdict(lambda: defaultdict(float))
self.profile_steps_time = {}
def parse(self, nodetrees):
'''
Collect all communication and computation time ranges.
'''
total_time = 0.0
nodetrees = rebuild_node_trees(nodetrees)
thread2hostnodes = traverse_tree(nodetrees)
thread_count = 0
for threadid, hostnodes in thread2hostnodes.items():
for hostnode in hostnodes[1:]: # skip root node
# case 1: TracerEventType is Communication
if hostnode.type == 'ProfileStep':
if thread_count == 0:
total_time += (hostnode.end_ns - hostnode.start_ns)
self._parse_step(hostnode)
continue
thread_count += 1
new_steps_data = defaultdict(lambda: defaultdict(list))
self.profile_steps_time['All'] = total_time
for step, step_data in self.steps_data.items():
self.calls[step]['cpu_communication_range'] = len(
step_data['cpu_communication_range'])
self.calls[step]['gpu_communication_range'] = len(
step_data['gpu_communication_range'])
new_steps_data[step][
'cpu_communication_range'] = merge_self_ranges(
step_data['cpu_communication_range'], is_sorted=False)
new_steps_data[step][
'gpu_communication_range'] = merge_self_ranges(
step_data['gpu_communication_range'], is_sorted=False)
new_steps_data[step]['communication_range'] = merge_ranges(
new_steps_data[step]['cpu_communication_range'],
new_steps_data[step]['gpu_communication_range'],
is_sorted=True)
new_steps_data[step]['computation_range'] = merge_self_ranges(
step_data['computation_range'], is_sorted=False)
new_steps_data[step]['overlap_range'] = intersection_ranges(
new_steps_data[step]['communication_range'],
new_steps_data[step]['computation_range'],
is_sorted=True)
self.steps_time[step]['communication_time'] = sum_ranges(
new_steps_data[step]['communication_range'])
self.steps_time[step]['computation_time'] = sum_ranges(
new_steps_data[step]['computation_range'])
self.steps_time[step]['overlap_time'] = sum_ranges(
new_steps_data[step]['overlap_range'])
self.steps_time[step]['others_time'] = self.profile_steps_time[
step] - self.steps_time[step][
'communication_time'] - self.steps_time[step][
'computation_time'] + self.steps_time[step][
'overlap_time']
self.steps_data = new_steps_data
def _parse_step(self, profile_step_node):
step = profile_step_node.name.split('#')[1]
self.profile_steps_time[
step] = profile_step_node.end_ns - profile_step_node.start_ns
nodes = []
stack = []
stack.append(profile_step_node)
while stack:
current_node = stack.pop()
nodes.append(current_node)
for childnode in current_node.children_node:
stack.append(childnode)
for hostnode in nodes:
if hostnode.type == 'Communication':
self.steps_data[step]['cpu_communication_range'].append(
(hostnode.start_ns, hostnode.end_ns))
self.steps_data['All']['cpu_communication_range'].append(
(hostnode.start_ns, hostnode.end_ns))
device_nodes = get_device_nodes(hostnode)
for device_node in device_nodes:
if device_node.type == 'Kernel':
self.steps_data[step][
'gpu_communication_range'].append(
(device_node.start_ns, device_node.end_ns))
self.steps_data['All'][
'gpu_communication_range'].append(
(device_node.start_ns, device_node.end_ns))
# case 2: TracerEventType is Operator but is communication op
elif hostnode.type == 'Operator' and any([
name in hostnode.name.lower()
for name in _CommunicationOpName
]):
self.steps_data[step]['cpu_communication_range'].append(
(hostnode.start_ns, hostnode.end_ns))
self.steps_data['All']['cpu_communication_range'].append(
(hostnode.start_ns, hostnode.end_ns))
device_nodes = get_device_nodes(hostnode)
for device_node in device_nodes:
if device_node.type == 'Kernel':
self.steps_data[step][
'gpu_communication_range'].append(
(device_node.start_ns, device_node.end_ns))
self.steps_data['All'][
'gpu_communication_range'].append(
(device_node.start_ns, device_node.end_ns))
# case 3: Others, filter kernels named with nccl
else:
for runtimenode in hostnode.runtime_node:
for devicenode in runtimenode.device_node:
if devicenode.type == 'Kernel':
if 'nccl' in devicenode.name.lower():
self.steps_data[step][
'gpu_communication_range'].append(
(devicenode.start_ns,
devicenode.end_ns))
self.steps_data['All'][
'gpu_communication_range'].append(
(devicenode.start_ns,
devicenode.end_ns))
else:
self.steps_data[step][
'computation_range'].append(
(devicenode.start_ns,
devicenode.end_ns))
self.steps_data['All'][
'computation_range'].append(
(devicenode.start_ns,
devicenode.end_ns))
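# A hedged, self-contained sketch of the range arithmetic this parser relies
# on; the real merge_ranges/intersection_ranges/sum_ranges live in .utils,
# and these toy versions only illustrate the semantics:
def _union_example(ranges):
    merged = []
    for start, end in sorted(ranges):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def _intersect_example(a, b):
    return [(max(s1, s2), min(e1, e2)) for s1, e1 in a for s2, e2 in b
            if max(s1, s2) < min(e1, e2)]


# e.g. communication ranges [(0, 4), (6, 9)] and computation ranges [(2, 7)]
# intersect on [(2, 4), (6, 7)], so 3 time units are reported as Overlap.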
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
import functools
import json
import re
import sys
_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')
host_node_type_map = {
"Operator", "Dataloader", "ProfileStep", "CudaRuntime", "UserDefined",
"OperatorInner", "Forward", "Backward", "Optimization", "Communication",
"PythonOp", "PythonUserDefined"
}
device_node_type_map = {"Kernel", "Memcpy", "Memset"}
memory_node_event_map = {
"Allocate", "Free", "ReservedAllocate", "ReservedFree"
}
class HostNode:
def __init__(self):
self.name = None
self.type = None
self.start_ns = 0
self.end_ns = 0
self.process_id = 0
self.thread_id = 0
self.correlation_id = -1
self.input_shapes = {}
self.dtypes = {}
self.callstack = ""
self.children_node = []
self.runtime_node = []
self.device_node = []
self.mem_node = []
@classmethod
def from_json(cls, json_obj):
self = cls()
self.name = json_obj['name'].replace(
_show_name_pattern.match(json_obj['name']).group(2), "")
self.type = json_obj['cat']
self.start_ns = int(
float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
self.end_ns = int(
float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
self.process_id = json_obj['pid']
self.thread_id = json_obj['tid'].replace(
_show_tid_pattern.match(json_obj['tid']).group(1), "")
self.correlation_id = json_obj['args'][
'correlation id'] if 'correlation id' in json_obj['args'] else -1
self.input_shapes = json_obj['args'][
'input_shapes'] if 'input_shapes' in json_obj['args'] else {}
self.dtypes = json_obj['args'][
'input_dtypes'] if 'input_dtypes' in json_obj['args'] else {}
self.callstack = json_obj['args'][
'callstack'] if 'callstack' in json_obj['args'] else ""
self.children_node = []
self.runtime_node = []
self.device_node = []
self.mem_node = []
return self
class MemNode:
def __init__(self):
self.type = None
self.timestamp_ns = 0
self.addr = 0
self.process_id = 0
self.thread_id = 0
self.increase_bytes = 0
self.place = None
self.current_allocated = 0
self.current_reserved = 0
self.peak_allocated = 0
self.peak_reserved = 0
@classmethod
def from_json(cls, json_obj):
self = cls()
self.type = json_obj['cat']
self.timestamp_ns = json_obj['ts'] * 1000
self.addr = hex(int(
json_obj['args']['addr'])) if 'addr' in json_obj['args'] else 0
self.process_id = json_obj['pid']
self.thread_id = json_obj['tid'].replace(
_show_tid_pattern.match(json_obj['tid']).group(1), "")
self.increase_bytes = json_obj['args'][
'increase_bytes'] if 'increase_bytes' in json_obj['args'] else 0
self.place = json_obj['args']['place'] if 'place' in json_obj[
'args'] else "Place(cpu)"
self.current_allocated = json_obj['args'][
'current_allocated'] if 'current_allocated' in json_obj[
'args'] else 0
self.current_reserved = json_obj['args'][
'current_reserved'] if 'current_reserved' in json_obj['args'] else 0
self.peak_allocated = json_obj['args'][
'peak_allocated'] if 'peak_allocated' in json_obj['args'] else 0
self.peak_reserved = json_obj['args'][
'peak_reserved'] if 'peak_reserved' in json_obj['args'] else 0
return self
class DeviceNode:
def __init__(self):
self.name = None
self.type = None
self.start_ns = 0
self.end_ns = 0
self.device_id = 0
self.stream_id = 0
self.context_id = 0
self.correlation_id = 0
self.block_x, self.block_y, self.block_z = [0, 0, 0]
self.grid_x, self.grid_y, self.grid_z = [0, 0, 0]
self.shared_memory = 0
self.registers_per_thread = 0
self.num_bytes = 0
self.value = 0
self.occupancy = 0
self.blocks_per_sm = 0
self.warps_per_sm = 0
@classmethod
def from_json(cls, json_obj):
self = cls()
self.name = json_obj['name'].replace(
_show_name_pattern.match(json_obj['name']).group(2), "")
self.type = json_obj['cat']
self.start_ns = int(
float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
self.end_ns = int(
float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
self.device_id = json_obj['pid']
self.stream_id = json_obj['tid']
self.context_id = json_obj['args']['context'] if 'context' in json_obj[
'args'] else 0
self.correlation_id = json_obj['args']['correlation id']
self.block_x, self.block_y, self.block_z = json_obj['args'][
'block'] if 'block' in json_obj['args'] else [0, 0, 0]
self.grid_x, self.grid_y, self.grid_z = json_obj['args'][
'grid'] if 'grid' in json_obj['args'] else [0, 0, 0]
self.shared_memory = json_obj['args'][
'shared memory'] if 'shared memory' in json_obj['args'] else 0
self.registers_per_thread = json_obj['args'][
'registers per thread'] if 'registers per thread' in json_obj[
'args'] else 0
self.num_bytes = json_obj['args']['bytes'] if 'bytes' in json_obj[
'args'] else 0
self.value = json_obj['args']['value'] if 'value' in json_obj[
'args'] else 0
self.occupancy = json_obj['args'][
'theoretical achieved occupancy %'] if 'theoretical achieved occupancy %' in json_obj[
'args'] else 0
self.blocks_per_sm = json_obj['args'][
"blocks per SM"] if "blocks per SM" in json_obj['args'] else 0
self.warps_per_sm = json_obj['args'][
"warps per SM"] if "warps per SM" in json_obj['args'] else 0
return self
class ProfilerResult:
    def __init__(self, json_data):
        self.device_infos = None
        self.span_idx = None
        self.data = None
        self.extra_info = None
        self.schema_version = None
        self.has_hostnodes = True
        self.has_devicenodes = True
        self.has_memnodes = True
        # initialized before parse(), which fills it from the first host event
        self.start_in_timeline_ns = None
        self.parse(json_data)
        self.content = json_data
def parse(self, json_data):
self.schema_version = json_data['schemaVersion']
self.span_idx = json_data['span_indx']
self.device_infos = {
device_info['id']: device_info
for device_info in json_data['deviceProperties']
}
hostnodes = []
runtimenodes = []
devicenodes = []
memnodes = []
for event in json_data['traceEvents']:
if not event or (event['ph'] != 'X' and event['ph'] != 'i'):
continue
if event['cat'] in host_node_type_map:
if event['cat'] == 'CudaRuntime' or event[
'cat'] == 'MluRuntime':
runtimenodes.append(HostNode.from_json(event))
else:
hostnodes.append(HostNode.from_json(event))
if hostnodes[-1].start_ns == 0:
self.start_in_timeline_ns = int(event['ts']) * 1000
elif event['cat'] in device_node_type_map:
devicenodes.append(DeviceNode.from_json(event))
elif event['cat'] in memory_node_event_map:
memnodes.append(MemNode.from_json(event))
if memnodes:
for memnode in memnodes:
assert self.start_in_timeline_ns is not None
memnode.timestamp_ns = memnode.timestamp_ns - self.start_in_timeline_ns
if not hostnodes:
self.has_hostnodes = False
if not devicenodes:
self.has_devicenodes = False
if not memnodes:
self.has_memnodes = False
self.data = self.build_tree(hostnodes, runtimenodes, devicenodes,
memnodes)
self.extra_info = json_data['ExtraInfo']
def build_tree( # noqa: C901
self, hostnodes, runtimenodes, devicenodes, memnodes):
thread2host_event_nodes = collections.defaultdict(list)
thread2runtime_event_nodes = collections.defaultdict(list)
thread2mem_event_nodes = collections.defaultdict(list)
correlation_id2runtime_event_node = {}
thread_event_trees = {}
thread_ids = set()
for hostnode in hostnodes:
thread2host_event_nodes[hostnode.thread_id].append(hostnode)
thread_ids.add(hostnode.thread_id)
# construct thread2runtime_event_nodes and correlation_id2runtime_event_node
for runtimenode in runtimenodes:
thread2runtime_event_nodes[runtimenode.thread_id].append(
runtimenode)
thread_ids.add(runtimenode.thread_id)
correlation_id2runtime_event_node[
runtimenode.correlation_id] = runtimenode
# associate CudaRuntimeTraceEventNode and DeviceTraceEventNode
# construct correlation_id2device_event_nodes
for devicenode in devicenodes:
if devicenode.correlation_id not in correlation_id2runtime_event_node:
continue
runtimenode = correlation_id2runtime_event_node[
devicenode.correlation_id]
runtimenode.device_node.append(devicenode)
# construct thread2mem_event_nodes
for memnode in memnodes:
thread2mem_event_nodes[memnode.thread_id].append(memnode)
        # Sort host event nodes and runtime event nodes by start_ns and
        # end_ns: the smaller the start_ns, the earlier the position; when
        # two nodes share the same start_ns, the one with the larger end_ns
        # comes first.
def compare_hostnode_func(hostnode1, hostnode2):
if hostnode1.start_ns < hostnode2.start_ns:
return -1
if hostnode1.start_ns == hostnode2.start_ns:
if hostnode1.end_ns > hostnode2.end_ns:
return -1
return 1
def compare_memnode_func(memnode1, memnode2):
if memnode1.timestamp_ns <= memnode2.timestamp_ns:
return -1
return 1
for threadid, hostnodes in thread2host_event_nodes.items():
thread2host_event_nodes[threadid] = sorted(
hostnodes, key=functools.cmp_to_key(compare_hostnode_func))
for threadid, runtimenodes in thread2runtime_event_nodes.items():
thread2runtime_event_nodes[threadid] = sorted(
runtimenodes, key=functools.cmp_to_key(compare_hostnode_func))
for threadid, memnodes in thread2mem_event_nodes.items():
thread2mem_event_nodes[threadid] = sorted(
memnodes, key=functools.cmp_to_key(compare_memnode_func))
# construct trees
for threadid in thread_ids:
thread_event_trees[threadid] = self._build_tree_relationship(
thread2host_event_nodes[threadid],
thread2runtime_event_nodes[threadid],
thread2mem_event_nodes[threadid])
return thread_event_trees
def _build_tree_relationship( # noqa: C901
self, host_event_nodes, runtime_event_nodes, mem_event_nodes):
# root node
root_node = HostNode()
root_node.name, root_node.type, root_node.start_ns, root_node.end_ns = "root node", "UserDefined", \
0, sys.maxsize
# push root node into node_stack
node_stack = []
node_stack.append(root_node)
# handle host_event_nodes
for host_node in host_event_nodes:
while True:
stack_top_node = node_stack[-1]
if host_node.start_ns < stack_top_node.end_ns:
stack_top_node.children_node.append(host_node)
node_stack.append(host_node)
break
else:
node_stack.pop()
                    # insert runtime nodes
                    # select runtime nodes whose time range falls within stack_top_node
hasenter = False
firstposition = 0
lastposition = len(runtime_event_nodes)
for i, runtimenode in enumerate(runtime_event_nodes):
if runtimenode.start_ns >= stack_top_node.start_ns and \
runtimenode.end_ns <= stack_top_node.end_ns:
if not hasenter:
firstposition = i
hasenter = True
stack_top_node.runtime_node.append(runtimenode)
else:
                            # from this runtime node on, nodes are no longer
                            # within stack_top_node; erase the consumed ones
                            # from runtime_event_nodes
if runtimenode.start_ns > stack_top_node.end_ns:
lastposition = i
break
if hasenter:
del runtime_event_nodes[firstposition:lastposition]
        # insert the remaining runtime nodes into the nodes still on the stack
while node_stack:
stack_top_node = node_stack.pop()
            # insert runtime nodes
            # select runtime nodes whose time range falls within stack_top_node
firstposition = 0
lastposition = len(runtime_event_nodes)
hasenter = False
for i, runtimenode in enumerate(runtime_event_nodes):
if runtimenode.start_ns >= stack_top_node.start_ns and runtimenode.end_ns <= stack_top_node.end_ns:
if not hasenter:
firstposition = i
hasenter = True
stack_top_node.runtime_node.append(runtimenode)
else:
                    # from this runtime node on, nodes are no longer within
                    # stack_top_node; erase the consumed ones from
                    # runtime_event_nodes
if runtimenode.start_ns > stack_top_node.end_ns:
lastposition = i
break
if hasenter:
del runtime_event_nodes[firstposition:lastposition]
        # build relationships between host event nodes and memory event nodes:
        # first traverse the tree in post order, then insert each memory event
        # node into the correct host node.
stack = []
flag_stack = []
post_order_nodes = []
stack.append(root_node)
flag_stack.append(0)
while stack:
current_node = stack.pop()
flag = flag_stack.pop()
if flag == 0:
stack.append(current_node)
flag_stack.append(1)
for child in current_node.children_node[::-1]:
stack.append(child)
flag_stack.append(0)
else:
post_order_nodes.append(current_node)
for node in post_order_nodes:
hasenter = False
firstposition = 0
lastposition = len(mem_event_nodes)
for i, mem_node in enumerate(mem_event_nodes):
if mem_node.timestamp_ns >= node.start_ns and mem_node.timestamp_ns <= node.end_ns:
node.mem_node.append(mem_node)
if not hasenter:
firstposition = i
hasenter = True
else:
if mem_node.timestamp_ns > node.end_ns:
lastposition = i
break
if hasenter:
del mem_event_nodes[firstposition:lastposition]
return root_node
def get_data(self):
return self.data
def get_extra_info(self):
return self.extra_info
def get_schema_version(self):
return self.schema_version
def get_device_infos(self):
return self.device_infos
def get_span_idx(self):
return self.span_idx
def has_device(self):
return self.has_devicenodes
def has_host(self):
return self.has_hostnodes
def has_memory(self):
return self.has_memnodes
def save(self, path, format):
pass
def load_profiler_json(file_name):
    with open(file_name, 'r') as f:
        content = json.load(f)
    return ProfilerResult(content)
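# A minimal usage sketch (the file path is hypothetical); the input is
# expected to be a Chrome-trace style JSON exported by paddle.profiler:
if __name__ == '__main__':
    result = load_profiler_json('./profiler_log/worker_0.json')
    trees = result.get_data()  # thread id -> root HostNode for that thread
    for tid, root in trees.items():
        print(tid, [child.name for child in root.children_node])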
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
class TC_Allowlist(dict):
# Refer to https://github.com/NVIDIA/PyProf/blob/fd1b2902e3306119eee40ba6b6e8b2f816920c29/pyprof/prof/tc.py#L19
allowlist = [
'h884', 's884', 'h1688', 's1688', 'hmma', 'i8816', '16816',
'dgrad_1x1_stride_2x2', 'first_layer_wgrad_kernel', 'conv1x1',
'conv2d_c1_k1', 'direct_group', 'xmma_implicit_gemm',
'xmma_sparse_conv', 'xmma_warp_specialized_implicit_gemm', 'xmma_gemm',
'xmma_sparse_gemm', 'c1688'
]
def __init__(self):
pass
def __contains__(self, item):
        # If the kernel name contains any substring from the allowlist, the kernel uses tensor cores.
for pattern in self.allowlist:
if pattern in item:
return True
return False
_allow_list = TC_Allowlist()
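# For example (hypothetical kernel names): 'volta_h884gemm_64x64_nt' contains
# the 'h884' pattern, so `'volta_h884gemm_64x64_nt' in _allow_list` is True,
# while 'vectorized_elementwise_kernel' matches nothing and yields False.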
class DeviceItem:
def __init__(self, name):
self.name = name
self.call = 0
self.gpu_time = 0
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
        self.tensorcore_used = name in _allow_list
self.sum_blocks_per_sm = 0.0
self.sum_occupancy = 0.0
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_item(self, node):
self.call += 1
self.add_gpu_time(node.end_ns - node.start_ns)
self.sum_blocks_per_sm += node.blocks_per_sm
self.sum_occupancy += node.occupancy
class KernelParser:
def __init__(self):
self.kernel_items = {} # for kernel summary
self.kernel_items_with_op_name_attributes = collections.defaultdict(
dict)
self.gpu_ids = set()
self.occupancy = 0.0
self.sm_efficiency = 0.0
self.tensor_core_ratio = 0.0
def parse(self, nodelists): # noqa: C901
total_duration = 0.0
weighted_occupancy = 0.0
weighted_sm_efficiency = 0.0
for threadid, nodes in nodelists.items():
for node in nodes:
if node.type == 'Operator':
op_name = node.name
for children in node.children_node:
if children.type == 'OperatorInner':
for runtime_node in children.runtime_node:
for device_node in runtime_node.device_node:
if device_node.type == 'Kernel':
op_attribute_name = self._translate_op_name_attributes_to_string(
op_name, device_node)
if op_attribute_name not in self.kernel_items_with_op_name_attributes[
device_node.name]:
self.kernel_items_with_op_name_attributes[
device_node.name][
op_attribute_name] = DeviceItem(
device_node.name)
self.kernel_items_with_op_name_attributes[
device_node.
name][op_attribute_name].add_item(
device_node)
for runtime_node in node.runtime_node:
for device_node in runtime_node.device_node:
if device_node.type == 'Kernel':
op_attribute_name = self._translate_op_name_attributes_to_string(
op_name, device_node)
if op_attribute_name not in self.kernel_items_with_op_name_attributes[
device_node.name]:
self.kernel_items_with_op_name_attributes[
device_node.
name][op_attribute_name] = DeviceItem(
device_node.name)
self.kernel_items_with_op_name_attributes[
device_node.
name][op_attribute_name].add_item(
device_node)
elif node.type == 'OperatorInner':
continue
op_name = node.name
for runtime_node in node.runtime_node:
for device_node in runtime_node.device_node:
if device_node.type == 'Kernel':
op_attribute_name = self._translate_op_name_attributes_to_string(
op_name, device_node)
if op_attribute_name not in self.kernel_items_with_op_name_attributes[
device_node.name]:
self.kernel_items_with_op_name_attributes[
device_node.
name][op_attribute_name] = DeviceItem(
device_node.name)
self.kernel_items_with_op_name_attributes[
device_node.name][op_attribute_name].add_item(
device_node)
for threadid, nodes in nodelists.items():
for node in nodes:
for runtime_node in node.runtime_node:
for device_node in runtime_node.device_node:
if device_node.type == 'Kernel':
name = device_node.name
if name not in self.kernel_items:
self.kernel_items[name] = DeviceItem(name)
self.kernel_items[name].add_item(device_node)
weighted_occupancy += (
device_node.occupancy / 100) * (
device_node.end_ns - device_node.start_ns)
if device_node.blocks_per_sm > 1:
sm_efficiency = 1
else:
sm_efficiency = device_node.blocks_per_sm
weighted_sm_efficiency += sm_efficiency * (
device_node.end_ns - device_node.start_ns)
total_duration += (
device_node.end_ns - device_node.start_ns)
self.gpu_ids.add(device_node.device_id)
self.occupancy = weighted_occupancy / total_duration if total_duration != 0 else 0.0
self.sm_efficiency = weighted_sm_efficiency # to divide ProfileStep time in ProfileData
total_time = 0
total_tensorcore_time = 0
for name, node in self.kernel_items.items():
if node.tensorcore_used:
total_tensorcore_time += node.gpu_time
total_time += node.gpu_time
self.tensor_core_ratio = total_tensorcore_time / total_time if total_time != 0 else 0.0
def _translate_op_name_attributes_to_string(self, op_name, event):
result = '{}-[{},{},{}]-[{},{},{}]-{}-{}'.format(
op_name, event.grid_x, event.grid_y, event.grid_z, event.block_x,
event.block_y, event.block_z, event.registers_per_thread,
event.shared_memory)
return result
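# Hypothetical example of the grouping key built above: op 'conv2d' launching
# a kernel with grid [128, 1, 1], block [256, 1, 1], 32 registers per thread
# and 0 bytes of shared memory is keyed as 'conv2d-[128,1,1]-[256,1,1]-32-0'.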
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
from .utils import traverse_tree
class MemoryItem:
def __init__(self, event_name, place, memory_type='Allocated'):
self.event_name = event_name
self.place = place
self.allocation_count = 0
self.free_count = 0
self.allocation_size = 0
self.free_size = 0
self.increase_size = 0
self.memory_type = memory_type
def add_memory_record(self, size, allocation_type):
if allocation_type == 'Allocate' or allocation_type == 'ReservedAllocate':
self.allocation_count += 1
self.allocation_size += size
elif allocation_type == 'Free' or allocation_type == 'ReservedFree':
self.free_count += 1
            self.free_size -= size  # size is negative for free events
else:
print("No corresponding type.")
self.increase_size = self.allocation_size - self.free_size
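# A minimal sketch (hypothetical event name and sizes, in bytes): a Free
# arrives with a negative size, so one Allocate of 1024 plus its matching
# Free nets increase_size back to zero:
#     item = MemoryItem('conv2d', 'Place(gpu:0)')
#     item.add_memory_record(1024, 'Allocate')
#     item.add_memory_record(-1024, 'Free')
#     assert item.increase_size == 0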
class MemoryParser:
def __init__(self):
self.allocated_items = collections.defaultdict(
dict) # for memory summary, device type: event
self.reserved_items = collections.defaultdict(
dict) # for memory summary, device type: event
self.peak_allocation_values = collections.defaultdict(int)
self.peak_reserved_values = collections.defaultdict(int)
self.memory_events = collections.defaultdict(lambda: collections.
defaultdict(list))
self.memory_curve = collections.defaultdict(lambda: collections.
defaultdict(list))
self.paired_events = collections.defaultdict(list)
self.size_ranges = {}
def parse(self, nodetrees): # noqa: C901
r"""
        Analyse memory events in the node trees.
"""
thread2hostnodes = traverse_tree(nodetrees)
for threadid, host_nodes in thread2hostnodes.items():
for host_node in host_nodes[1:]: # skip root node
if host_node.type == 'OperatorInner':
continue
if host_node.type == 'Operator':
for child in host_node.children_node:
self._analyse_node_memory(host_node.name, child)
self._analyse_node_memory(host_node.name, host_node)
# pair for memory events
for device_type, memory_events in self.memory_events.items():
max_size = 0
for (addr, memory_type), memory_lists in memory_events.items():
memory_lists = sorted(memory_lists, key=lambda x: x[0])
paired_results = []
for memory_list in memory_lists:
timestamp, memory_type, hostnodename, size = memory_list
if memory_type == 'Allocate' or memory_type == 'ReservedAllocate':
if size > max_size:
max_size = size
if memory_type == 'Allocate':
paired_results.append([
addr, 'Allocated', hostnodename, timestamp,
None, None, size
])
else:
paired_results.append([
addr, 'ReservedAllocate', hostnodename,
timestamp, None, None, size
])
elif memory_type == 'Free' or memory_type == 'ReservedFree':
if -size > max_size:
max_size = -size
if paired_results:
if paired_results[-1][-3] is None:
paired_results[-1][-3] = hostnodename
paired_results[-1][-2] = timestamp
self.paired_events[device_type].append(
paired_results.pop())
else:
if memory_type == 'Free':
paired_results.append([
addr, 'Allocated', None, None,
hostnodename, timestamp, -size
])
else:
paired_results.append([
addr, 'ReservedAllocate', None, None,
hostnodename, timestamp, -size
])
self.paired_events[device_type].append(
paired_results.pop())
else:
if memory_type == 'Free':
paired_results.append([
addr, 'Allocated', None, None,
hostnodename, timestamp, -size
])
else:
paired_results.append([
addr, 'ReservedAllocate', None, None,
hostnodename, timestamp, -size
])
self.paired_events[device_type].append(
paired_results.pop())
self.paired_events[device_type].extend(paired_results)
self.size_ranges[device_type] = (0, max_size)
def _analyse_node_memory(self, event_name, node):
for memnode in node.mem_node: # self mem node
if memnode.type == 'Allocate' or memnode.type == 'Free':
if event_name not in self.allocated_items[memnode.place]:
self.allocated_items[
memnode.place][event_name] = MemoryItem(
event_name, memnode.place, 'Allocated')
self.allocated_items[
memnode.place][event_name].add_memory_record(
memnode.increase_bytes, memnode.type)
self.memory_events[memnode.place][(memnode.addr,
'Allocated')].append([
memnode.timestamp_ns,
memnode.type,
event_name,
memnode.increase_bytes
])
elif memnode.type == 'ReservedAllocate' or memnode.type == 'ReservedFree':
if event_name not in self.reserved_items[memnode.place]:
self.reserved_items[
memnode.place][event_name] = MemoryItem(
event_name, memnode.place, 'Reserved')
self.reserved_items[
memnode.place][event_name].add_memory_record(
memnode.increase_bytes, memnode.type)
self.memory_events[memnode.place][(memnode.addr,
"Reserved")].append([
memnode.timestamp_ns,
memnode.type,
event_name,
memnode.increase_bytes
])
self.memory_curve[memnode.place]['Allocated'].append(
(memnode.timestamp_ns, memnode.current_allocated, event_name))
self.memory_curve[memnode.place]['Reserved'].append(
(memnode.timestamp_ns, memnode.current_reserved, event_name))
self.memory_curve[memnode.place]['PeakAllocated'].append(
(memnode.timestamp_ns, memnode.peak_allocated, event_name))
self.memory_curve[memnode.place]['PeakReserved'].append(
(memnode.timestamp_ns, memnode.peak_reserved, event_name))
self.peak_allocation_values[memnode.place] = max(
self.peak_allocation_values[memnode.place],
memnode.peak_allocated)
self.peak_reserved_values[memnode.place] = max(
self.peak_reserved_values[memnode.place],
memnode.peak_reserved)
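# A hedged sketch of the pairing above (hypothetical values): an Allocate at
# t=10 on some addr with its Free at t=50 becomes one paired event
# [addr, 'Allocated', alloc_event_name, 10, free_event_name, 50, size], while
# a Free whose Allocate happened before profiling started is emitted with the
# allocation side left as None.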
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
from .kernel_parser import DeviceItem
from .utils import wrap_tree
class OperatorItem:
def __init__(self, name):
self.name = name
self.call = 0
self.cpu_time = 0
self.gpu_time = 0
self.max_cpu_time = 0
self.min_cpu_time = float('inf')
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
self.devices = {}
self.operator_inners = {}
self.general_gpu_time = 0
self.min_general_gpu_time = float('inf')
self.max_general_gpu_time = 0
@property
def avg_cpu_time(self):
return self.cpu_time / self.call
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
@property
def avg_general_gpu_time(self):
return self.general_gpu_time / self.call
def add_cpu_time(self, time):
if time > self.max_cpu_time:
self.max_cpu_time = time
if time < self.min_cpu_time:
self.min_cpu_time = time
self.cpu_time += time
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_general_gpu_time(self, time):
if time > self.max_general_gpu_time:
self.max_general_gpu_time = time
if time < self.min_general_gpu_time:
self.min_general_gpu_time = time
self.general_gpu_time += time
def add_call(self):
self.call += 1
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
self.add_gpu_time(node.gpu_time)
self.add_general_gpu_time(node.general_gpu_time)
for child in node.children_node:
if child.type != 'Operator':
if child.name not in self.operator_inners:
self.operator_inners[child.name] = OperatorItem(child.name)
self.operator_inners[child.name].add_item(child)
for runtimenode in node.runtime_node:
for devicenode in runtimenode.device_node:
name = devicenode.name
if name not in self.devices:
self.devices[name] = DeviceItem(name)
self.devices[name].add_item(devicenode)
class OperatorParser:
r"""
    Analyse operator events in the profiling data and correlate them with their device events.
"""
def __init__(self):
self.items = {} # for operator summary
self.items_with_input_shape = collections.defaultdict(dict)
def parse(self, nodetrees):
r"""
        Analyse operator events in the node trees.
"""
node_statistic_trees, thread2host_statistic_nodes = wrap_tree(
nodetrees)
for threadid, host_statistic_nodes in thread2host_statistic_nodes.items(
):
for host_statistic_node in host_statistic_nodes[
1:]: # skip root node
if host_statistic_node.type == 'Operator':
self.add_operator_item(host_statistic_node)
def add_operator_item(self, operator_node):
if operator_node.name not in self.items:
self.items[operator_node.name] = OperatorItem(operator_node.name)
input_shape_str = self._translate_op_input_shape_to_string(
operator_node.input_shapes)
if input_shape_str not in self.items_with_input_shape[
operator_node.name]:
self.items_with_input_shape[
operator_node.name][input_shape_str] = OperatorItem(
operator_node.name)
self.items[operator_node.name].add_item(operator_node)
self.items_with_input_shape[
operator_node.name][input_shape_str].add_item(operator_node)
def _translate_op_input_shape_to_string(self, input_shape):
result = ''
for arg, shape in input_shape.items():
result += '{}-{}\t'.format(arg, shape)
return result
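# Hypothetical example of the grouping key built above: input_shape
# {'X': [[4, 16, 16]], 'Y': [[16, 32]]} becomes 'X-[[4, 16, 16]]\tY-[[16, 32]]\t'.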
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree
StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']
CPUType = [
'Operator', 'CudaRuntime', 'UserDefined', 'OperatorInner', 'Communication',
'PythonOp', 'PythonUserDefined', 'MluRuntime'
]
GPUType = ['Kernel', 'Memcpy', 'Memset']
class GeneralItem:
def __init__(self, name):
self.name = name
self.call = 0
self.cpu_time = 0
self.max_cpu_time = 0
self.min_cpu_time = float('inf')
self.gpu_time = 0
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
self.general_gpu_time = 0
self.min_general_gpu_time = float('inf')
self.max_general_gpu_time = 0
@property
def avg_cpu_time(self):
return self.cpu_time / self.call
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
@property
def avg_general_gpu_time(self):
return self.general_gpu_time / self.call
def add_cpu_time(self, time):
if time > self.max_cpu_time:
self.max_cpu_time = time
if time < self.min_cpu_time:
self.min_cpu_time = time
self.cpu_time += time
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
self.gpu_time += time
def add_general_gpu_time(self, time):
if time > self.max_general_gpu_time:
self.max_general_gpu_time = time
if time < self.min_general_gpu_time:
self.min_general_gpu_time = time
self.general_gpu_time += time
def add_call(self):
self.call += 1
def add_item(self, node):
self.add_call()
self.add_cpu_time(node.cpu_time)
self.add_gpu_time(node.gpu_time)
self.add_general_gpu_time(node.general_gpu_time)
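# Illustrative usage of GeneralItem (a sketch; FakeNode is a stand-in for the
# real profiler node type, which exposes the same cpu_time / gpu_time /
# general_gpu_time attributes):
#
#     from collections import namedtuple
#     FakeNode = namedtuple(
#         'FakeNode', ['cpu_time', 'gpu_time', 'general_gpu_time'])
#     item = GeneralItem('matmul')
#     item.add_item(FakeNode(cpu_time=3.0, gpu_time=2.0, general_gpu_time=2.5))
#     item.add_item(FakeNode(cpu_time=5.0, gpu_time=4.0, general_gpu_time=4.5))
#     item.call          # 2
#     item.avg_cpu_time  # 4.0
#     item.max_gpu_time  # 4.0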
class ModelPerspectiveItem:
def __init__(self, name):
self.name = name
self.call = 0
self.cpu_time = 0
self.max_cpu_time = 0
self.min_cpu_time = float('inf')
self.gpu_time = 0
self.max_gpu_time = 0
self.min_gpu_time = float('inf')
self.cpu_times = {}
self.gpu_times = {}
@property
def avg_cpu_time(self):
return self.cpu_time / self.call
@property
def avg_gpu_time(self):
return self.gpu_time / self.call
def add_call(self):
self.call += 1
def add_cpu_time(self, time):
self.add_call()
if time > self.max_cpu_time:
self.max_cpu_time = time
if time < self.min_cpu_time:
self.min_cpu_time = time
self.cpu_time += time
def add_gpu_time(self, time):
if time > self.max_gpu_time:
self.max_gpu_time = time
if time < self.min_gpu_time:
self.min_gpu_time = time
def set_gpu_time(self, time):
'''
        Use this to set the total gpu time, since the gpu time accumulated by add_gpu_time may include overlap.
'''
self.gpu_time = time
class OverviewParser:
r"""
Analyse time ranges for each TracerEventType, and summarize the time.
"""
def __init__(self):
# event name: GeneralItem
self.memory_manipulation_items = {} # for memory manipulation summary
self.userdefined_items = {} # for userdefined summary
self.model_perspective_items = {}
# phase name:
# device name:
# stage idx:
# thread name:
# event type:
# {"events" :[], "times": []}
self.events_per_stage = collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.defaultdict(
lambda: collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.
defaultdict(list))))))
# phase name:
# device name:
# stage idx:
# event type:
# { "calls" :[], "times": [], "total_time": 0 }
self.merged_events_per_stage = collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.defaultdict(
lambda: collections.defaultdict(lambda: collections.
defaultdict(list)))))
self.stage_nums = 0
self.gpu_ulitization = 0.0
self.has_forward = False
self.has_device = False
def parse(self, nodetrees): # noqa: C901
r"""
        Analyse the node trees in the profiler result, and get time ranges for each tracer event type.
"""
self._parse_events(nodetrees)
        # count calls and merge time ranges to get the time summary
for stage_name, stage_data in self.events_per_stage.items():
for device_name, steps_data in stage_data.items():
for step_idx, thread_data in steps_data.items():
for thread_id, events in thread_data.items():
for event_type, events_data in events.items():
if 'calls' not in self.merged_events_per_stage[
stage_name][device_name][step_idx][
event_type]:
self.merged_events_per_stage[stage_name][
device_name][step_idx][event_type][
'calls'] = 0
if 'total_time' not in self.merged_events_per_stage[
stage_name][device_name][step_idx][
event_type]:
self.merged_events_per_stage[stage_name][
device_name][step_idx][event_type][
'total_time'] = 0
events_data['times'] = merge_self_ranges(
events_data['times'], is_sorted=False)
self.merged_events_per_stage[stage_name][
device_name][step_idx][event_type][
'calls'] += len(events_data['events'])
self.merged_events_per_stage[stage_name][device_name][step_idx][event_type]['times'] =\
merge_ranges(
self.merged_events_per_stage[stage_name][device_name][step_idx][event_type]['times'],
events_data['times'], is_sorted=True)
# merge different stages into profile step
stage_names = list(self.merged_events_per_stage.keys())
        self.merged_events_per_stage['ProfileStep']  # defaultdict access creates the 'ProfileStep' entry
for stage_name in stage_names:
stage_data = self.merged_events_per_stage[stage_name]
for device_name, steps_data in stage_data.items():
for step_idx, events in steps_data.items():
for event_type, events_data in events.items():
events_data['total_time'] = sum_ranges(
events_data['times'])
if 'calls' not in self.merged_events_per_stage[
'ProfileStep'][device_name][step_idx][
event_type]:
self.merged_events_per_stage['ProfileStep'][
device_name][step_idx][event_type]['calls'] = 0
if 'total_time' not in self.merged_events_per_stage[
'ProfileStep'][device_name][step_idx][
event_type]:
self.merged_events_per_stage['ProfileStep'][
device_name][step_idx][event_type][
'total_time'] = 0
self.merged_events_per_stage['ProfileStep'][
device_name][step_idx][event_type][
'calls'] += events_data['calls']
self.merged_events_per_stage['ProfileStep'][
device_name][step_idx][event_type][
'total_time'] += events_data['total_time']
self.merged_events_per_stage['ProfileStep'][
device_name][step_idx][event_type][
'times'] = merge_ranges(
self.merged_events_per_stage['ProfileStep']
[device_name][step_idx][event_type]
['times'],
events_data['times'],
is_sorted=True)
# add gpu time for model perspective summary
for stage_name, stage_data in self.merged_events_per_stage.items():
for device_name, steps_data in stage_data.items():
for step_idx, events in steps_data.items():
if 'Kernel' in events:
if step_idx == 'ALL':
self.model_perspective_items[
stage_name].set_gpu_time(
events['Kernel']['total_time'])
continue
self.model_perspective_items[stage_name].add_gpu_time(
events['Kernel']['total_time'])
self.model_perspective_items[stage_name].gpu_times[
step_idx] = events['Kernel']['total_time']
if self.has_device:
self.gpu_ulitization = self.merged_events_per_stage['ProfileStep'][
'GPU']['ALL']['Kernel'][
'total_time'] / self.model_perspective_items[
'ProfileStep'].cpu_time
def _fill_stage_events( # noqa: C901
self, node, stage_idx, should_recursive=True):
if node.type == 'Forward':
stage_name = 'Forward'
self.has_forward = True
elif node.type == 'Backward':
stage_name = 'Backward'
elif node.type == 'Optimization':
stage_name = 'Optimization'
elif node.type == 'Dataloader':
stage_name = 'Dataloader'
else:
stage_name = 'Other'
if should_recursive:
stack = []
if node.type in StageType:
for children in node.children_node:
stack.append(children)
else:
stack.append(node)
while stack:
current_node = stack.pop()
for childnode in current_node.children_node:
stack.append(childnode)
for runtimenode in current_node.runtime_node:
self.events_per_stage[stage_name]["CPU"][stage_idx][
runtimenode.thread_id][
runtimenode.type]['events'].append(runtimenode)
self.events_per_stage[stage_name]["CPU"][stage_idx][
runtimenode.thread_id][
runtimenode.type]['times'].append(
(runtimenode.start_ns, runtimenode.end_ns))
self.events_per_stage[stage_name]["CPU"]['ALL'][
runtimenode.thread_id][
runtimenode.type]['events'].append(runtimenode)
self.events_per_stage[stage_name]["CPU"]['ALL'][
runtimenode.thread_id][
runtimenode.type]['times'].append(
(runtimenode.start_ns, runtimenode.end_ns))
for devicenode in runtimenode.device_node:
self.has_device = True
self.events_per_stage[stage_name]["GPU"][stage_idx][
devicenode.stream_id][
devicenode.type]['events'].append(devicenode)
self.events_per_stage[stage_name]["GPU"][stage_idx][
devicenode.stream_id][
devicenode.type]['times'].append(
(devicenode.start_ns, devicenode.end_ns))
self.events_per_stage[stage_name]["GPU"]['ALL'][
devicenode.stream_id][
devicenode.type]['events'].append(devicenode)
self.events_per_stage[stage_name]["GPU"]['ALL'][
devicenode.stream_id][
devicenode.type]['times'].append(
(devicenode.start_ns, devicenode.end_ns))
if current_node.type == 'Forward' or current_node.type == 'UserDefined':
continue
node_type = current_node.type
if node_type == 'PythonUserDefined':
node_type = 'UserDefined'
self.events_per_stage[stage_name]["CPU"][stage_idx][
current_node.thread_id][node_type]['events'].append(
current_node)
self.events_per_stage[stage_name]["CPU"][stage_idx][
current_node.thread_id][node_type]['times'].append(
(current_node.start_ns, current_node.end_ns))
self.events_per_stage[stage_name]["CPU"]['ALL'][
current_node.thread_id][node_type]['events'].append(
current_node)
self.events_per_stage[stage_name]["CPU"]['ALL'][
current_node.thread_id][node_type]['times'].append(
(current_node.start_ns, current_node.end_ns))
else:
for runtimenode in node.runtime_node:
self.events_per_stage[stage_name]["CPU"][stage_idx][
runtimenode.thread_id][runtimenode.type]['events'].append(
runtimenode)
self.events_per_stage[stage_name]["CPU"][stage_idx][
runtimenode.thread_id][runtimenode.type]['times'].append(
(runtimenode.start_ns, runtimenode.end_ns))
self.events_per_stage[stage_name]["CPU"]['ALL'][
runtimenode.thread_id][runtimenode.type]['events'].append(
runtimenode)
self.events_per_stage[stage_name]["CPU"]['ALL'][
runtimenode.thread_id][runtimenode.type]['times'].append(
(runtimenode.start_ns, runtimenode.end_ns))
for devicenode in runtimenode.device_node:
self.has_device = True
self.events_per_stage[stage_name]["GPU"][stage_idx][
devicenode.stream_id][
devicenode.type]['events'].append(devicenode)
self.events_per_stage[stage_name]["GPU"][stage_idx][
devicenode.stream_id][devicenode.type]['times'].append(
(devicenode.start_ns, devicenode.end_ns))
self.events_per_stage[stage_name]["GPU"]['ALL'][
devicenode.stream_id][
devicenode.type]['events'].append(devicenode)
self.events_per_stage[stage_name]["GPU"]['ALL'][
devicenode.stream_id][devicenode.type]['times'].append(
(devicenode.start_ns, devicenode.end_ns))
def _parse_events(self, nodetrees):
node_wrapped_trees = rebuild_node_trees(nodetrees)
node_wrapped_threadlist = traverse_tree(node_wrapped_trees)
# analyse user-defined summary
for threadid, wrapped_nodes in node_wrapped_threadlist.items():
for wrapped_node in wrapped_nodes[1:]: # skip root node
if wrapped_node.type == 'PythonUserDefined':
self.add_userdefined_item(wrapped_node)
        # analyse all events per stage
thread_count = 0
for threadid, root_wrapped_node in node_wrapped_trees.items():
thread_count += 1
wrapped_profiler_step_nodes = []
for wrapped_node in root_wrapped_node.children_node:
wrapped_profiler_step_nodes.append(wrapped_node)
self.stage_nums = 0
current_stage_idx = None
for wrapped_profiler_step_node in wrapped_profiler_step_nodes:
if wrapped_profiler_step_node.type == 'ProfileStep':
self.process_id = wrapped_profiler_step_node.process_id
stage_idx = wrapped_profiler_step_node.name.split('#')[1]
total_time = 0
accumulated_stage_time = 0
if thread_count == 1:
self.add_model_perspective_item(
wrapped_profiler_step_node)
self.model_perspective_items['ProfileStep'].cpu_times[
stage_idx] = wrapped_profiler_step_node.cpu_time
total_time = wrapped_profiler_step_node.cpu_time
self.stage_nums += 1
for stage_wrapped_node in wrapped_profiler_step_node.children_node:
if thread_count == 1:
self.add_model_perspective_item(stage_wrapped_node)
if stage_wrapped_node.type in StageType:
self.model_perspective_items[
stage_wrapped_node.type].cpu_times[
stage_idx] = stage_wrapped_node.cpu_time
if stage_wrapped_node.type in StageType:
accumulated_stage_time += stage_wrapped_node.cpu_time
self._fill_stage_events(stage_wrapped_node, stage_idx)
if 'Other' not in self.model_perspective_items:
self.model_perspective_items[
'Other'] = ModelPerspectiveItem('Other')
if thread_count == 1:
self.model_perspective_items['Other'].add_cpu_time(
total_time - accumulated_stage_time)
self.model_perspective_items['Other'].cpu_times[
stage_idx] = total_time - accumulated_stage_time
self._fill_stage_events(
wrapped_profiler_step_node,
stage_idx,
should_recursive=False)
else:
self._fill_stage_events(wrapped_profiler_step_node,
current_stage_idx)
self._fill_stage_events(
root_wrapped_node, current_stage_idx, should_recursive=False)
def add_userdefined_item(self, userdefined_node):
if userdefined_node.name not in self.userdefined_items:
self.userdefined_items[userdefined_node.name] = GeneralItem(
userdefined_node.name)
self.userdefined_items[userdefined_node.name].add_item(
userdefined_node)
def add_memory_manipulation_item(self, memory_manipulation_node):
if memory_manipulation_node.name not in self.memory_manipulation_items:
self.memory_manipulation_items[
memory_manipulation_node.name] = GeneralItem(
memory_manipulation_node.name)
self.memory_manipulation_items[memory_manipulation_node.name].add_item(
memory_manipulation_node)
def add_model_perspective_item(self, model_perspective_node):
if model_perspective_node.type == 'Forward':
name = 'Forward'
elif model_perspective_node.type == 'Backward':
name = 'Backward'
elif model_perspective_node.type == 'Optimization':
name = 'Optimization'
elif model_perspective_node.type == 'Dataloader':
name = 'Dataloader'
elif model_perspective_node.type == 'ProfileStep':
name = 'ProfileStep'
else:
return
if name not in self.model_perspective_items:
self.model_perspective_items[name] = ModelPerspectiveItem(name)
self.model_perspective_items[name].add_cpu_time(
model_perspective_node.cpu_time)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
class TraceParser:
def __init__(self):
pass
def parse(self, content):
self.content = content
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']
def sum_ranges(ranges):
result = 0
for time_range in ranges:
result += (time_range[1] - time_range[0])
return result
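# Example: sum_ranges([(0, 5), (10, 12)]) returns 7, the summed duration of
# the (start_ns, end_ns) ranges.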
def merge_self_ranges(src_ranges, is_sorted=False):
merged_ranges = []
if len(src_ranges) > 0:
if not is_sorted:
src_ranges.sort(key=lambda x: x[0])
cur_indx = 0
merged_ranges.append((src_ranges[cur_indx][0],
src_ranges[cur_indx][1]))
for cur_indx in range(1, len(src_ranges)):
if src_ranges[cur_indx][1] > merged_ranges[-1][1]:
if src_ranges[cur_indx][0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0],
src_ranges[cur_indx][1])
else:
merged_ranges.append((src_ranges[cur_indx][0],
src_ranges[cur_indx][1]))
return merged_ranges
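# Example: merge_self_ranges([(3, 7), (0, 4), (10, 12)]) sorts the ranges and
# coalesces overlaps, returning [(0, 7), (10, 12)].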
def merge_ranges(range_list1, range_list2, is_sorted=False): # noqa:C901
merged_ranges = []
if not is_sorted:
range_list1 = merge_self_ranges(range_list1)
range_list2 = merge_self_ranges(range_list2)
len1 = len(range_list1)
len2 = len(range_list2)
if len1 == 0 and len2 == 0:
return merged_ranges
elif len1 == 0:
return range_list2
elif len2 == 0:
return range_list1
else:
indx1 = 0
indx2 = 0
range1 = range_list1[indx1]
range2 = range_list2[indx2]
if range1[0] < range2[0]:
merged_ranges.append(range1)
indx1 += 1
else:
merged_ranges.append(range2)
indx2 += 1
while indx1 < len1 and indx2 < len2:
range1 = range_list1[indx1]
range2 = range_list2[indx2]
if range1[0] < range2[0]:
if range1[1] > merged_ranges[-1][1]:
if range1[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
else:
merged_ranges.append((range1[0], range1[1]))
indx1 += 1
else:
indx1 += 1
else:
if range2[1] > merged_ranges[-1][1]:
if range2[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
else:
merged_ranges.append((range2[0], range2[1]))
indx2 += 1
else:
indx2 += 1
while indx1 < len1:
range1 = range_list1[indx1]
if range1[1] > merged_ranges[-1][1]:
if range1[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
else:
merged_ranges.append((range1[0], range1[1]))
indx1 += 1
else:
indx1 += 1
while indx2 < len2:
range2 = range_list2[indx2]
if range2[1] > merged_ranges[-1][1]:
if range2[0] <= merged_ranges[-1][1]:
merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
else:
merged_ranges.append((range2[0], range2[1]))
indx2 += 1
else:
indx2 += 1
return merged_ranges
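# Example: merging two already-merged (sorted, non-overlapping) lists,
# merge_ranges([(0, 4), (10, 12)], [(3, 7)], is_sorted=True) returns
# [(0, 7), (10, 12)].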
def intersection_ranges(range_list1, range_list2, is_sorted=False):
result_range = []
if len(range_list1) == 0 or len(range_list2) == 0:
return result_range
if not is_sorted:
range_list1 = merge_self_ranges(range_list1)
range_list2 = merge_self_ranges(range_list2)
len1 = len(range_list1)
len2 = len(range_list2)
indx1 = 0
indx2 = 0
range1 = range_list1[indx1]
range2 = range_list2[indx2]
while indx1 < len1 and indx2 < len2:
if range2[1] <= range1[0]:
indx2 += 1
if indx2 == len2:
break
range2 = range_list2[indx2]
elif range2[0] <= range1[0] and range2[1] < range1[1]:
assert (range2[1] > range1[0])
result_range.append((range1[0], range2[1]))
range1 = (range2[1], range1[1])
indx2 += 1
if indx2 == len2:
break
range2 = range_list2[indx2]
elif range2[0] <= range1[0]:
assert (range2[1] >= range1[1])
result_range.append(range1)
range2 = (range1[1], range2[1])
indx1 += 1
if indx1 == len1:
break
range1 = range_list1[indx1]
elif range2[1] < range1[1]:
assert (range2[0] > range1[0])
result_range.append(range2)
range1 = (range2[1], range1[1])
indx2 += 1
if indx2 == len2:
break
range2 = range_list2[indx2]
elif range2[0] < range1[1]:
assert (range2[1] >= range1[1])
result_range.append((range2[0], range1[1]))
range2 = (range1[1], range2[1])
indx1 += 1
if indx1 == len1:
break
range1 = range_list1[indx1]
else:
assert (range2[0] >= range1[1])
indx1 += 1
if indx1 == len1:
break
range1 = range_list1[indx1]
return result_range
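# Example: intersection_ranges([(0, 10)], [(3, 5), (8, 12)], is_sorted=True)
# returns [(3, 5), (8, 10)], the parts of the first list that are covered by
# the second.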
def subtract_ranges(range_list1, range_list2, is_sorted=False):
result_range = []
if not is_sorted:
range_list1 = merge_self_ranges(range_list1)
range_list2 = merge_self_ranges(range_list2)
if len(range_list1) == 0:
return result_range
if len(range_list2) == 0:
return range_list1
len1 = len(range_list1)
len2 = len(range_list2)
indx1 = 0
indx2 = 0
range1 = range_list1[indx1]
range2 = range_list2[indx2]
while indx1 < len(range_list1):
if indx2 == len(range_list2):
result_range.append(range1)
indx1 += 1
if indx1 == len1:
break
range1 = range_list1[indx1]
elif range2[1] <= range1[0]:
indx2 += 1
if indx2 != len2:
range2 = range_list2[indx2]
elif range2[0] <= range1[0] and range2[1] < range1[1]:
range1 = (range2[1], range1[1])
indx2 += 1
if indx2 != len2:
range2 = range_list2[indx2]
elif range2[0] <= range1[0]:
assert (range2[1] >= range1[1])
range2 = (range1[1], range2[1])
indx1 += 1
if indx1 != len1:
range1 = range_list1[indx1]
elif range2[0] < range1[1]:
assert (range2[0] > range1[0])
result_range.append((range1[0], range2[0]))
range1 = (range2[0], range1[1])
else:
assert (range2[0] >= range1[1])
result_range.append(range1)
indx1 += 1
if indx1 != len1:
range1 = range_list1[indx1]
return result_range
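# Example: subtract_ranges([(0, 10)], [(3, 5)], is_sorted=True) returns
# [(0, 3), (5, 10)], the parts of the first list not covered by the second.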
class HostStatisticNode:
r'''
    Wrap the original node for calculating statistic metrics.
'''
def __init__(self, hostnode):
self.hostnode = hostnode
self.children_node = []
self.runtime_node = []
self.cpu_time = 0
self.self_cpu_time = 0
self.gpu_time = 0 # kernel time
self.self_gpu_time = 0
        self.general_gpu_time = 0  # besides kernel time, also includes gpu events like memcpy and memset
self.self_general_gpu_time = 0
self.is_terminal_operator_node = True
def cal_statistic(self):
for child in self.children_node:
child.cal_statistic()
if child.is_terminal_operator_node is False:
self.is_terminal_operator_node = False
for rt in self.runtime_node:
rt.cal_statistic()
self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
self.self_cpu_time = self.cpu_time
for child in self.children_node:
if child.type == 'Operator':
self.is_terminal_operator_node = False
self.gpu_time += child.gpu_time
self.general_gpu_time += child.general_gpu_time
self.self_cpu_time -= (child.end_ns - child.start_ns)
for rt in self.runtime_node:
self.self_cpu_time -= (rt.end_ns - rt.start_ns)
self.gpu_time += rt.gpu_time
self.self_gpu_time += rt.gpu_time
self.general_gpu_time += rt.general_gpu_time
self.self_general_gpu_time += rt.general_gpu_time
for device in self.hostnode.device_node:
if device.type == 'Kernel':
self.gpu_time += (device.end_ns - device.start_ns)
self.self_gpu_time += (device.end_ns - device.start_ns)
self.general_gpu_time += (device.end_ns - device.start_ns)
self.self_general_gpu_time += (device.end_ns - device.start_ns)
@property
def end_ns(self):
return self.hostnode.end_ns
@property
def start_ns(self):
return self.hostnode.start_ns
def __getattr__(self, name):
return getattr(self.hostnode, name)
def traverse_tree(nodetrees):
results = collections.defaultdict(list)
for thread_id, rootnode in nodetrees.items():
stack = []
stack.append(rootnode)
threadlist = results[thread_id]
while stack:
current_node = stack.pop()
threadlist.append(current_node)
for childnode in current_node.children_node:
stack.append(childnode)
return results
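# Illustrative sketch of traverse_tree (assumes only that nodes expose a
# children_node list; _Node below is a minimal stand-in, not the real
# profiler node type):
#
#     class _Node:
#         def __init__(self, children=()):
#             self.children_node = list(children)
#
#     root = _Node([_Node(), _Node()])
#     flat = traverse_tree({'thread-0': root})
#     # flat['thread-0'] holds root followed by its descendants (depth-first)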
def get_device_nodes(hostnode):
'''
Get all device nodes called in the time range of hostnode.
'''
stack = []
device_nodes = []
stack.append(hostnode)
while stack:
current_node = stack.pop()
for childnode in current_node.children_node:
stack.append(childnode)
for runtimenode in current_node.runtime_node:
for devicenode in runtimenode.device_node:
device_nodes.append(devicenode)
return device_nodes
def wrap_tree(nodetrees):
'''
    Use HostStatisticNode to wrap the original profiler result tree and calculate node statistic metrics.
'''
node_statistic_tree = {}
results = collections.defaultdict(list)
newresults = collections.defaultdict(list)
for thread_id, rootnode in nodetrees.items():
stack = []
stack.append(rootnode)
root_statistic_node = HostStatisticNode(rootnode)
newstack = []
newstack.append(root_statistic_node)
node_statistic_tree[thread_id] = root_statistic_node
threadlist = results[thread_id]
newthreadlist = newresults[thread_id]
while stack:
current_node = stack.pop()
threadlist.append(current_node)
current_statistic_node = newstack.pop()
newthreadlist.append(current_statistic_node)
for childnode in current_node.children_node:
stack.append(childnode)
child_statistic_node = HostStatisticNode(childnode)
current_statistic_node.children_node.append(
child_statistic_node)
newstack.append(child_statistic_node)
for runtimenode in current_node.runtime_node:
runtime_statistic_node = HostStatisticNode(runtimenode)
current_statistic_node.runtime_node.append(
runtime_statistic_node)
    # recursively calculate node statistic values
for thread_id, root_statistic_node in node_statistic_tree.items():
root_statistic_node.cal_statistic()
return node_statistic_tree, newresults
def rebuild_node_trees(nodetrees): # noqa:C901
template_root = None
    # First, find the tree which includes the Forward event.
for threadid, root in nodetrees.items():
has_find_template_root = False
template_root = HostStatisticNode(root)
for children in root.children_node:
if children.type == 'ProfileStep':
profiler_step_node = HostStatisticNode(children)
template_root.children_node.append(profiler_step_node)
has_find_template_root = True
for stage_node in children.children_node:
if stage_node.type in StageType:
profiler_step_node.children_node.append(
HostStatisticNode(stage_node))
else:
break
if has_find_template_root is True:
break
if template_root is None:
print('No profiler steps found, overview page will have no data.')
wrapped_tree = {}
for thread_id, rootnode in nodetrees.items():
has_find_template_root = False
for children in rootnode.children_node:
if children.type == 'ProfileStep':
has_find_template_root = True
break
unwrapped_stack = []
        wrapped_stack = []
root_statistic_node = HostStatisticNode(rootnode)
wrapped_tree[thread_id] = root_statistic_node
if has_find_template_root is False:
for profiler_step_node in template_root.children_node:
profiler_step_wrap_node = HostStatisticNode(
profiler_step_node.hostnode)
root_statistic_node.children_node.append(
profiler_step_wrap_node)
for stage_node in profiler_step_node.children_node:
stage_wrap_node = HostStatisticNode(stage_node.hostnode)
profiler_step_wrap_node.children_node.append(
stage_wrap_node)
            # insert nodes from the original root into the new stage nodes
            # algorithm: post-order traversal of the tree
stack = []
flag_stack = []
post_order_nodes = []
stack.append(root_statistic_node)
flag_stack.append(0)
while stack:
current_node = stack.pop()
flag = flag_stack.pop()
if flag == 0:
stack.append(current_node)
flag_stack.append(1)
for children_node in reversed(current_node.children_node):
stack.append(children_node)
flag_stack.append(0)
else:
post_order_nodes.append(current_node)
            # traverse post_order_nodes and insert each node at the right position
for runtimenode in rootnode.runtime_node:
runtime_wrapped_node = HostStatisticNode(runtimenode)
root_statistic_node.runtime_node.append(runtime_wrapped_node)
for node in rootnode.children_node:
unwrapped_stack.append(node)
for wrapped_node in post_order_nodes:
if node.start_ns >= wrapped_node.start_ns and node.end_ns <= wrapped_node.end_ns:
child_wrapped_node = HostStatisticNode(node)
                        wrapped_stack.append(child_wrapped_node)
wrapped_node.children_node.append(child_wrapped_node)
break
else:
unwrapped_stack.append(rootnode)
            wrapped_stack.append(root_statistic_node)
while unwrapped_stack:
current_node = unwrapped_stack.pop()
                current_wrapped_node = wrapped_stack.pop()
for childnode in current_node.children_node:
unwrapped_stack.append(childnode)
child_wrapped_node = HostStatisticNode(childnode)
current_wrapped_node.children_node.append(child_wrapped_node)
                    wrapped_stack.append(child_wrapped_node)
for runtimenode in current_node.runtime_node:
runtime_wrapped_node = HostStatisticNode(runtimenode)
current_wrapped_node.runtime_node.append(runtime_wrapped_node)
    # recursively calculate node statistic values
for thread_id, root_wrapped_node in wrapped_tree.items():
root_wrapped_node.cal_statistic()
return wrapped_tree
def format_time(time, unit='ms', inf_subs='-'):
r"""
Transform time in ns to time in unit.
"""
if time == float('inf'):
return inf_subs
else:
result = float(time)
if unit == 's':
result /= 1e9
elif unit == 'ms':
result /= 1e6
elif unit == 'us':
result /= 1e3
# return '{:.2f}'.format(result)
return round(result, 2)
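# Example: format_time(2500000, unit='ms') returns 2.5, and
# format_time(float('inf')) returns '-' (the inf_subs placeholder).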
def format_ratio(ratio):
r"""
Transform ratio within [0, 1] to percentage presentation.
"""
# return '{:.2f}'.format(ratio * 100)
return round(ratio * 100, 2)
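# Example: format_ratio(0.3456) returns 34.56.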
def format_float(float_data):
return round(float_data, 2)
def format_memory(memory, memory_unit='KB'):
result = float(memory)
if memory_unit == 'GB':
result /= (1024 * 1024 * 1024)
elif memory_unit == 'MB':
result /= (1024 * 1024)
elif memory_unit == 'KB':
result /= 1024
# return '{:.2f}'.format(result)
return round(result, 2)
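# Example: format_memory(3 * 1024 * 1024, memory_unit='MB') returns 3.0.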
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict
from collections import OrderedDict
from .parser.distributed_parser import DistributedParser
from .parser.kernel_parser import KernelParser
from .parser.memory_parser import MemoryParser
from .parser.operator_parser import OperatorParser
from .parser.overview_parser import CPUType
from .parser.overview_parser import GPUType
from .parser.overview_parser import OverviewParser
from .parser.trace_parser import TraceParser
from .parser.utils import format_float
from .parser.utils import format_memory
from .parser.utils import format_ratio
from .parser.utils import format_time
from .parser.utils import traverse_tree
def filter_type(node_trees):
nodelists = traverse_tree(node_trees)
for thread_id, nodelist in nodelists.items():
for node in nodelist:
if not isinstance(node.type, str):
node.type = str(node.type).split('.')[1]
class ProfilerData:
'''
    Hold all parsed data to serve user requests.
'''
def __init__(self, run, worker_name, span_indx, profiler_result):
self.run = run
self.worker_name = worker_name
self.span_indx = span_indx
self.node_trees = profiler_result.get_data()
filter_type(self.node_trees)
self.extra_infos = profiler_result.get_extra_info()
self.span_idx = profiler_result.get_span_idx()
self.device_infos = profiler_result.get_device_infos()
self.overview_parser = None
self.operator_parser = None
self.distributed_parser = None
self.memory_parser = None
self.kernel_parser = None
self.trace_parser = None
self.has_gpu = profiler_result.has_device()
if profiler_result.has_host():
# overview parser
self.overview_parser = OverviewParser()
self.overview_parser.parse(self.node_trees)
self.merged_events_per_stage = self.overview_parser.merged_events_per_stage
self.model_perspective_items = self.overview_parser.model_perspective_items
self.userdefined_items = self.overview_parser.userdefined_items
self.gpu_ulitization = self.overview_parser.gpu_ulitization
self.process_id = self.overview_parser.process_id
# operator parser
self.operator_parser = OperatorParser()
self.operator_parser.parse(self.node_trees)
self.operator_items = self.operator_parser.items
self.operator_items_with_input_shape = self.operator_parser.items_with_input_shape
# distributed parser
if profiler_result.has_device():
self.distributed_parser = DistributedParser()
self.distributed_parser.parse(self.node_trees)
self.distributed_time = self.distributed_parser.steps_time
if profiler_result.has_memory():
# memory parser
self.memory_parser = MemoryParser()
self.memory_parser.parse(self.node_trees)
self.memory_curve = self.memory_parser.memory_curve
self.allocated_items = self.memory_parser.allocated_items
self.reserved_items = self.memory_parser.reserved_items
self.paired_events = self.memory_parser.paired_events
self.size_ranges = self.memory_parser.size_ranges
self.peak_allocation_values = self.memory_parser.peak_allocation_values
if profiler_result.has_device():
# kernel parser
self.kernel_parser = KernelParser()
self.kernel_parser.parse(traverse_tree(self.node_trees))
self.kernel_items = self.kernel_parser.kernel_items
self.kernel_items_with_op_name_attributes = self.kernel_parser.kernel_items_with_op_name_attributes
self.occupancy = self.kernel_parser.occupancy
self.sm_efficiency = self.kernel_parser.sm_efficiency
self.tensorcore_ratio = self.kernel_parser.tensor_core_ratio
self.gpu_ids = self.kernel_parser.gpu_ids
# trace parser
self.trace_parser = TraceParser()
self.trace_parser.parse(profiler_result.content)
def get_views(self):
'''
Return available views this profile data can provide.
'''
views = []
if self.overview_parser:
if self.overview_parser.has_forward:
views.append('Overview')
if self.operator_parser:
if self.operator_items:
views.append('Operator')
if self.kernel_parser:
if self.kernel_items:
views.append('GPU Kernel')
if self.memory_parser:
if self.memory_curve:
views.append('Memory')
if self.distributed_parser:
if self.distributed_time:
views.append('Distributed')
views.append('Trace')
return views
def get_device_infos(self):
if not self.overview_parser.has_device:
device_type = 'CPU'
return {
"device_type": device_type,
"CPU": {
"process_utilization":
format_ratio(
float(self.extra_infos["Process Cpu Utilization"])),
"system_utilization":
format_ratio(
float(self.extra_infos["System Cpu Utilization"]))
}
}
else:
device_type = 'GPU'
gpu_id = int(next(iter(self.gpu_ids)))
return {
"device_type": device_type,
"CPU": {
"process_utilization":
format_ratio(
float(self.extra_infos["Process Cpu Utilization"])),
"system_utilization":
format_ratio(
float(self.extra_infos["System Cpu Utilization"]))
},
"GPU": {
"name":
self.device_infos[gpu_id]['name'],
"memory":
"{} GB".format(
format_memory(
self.device_infos[gpu_id]['totalGlobalMem'],
'GB')),
"compute_capability":
'{}.{}'.format(self.device_infos[gpu_id]['computeMajor'],
self.device_infos[gpu_id]['computeMinor']),
"utilization":
format_ratio(self.gpu_ulitization),
"sm_efficiency":
format_ratio(
self.sm_efficiency /
self.model_perspective_items['ProfileStep'].cpu_time),
"achieved_occupancy":
format_ratio(self.occupancy),
"tensor_core_percentage":
format_ratio(self.tensorcore_ratio)
}
}
def get_model_perspective(self, time_unit):
'''
Get total cpu and gpu statistics for model perspective of each profiler step.
'''
data = OrderedDict()
data['column_name'] = [
"name", "calls", "total_time", "avg_time", "max_time", "min_time",
"ratio"
]
data['cpu'] = []
if self.overview_parser.has_device:
data['gpu'] = []
total_cpu_time = self.model_perspective_items['ProfileStep'].cpu_time
total_gpu_time = self.model_perspective_items['ProfileStep'].gpu_time
for stage_name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization', 'Other'
]:
if stage_name in self.model_perspective_items:
cpu_stage_data = OrderedDict()
cpu_stage_data['name'] = stage_name
cpu_stage_data['calls'] = self.model_perspective_items[
stage_name].call
cpu_stage_data['total_time'] = format_time(
self.model_perspective_items[stage_name].cpu_time,
time_unit)
cpu_stage_data['avg_time'] = format_time(
self.model_perspective_items[stage_name].avg_cpu_time,
time_unit)
cpu_stage_data['max_time'] = format_time(
self.model_perspective_items[stage_name].max_cpu_time,
time_unit)
cpu_stage_data['min_time'] = format_time(
self.model_perspective_items[stage_name].min_cpu_time,
time_unit,
inf_subs=0)
cpu_stage_data['ratio'] = format_ratio(
self.model_perspective_items[stage_name].cpu_time /
total_cpu_time)
if self.overview_parser.has_device:
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = stage_name
gpu_stage_data['calls'] = self.model_perspective_items[
stage_name].call
gpu_stage_data['total_time'] = format_time(
self.model_perspective_items[stage_name].gpu_time,
time_unit)
gpu_stage_data['avg_time'] = format_time(
self.model_perspective_items[stage_name].avg_gpu_time,
time_unit)
gpu_stage_data['max_time'] = format_time(
self.model_perspective_items[stage_name].max_gpu_time,
time_unit)
gpu_stage_data['min_time'] = format_time(
self.model_perspective_items[stage_name].min_gpu_time,
time_unit,
inf_subs=0)
gpu_stage_data['ratio'] = format_ratio(
self.model_perspective_items[stage_name].gpu_time /
total_gpu_time)
data['cpu'].append(cpu_stage_data)
if self.overview_parser.has_device:
data['gpu'].append(gpu_stage_data)
return data
def get_model_perspective_perstep(self, device_type, time_unit):
try:
data = OrderedDict()
data['order'] = []
steps = [
int(step_id) for step_id in
self.model_perspective_items['ProfileStep'].cpu_times.keys()
]
steps = sorted(steps)
data['steps'] = steps
for stage_name in [
'Dataloader', 'Forward', 'Backward', 'Optimization',
'Other'
]:
if stage_name not in self.model_perspective_items:
continue
data['order'].append(stage_name)
data[stage_name] = []
for stage_idx in steps:
stage_idx = str(stage_idx)
if device_type == 'cpu':
if stage_idx in self.model_perspective_items[
stage_name].cpu_times:
data[stage_name].append(
format_time(
self.model_perspective_items[stage_name].
cpu_times[stage_idx], time_unit))
else:
data[stage_name].append(0)
else:
if stage_idx in self.model_perspective_items[
stage_name].gpu_times:
data[stage_name].append(
format_time(
self.model_perspective_items[stage_name].
gpu_times[stage_idx], time_unit))
else:
data[stage_name].append(0)
except Exception as e:
print('error in get_model_perspective_perstep', e)
new_data = {}
new_data['order'] = data['order']
new_data['steps'] = data['steps']
new_data['data'] = []
for name in new_data['order']:
new_data['data'].append(data[name])
return new_data
def get_event_type_perspective(self, device_type, time_unit):
data = OrderedDict()
data['order'] = []
if device_type == 'cpu':
for event_type in CPUType:
event_type_data = {}
event_type_data['calling_times'] = {}
event_type_data['calling_times']['key'] = []
event_type_data['calling_times']['value'] = []
event_type_data['durations'] = {}
event_type_data['durations']['key'] = []
event_type_data['durations']['value'] = []
event_type_data['ratios'] = {}
event_type_data['ratios']['key'] = []
event_type_data['ratios']['value'] = []
for stage_name in [
'Dataloader', 'Forward', 'Backward', 'Optimization',
'Other'
]:
if stage_name in self.merged_events_per_stage:
if event_type in self.merged_events_per_stage[
stage_name]['CPU']['ALL']:
event_type_data['calling_times']['key'].append(
stage_name)
event_type_data['durations']['key'].append(
stage_name)
event_type_data['ratios']['key'].append(stage_name)
event_type_data['calling_times']['value'].append(
self.merged_events_per_stage[stage_name]['CPU']
['ALL'][event_type]['calls'])
event_type_data['durations']['value'].append(
format_time(
self.merged_events_per_stage[stage_name]
['CPU']['ALL'][event_type]['total_time'],
time_unit))
event_type_data['ratios']['value'].append(
format_ratio(
self.merged_events_per_stage[stage_name]
['CPU']['ALL'][event_type]['total_time'] /
self.merged_events_per_stage['ProfileStep']
['CPU']['ALL'][event_type]['total_time']))
if event_type_data['calling_times']['key']:
data[event_type] = event_type_data
data['order'].append(event_type)
else:
for event_type in GPUType:
event_type_data = {}
event_type_data['calling_times'] = {}
event_type_data['calling_times']['key'] = []
event_type_data['calling_times']['value'] = []
event_type_data['durations'] = {}
event_type_data['durations']['key'] = []
event_type_data['durations']['value'] = []
event_type_data['ratios'] = {}
event_type_data['ratios']['key'] = []
event_type_data['ratios']['value'] = []
for stage_name in [
'Dataloader', 'Forward', 'Backward', 'Optimization',
'Other'
]:
if stage_name in self.merged_events_per_stage:
if event_type in self.merged_events_per_stage[
stage_name]['GPU']['ALL']:
event_type_data['calling_times']['key'].append(
stage_name)
event_type_data['durations']['key'].append(
stage_name)
event_type_data['ratios']['key'].append(stage_name)
event_type_data['calling_times']['value'].append(
self.merged_events_per_stage[stage_name]['GPU']
['ALL'][event_type]['calls'])
event_type_data['durations']['value'].append(
format_time(
self.merged_events_per_stage[stage_name]
['GPU']['ALL'][event_type]['total_time'],
time_unit))
event_type_data['ratios']['value'].append(
format_ratio(
self.merged_events_per_stage[stage_name]
['GPU']['ALL'][event_type]['total_time'] /
self.merged_events_per_stage['ProfileStep']
['GPU']['ALL'][event_type]['total_time']))
if event_type_data['calling_times']['key']:
data[event_type] = event_type_data
data['order'].append(event_type)
return data
def get_event_type_model_perspective(self, time_unit): # noqa: C901
data = OrderedDict()
data['order'] = []
data['phase_type'] = []
try:
for event_type in CPUType:
if event_type in self.merged_events_per_stage['ProfileStep'][
'CPU']['ALL']:
data['order'].append(event_type)
data[event_type] = []
if self.overview_parser.has_device:
for event_type in GPUType:
if event_type in self.merged_events_per_stage[
'ProfileStep']['GPU']['ALL']:
data['order'].append(event_type)
data[event_type] = []
for stage_name in [
'ProfileStep', 'Dataloader', 'Forward', 'Backward',
'Optimization', 'Other'
]:
if stage_name in self.merged_events_per_stage:
data['phase_type'].append(stage_name)
for event_type in data['order']:
if event_type in CPUType:
if event_type in self.merged_events_per_stage[
stage_name]['CPU']['ALL']:
data[event_type].append(
format_time(
self.merged_events_per_stage[
stage_name]['CPU']['ALL']
[event_type]['total_time'], time_unit))
else:
data[event_type].append(0)
elif event_type in GPUType:
if event_type in self.merged_events_per_stage[
stage_name]['GPU']['ALL']:
data[event_type].append(
format_time(
self.merged_events_per_stage[
stage_name]['GPU']['ALL']
[event_type]['total_time'], time_unit))
else:
data[event_type].append(0)
newdata = OrderedDict()
newdata['order'] = data['order']
newdata['phase_type'] = data['phase_type']
newdata['data'] = []
for key in newdata['order']:
newdata['data'].append(data[key])
except Exception as e:
print('error in get_event_type_model_perspective', e)
return newdata
def get_userdefined_perspective(self, time_unit):
data = OrderedDict()
if self.overview_parser.has_device:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio', 'gpu_total_time',
'gpu_avg_time', 'gpu_max_time', 'gpu_min_time', 'gpu_ratio'
]
data['has_gpu'] = True
else:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio'
]
data['has_gpu'] = False
data['events'] = []
total_cpu_time = 0
total_gpu_time = 0
for name, event in self.userdefined_items.items():
total_cpu_time += event.cpu_time
total_gpu_time += event.general_gpu_time
for name, event in self.userdefined_items.items():
if self.overview_parser.has_device:
data['events'].append({
"name":
name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time, time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time, time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time, time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time / total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
})
return data
def get_operator_pie(self, topk, time_unit='ms'):
data = OrderedDict()
data['column_name'] = [
"name", "calls", "total_time", "avg_time", "max_time", "min_time",
"ratio"
]
data['cpu'] = []
if self.has_gpu:
data['gpu'] = []
gpu_sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
cpu_sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
if topk <= 0:
cpu_items = cpu_sorted_items
if self.has_gpu:
gpu_items = gpu_sorted_items
else:
cpu_items = cpu_sorted_items[:topk]
if self.has_gpu:
gpu_items = gpu_sorted_items[:topk]
total_cpu_time = 0.0
total_gpu_time = 0.0
for op_name, item in cpu_items:
total_cpu_time += item.cpu_time
if self.has_gpu:
for op_name, item in gpu_items:
total_gpu_time += item.general_gpu_time
for op_name, item in cpu_items:
cpu_stage_data = OrderedDict()
cpu_stage_data['name'] = op_name
cpu_stage_data['calls'] = item.call
cpu_stage_data['total_time'] = format_time(item.cpu_time,
time_unit)
cpu_stage_data['avg_time'] = format_time(item.avg_cpu_time,
time_unit)
cpu_stage_data['max_time'] = format_time(item.max_cpu_time,
time_unit)
cpu_stage_data['min_time'] = format_time(item.min_cpu_time,
time_unit)
cpu_stage_data['ratio'] = format_ratio(
item.cpu_time / total_cpu_time)
data['cpu'].append(cpu_stage_data)
if self.has_gpu:
for op_name, item in gpu_items:
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = op_name
gpu_stage_data['calls'] = item.call
gpu_stage_data['total_time'] = format_time(
item.general_gpu_time, time_unit)
gpu_stage_data['avg_time'] = format_time(
item.avg_general_gpu_time, time_unit)
gpu_stage_data['max_time'] = format_time(
item.max_general_gpu_time, time_unit)
gpu_stage_data['min_time'] = format_time(
item.min_general_gpu_time, time_unit)
gpu_stage_data['ratio'] = format_ratio(
item.general_gpu_time / total_gpu_time)
data['gpu'].append(gpu_stage_data)
return data
def get_operator_pie_expand( # noqa: C901
self, topk, device_type, time_unit):
data = OrderedDict()
data['order'] = []
data['phase_type'] = []
data['data'] = []
if device_type == 'cpu':
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
else:
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
if topk <= 0 or topk >= 20:
items = sorted_items[:20]
other_items = sorted_items[20:]
else:
items = sorted_items[:topk]
other_items = []
data['order'].extend(
['infer_shape', 'compute', 'node_creation', 'others'])
inner_op_data = defaultdict(list)
for op_name, event in items:
data['phase_type'].append(op_name)
innerop_knownsub_times = 0
have_innerop_name = set()
for innerop_name, item in event.operator_inners.items():
if 'infer_shape' in innerop_name or 'infer_meta' in innerop_name:
innerop_name = 'infer_shape'
elif 'compute' in innerop_name:
innerop_name = 'compute'
elif 'node_creation' in innerop_name:
innerop_name = 'node_creation'
else:
continue
have_innerop_name.add(innerop_name)
if device_type == 'cpu':
inner_op_data[innerop_name].append(
format_time(item.cpu_time, time_unit))
innerop_knownsub_times += item.cpu_time
else:
inner_op_data[innerop_name].append(
format_time(item.general_gpu_time, time_unit))
innerop_knownsub_times += item.general_gpu_time
if device_type == 'cpu':
inner_op_data['others'].append(
format_time(event.cpu_time - innerop_knownsub_times,
time_unit))
else:
inner_op_data['others'].append(
format_time(
event.general_gpu_time - innerop_knownsub_times,
time_unit))
have_innerop_name.add('others')
for innerop_name in data['order']:
if innerop_name in have_innerop_name:
continue
else:
inner_op_data[innerop_name].append(0)
if other_items:
innerop_knownsub_times = 0
total_event_times = 0
data['phase_type'].append('others')
others_time = defaultdict(float)
for op_name, event in other_items:
for innerop_name, item in event.operator_inners.items():
if 'infer_shape' in innerop_name:
innerop_name = 'infer_shape'
elif 'compute' in innerop_name:
innerop_name = 'compute'
elif 'node_creation' in innerop_name:
innerop_name = 'node_creation'
else:
continue
if device_type == 'cpu':
others_time[innerop_name] += item.cpu_time
innerop_knownsub_times += item.cpu_time
else:
others_time[innerop_name] += item.general_gpu_time
innerop_knownsub_times += item.general_gpu_time
if device_type == 'cpu':
total_event_times += event.cpu_time
else:
total_event_times += event.general_gpu_time
others_time['others'] = total_event_times - innerop_knownsub_times
for innerop_name in data['order']:
if innerop_name not in others_time:
others_time[innerop_name] = 0.0
inner_op_data[innerop_name].append(
format_time(others_time[innerop_name], time_unit))
for innerop_name in data['order']:
data['data'].append(inner_op_data[innerop_name])
return data
def get_operator_table( # noqa: C901 Todo: Optimize code
self,
group_by='op_name',
search_name=None,
time_unit='ms'):
def get_children_data(event):
datas = []
for innerop_name, item in event.operator_inners.items():
if item.cpu_time == 0:
cpu_ratio = 0
else:
cpu_ratio = float(item.cpu_time) / event.cpu_time
if item.general_gpu_time == 0:
gpu_ratio = 0
else:
gpu_ratio = float(
item.general_gpu_time) / event.general_gpu_time
data = {
"name":
innerop_name,
"calls":
item.call,
"cpu_total_time":
format_time(item.cpu_time, time_unit),
"cpu_avg_time":
format_time(item.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(item.max_cpu_time, time_unit),
"cpu_min_time":
format_time(item.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(cpu_ratio),
"gpu_total_time":
format_time(item.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(item.avg_general_gpu_time, time_unit),
"gpu_max_time":
format_time(item.max_general_gpu_time, time_unit),
"gpu_min_time":
format_time(item.min_general_gpu_time, time_unit),
"gpu_ratio":
format_ratio(gpu_ratio)
}
datas.append(data)
return datas
data = OrderedDict()
data['events'] = []
total_cpu_time = 0
total_gpu_time = 0
for name, event in self.operator_items.items():
total_cpu_time += event.cpu_time
total_gpu_time += event.general_gpu_time
if not search_name:
if group_by == 'op_name':
if self.has_gpu:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio',
'gpu_total_time', 'gpu_avg_time', 'gpu_max_time',
'gpu_min_time', 'gpu_ratio'
]
data['has_gpu'] = True
else:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio'
]
data['has_gpu'] = False
if self.has_gpu:
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
else:
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
for name, event in sorted_items:
if self.has_gpu:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
name,
"calls":
event.call,
"children":
children_events,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
name,
"calls":
event.call,
"children":
children_events,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
if self.has_gpu:
data['column_name'] = [
'name', 'calls', 'input_shape', 'cpu_total_time',
'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
'cpu_ratio', 'gpu_total_time', 'gpu_avg_time',
'gpu_max_time', 'gpu_min_time', 'gpu_ratio'
]
data['has_gpu'] = True
else:
data['column_name'] = [
'name', 'calls', 'input_shape', 'cpu_total_time',
'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
'cpu_ratio'
]
data['has_gpu'] = False
new_arrange_data = {}
for op_name, items_with_input_shape in self.operator_items_with_input_shape.items(
):
for input_shape, item in items_with_input_shape.items():
new_arrange_data[(op_name, input_shape)] = item
if self.has_gpu:
sorted_items = sorted(
new_arrange_data.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
else:
sorted_items = sorted(
new_arrange_data.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
for (name, input_shape), event in sorted_items:
if not input_shape:
shape_string = []
else:
shapes = input_shape.split('\t')[:-1]
shape_string = [
'{}:{}'.format(*shape.split('-'))
for shape in shapes
]
if self.has_gpu:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
name,
"calls":
event.call,
"children":
children_events,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
name,
"calls":
event.call,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
name,
"calls":
event.call,
"children":
children_events,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
name,
"calls":
event.call,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
if self.has_gpu:
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].general_gpu_time,
reverse=True)
else:
sorted_items = sorted(
self.operator_items.items(),
key=lambda x: x[1].cpu_time,
reverse=True)
results = []
for op_name, item in sorted_items:
if search_name in op_name:
results.append(op_name)
if group_by == 'op_name':
if self.has_gpu:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio',
'gpu_total_time', 'gpu_avg_time', 'gpu_max_time',
'gpu_min_time', 'gpu_ratio'
]
data['has_gpu'] = True
else:
data['column_name'] = [
'name', 'calls', 'cpu_total_time', 'cpu_avg_time',
'cpu_max_time', 'cpu_min_time', 'cpu_ratio'
]
data['has_gpu'] = False
for op_name in results:
event = self.operator_items[op_name]
if self.has_gpu:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"children":
children_events,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time, time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time
if total_gpu_time != 0 else 0.0)
})
else:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"children":
children_events,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time / total_cpu_time
if total_cpu_time != 0 else 0.0)
})
else:
if self.has_gpu:
data['column_name'] = [
'name', 'calls', 'input_shape', 'cpu_total_time',
'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
'cpu_ratio', 'gpu_total_time', 'gpu_avg_time',
'gpu_max_time', 'gpu_min_time', 'gpu_ratio'
]
data['has_gpu'] = True
else:
data['column_name'] = [
'name', 'calls', 'input_shape', 'cpu_total_time',
'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
'cpu_ratio'
]
data['has_gpu'] = False
for op_name in results:
for input_shape, event in self.operator_items_with_input_shape[
op_name].items():
if not input_shape:
shape_string = []
else:
shapes = input_shape.split('\t')[:-1]
shape_string = [
'{}:{}'.format(*shape.split('-'))
for shape in shapes
]
if self.has_gpu:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"children":
children_events,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time /
total_cpu_time if
total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time,
time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time if
total_gpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time /
total_cpu_time if
total_cpu_time != 0 else 0.0),
"gpu_total_time":
format_time(event.general_gpu_time,
time_unit),
"gpu_avg_time":
format_time(event.avg_general_gpu_time,
time_unit),
"gpu_max_time":
format_time(event.max_general_gpu_time,
time_unit),
"gpu_min_time":
format_time(event.min_general_gpu_time,
time_unit),
"gpu_ratio":
format_ratio(event.general_gpu_time /
total_gpu_time if
total_gpu_time != 0 else 0.0)
})
else:
children_events = get_children_data(event)
if children_events:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"children":
get_children_data(event),
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time /
total_cpu_time if
total_cpu_time != 0 else 0.0)
})
else:
data['events'].append({
"name":
op_name,
"calls":
event.call,
"input_shape":
shape_string,
"cpu_total_time":
format_time(event.cpu_time, time_unit),
"cpu_avg_time":
format_time(event.avg_cpu_time, time_unit),
"cpu_max_time":
format_time(event.max_cpu_time, time_unit),
"cpu_min_time":
format_time(event.min_cpu_time, time_unit),
"cpu_ratio":
format_ratio(event.cpu_time /
total_cpu_time if
total_cpu_time != 0 else 0.0)
})
return data
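# Illustrative shape of the table payload assembled above (hypothetical values):
# {'column_name': ['name', 'calls', 'cpu_total_time', ...],
#  'has_gpu': True,
#  'events': [{'name': 'conv2d', 'calls': 100,
#              'cpu_total_time': ..., 'cpu_ratio': ..., 'children': [...]}]}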
def get_kernel_pie(self, topk, time_unit='ms'):
data = OrderedDict()
data['column_name'] = [
"name", "calls", "total_time", "avg_time", "max_time", "min_time",
"mean blocks per sm", "mean est achieved occupancy",
"tensor core used", "ratio"
]
data['events'] = []
sorted_items = sorted(
self.kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
if topk <= 0:
items = sorted_items
else:
items = sorted_items[:topk]
total_gpu_time = 0.0
for kernel_name, item in items:
total_gpu_time += item.gpu_time
for kernel_name, item in items:
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = kernel_name
gpu_stage_data['calls'] = item.call
gpu_stage_data['total_time'] = format_time(item.gpu_time,
time_unit)
gpu_stage_data['avg_time'] = format_time(item.avg_gpu_time,
time_unit)
gpu_stage_data['max_time'] = format_time(item.max_gpu_time,
time_unit)
gpu_stage_data['min_time'] = format_time(item.min_gpu_time,
time_unit)
gpu_stage_data['mean blocks per sm'] = format_float(
item.sum_blocks_per_sm / item.call)
gpu_stage_data['mean est achieved occupancy'] = format_float(
item.sum_occupancy / item.call)
gpu_stage_data['tensor core used'] = item.tensorcore_used
gpu_stage_data['ratio'] = format_ratio(
item.gpu_time / total_gpu_time if total_gpu_time != 0 else 0.0)
data['events'].append(gpu_stage_data)
return data
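# Note: total_gpu_time here is summed over the selected topk items only, so
# each 'ratio' is a kernel's share of the displayed pie, not of all kernels.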
def get_kernel_table(self, group_by='', search_name=None, time_unit='ms'):
data = OrderedDict()
data['events'] = []
total_gpu_time = 0
for name, event in self.kernel_items.items():
total_gpu_time += event.gpu_time
if not search_name:
if group_by == 'kernel_name':
data['column_name'] = [
"name", "calls", "total_time", "avg_time", "max_time",
"min_time", "mean blocks per sm",
"mean est achieved occupancy", "tensor core used", "ratio"
]
sorted_items = sorted(
self.kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
for name, item in sorted_items:
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = name
gpu_stage_data['calls'] = item.call
gpu_stage_data['total_time'] = format_time(
item.gpu_time, time_unit)
gpu_stage_data['avg_time'] = format_time(
item.avg_gpu_time, time_unit)
gpu_stage_data['max_time'] = format_time(
item.max_gpu_time, time_unit)
gpu_stage_data['min_time'] = format_time(
item.min_gpu_time, time_unit)
gpu_stage_data['mean blocks per sm'] = format_float(
item.sum_blocks_per_sm / item.call)
gpu_stage_data[
'mean est achieved occupancy'] = format_float(
item.sum_occupancy / item.call)
gpu_stage_data['tensor core used'] = item.tensorcore_used
gpu_stage_data['ratio'] = format_ratio(
item.gpu_time / total_gpu_time if total_gpu_time != 0 else 0.0)
data['events'].append(gpu_stage_data)
else:
data['column_name'] = [
"name", "calls", "operator", "grid", "block",
"register per thread", "shared memory", "total_time",
"avg_time", "max_time", "min_time", "mean blocks per sm",
"mean est achieved occupancy", "tensor core used", "ratio"
]
new_arrange_data = {}
for name, items_with_attributes in self.kernel_items_with_op_name_attributes.items(
):
for attributes, item in items_with_attributes.items():
new_arrange_data[(name, attributes)] = item
sorted_items = sorted(
new_arrange_data.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
for (name, attributes), item in sorted_items:
operator, grid, block, register_per_thread, shared_memory = attributes.split(
'-')
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = name
gpu_stage_data['calls'] = item.call
gpu_stage_data['operator'] = operator
gpu_stage_data['grid'] = grid
gpu_stage_data['block'] = block
gpu_stage_data['register per thread'] = register_per_thread
gpu_stage_data['shared memory'] = shared_memory
gpu_stage_data['total_time'] = format_time(
item.gpu_time, time_unit)
gpu_stage_data['avg_time'] = format_time(
item.avg_gpu_time, time_unit)
gpu_stage_data['max_time'] = format_time(
item.max_gpu_time, time_unit)
gpu_stage_data['min_time'] = format_time(
item.min_gpu_time, time_unit)
gpu_stage_data['mean blocks per sm'] = format_float(
item.sum_blocks_per_sm / item.call)
gpu_stage_data[
'mean est achieved occupancy'] = format_float(
item.sum_occupancy / item.call)
gpu_stage_data['tensor core used'] = item.tensorcore_used
gpu_stage_data['ratio'] = format_ratio(
item.gpu_time / total_gpu_time if total_gpu_time != 0 else 0.0)
data['events'].append(gpu_stage_data)
else:
sorted_items = sorted(
self.kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
results = []
for kernel_name, item in sorted_items:
if search_name in kernel_name:
results.append(kernel_name)
if group_by == 'kernel_name':
data['column_name'] = [
"name", "calls", "total_time", "avg_time", "max_time",
"min_time", "mean blocks per sm",
"mean est achieved occupancy", "tensor core used", "ratio"
]
for kernel_name in results:
item = self.kernel_items[kernel_name]
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = kernel_name
gpu_stage_data['calls'] = item.call
gpu_stage_data['total_time'] = format_time(
item.gpu_time, time_unit)
gpu_stage_data['avg_time'] = format_time(
item.avg_gpu_time, time_unit)
gpu_stage_data['max_time'] = format_time(
item.max_gpu_time, time_unit)
gpu_stage_data['min_time'] = format_time(
item.min_gpu_time, time_unit)
gpu_stage_data['mean blocks per sm'] = format_float(
item.sum_blocks_per_sm / item.call)
gpu_stage_data[
'mean est achieved occupancy'] = format_float(
item.sum_occupancy / item.call)
gpu_stage_data['tensor core used'] = item.tensorcore_used
gpu_stage_data['ratio'] = format_ratio(
item.gpu_time / total_gpu_time if total_gpu_time != 0 else 0.0)
data['events'].append(gpu_stage_data)
else:
for kernel_name in results:
for attributes, item in self.kernel_items_with_op_name_attributes[
kernel_name].items():
operator, grid, block, register_per_thread, shared_memory = attributes.split(
'-')
gpu_stage_data = OrderedDict()
gpu_stage_data['name'] = kernel_name
gpu_stage_data['calls'] = item.call
gpu_stage_data['operator'] = operator
gpu_stage_data['grid'] = grid
gpu_stage_data['block'] = block
gpu_stage_data[
'register per thread'] = register_per_thread
gpu_stage_data['shared memory'] = shared_memory
gpu_stage_data['total_time'] = format_time(
item.gpu_time, time_unit)
gpu_stage_data['avg_time'] = format_time(
item.avg_gpu_time, time_unit)
gpu_stage_data['max_time'] = format_time(
item.max_gpu_time, time_unit)
gpu_stage_data['min_time'] = format_time(
item.min_gpu_time, time_unit)
gpu_stage_data['mean blocks per sm'] = format_float(
item.sum_blocks_per_sm / item.call)
gpu_stage_data[
'mean est achieved occupancy'] = format_float(
item.sum_occupancy / item.call)
gpu_stage_data[
'tensor core used'] = item.tensorcore_used
gpu_stage_data['ratio'] = format_ratio(
item.gpu_time / total_gpu_time if total_gpu_time != 0 else 0.0)
data['events'].append(gpu_stage_data)
return data
def get_kernel_tc_pie(self, topk, time_unit='ms'):
data = OrderedDict()
data['column_name'] = ["name", "calls", "ratio"]
data['events'] = []
sorted_items = sorted(
self.kernel_items.items(),
key=lambda x: x[1].gpu_time,
reverse=True)
if topk <= 0:
items = sorted_items
else:
items = sorted_items[:topk]
total_calls = 0.0
tensorcore_calls = 0.0
for kernel_name, item in items:
if item.tensorcore_used:
tensorcore_calls += item.call
total_calls += item.call
data['events'].append({
"name":
"Tensor core used",
"calls":
tensorcore_calls,
"ratio":
format_ratio(tensorcore_calls / total_calls)
})
data['events'].append({
"name":
"Tensor core unused",
"calls":
total_calls - tensorcore_calls,
"ratio":
format_ratio((total_calls - tensorcore_calls) / total_calls)
})
return data
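# Worked example (hypothetical): with kernel A (tensor core used, 30 calls)
# and kernel B (unused, 70 calls), the two slices get
# format_ratio(30 / 100) and format_ratio(70 / 100) respectively.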
def get_trace_data(self):
return self.trace_parser.content
def get_memory_devices(self):
data = []
for device in self.memory_curve.keys():
data.append({
"device":
device,
"min_size":
format_memory(self.size_ranges[device][0], 'KB'),
"max_size":
format_memory(self.size_ranges[device][1], 'KB'),
"max_allocation_size":
format_memory(self.peak_allocation_values[device], 'KB'),
})
return data
def get_memory_curve(self, device_type, time_unit='ms'):
curves = self.memory_curve[device_type]
data = {}
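# 'name' maps each metric key to the Chinese display label used by the
# frontend (Allocated / Reserved / PeakAllocated / PeakReserved).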
data['name'] = {
'Allocated': '已分配',
'Reserved': '已预留',
'PeakAllocated': '最大已分配',
'PeakReserved': '最大已预留'
}
for key, events in curves.items():
data[key] = []
sorted_events = sorted(events, key=lambda x: x[0])
for item in sorted_events:
timestamp = item[0]
size = item[1]
event_name = item[2]
data[key].append([
format_time(timestamp, time_unit),
format_memory(size, 'KB'), event_name
])
return data
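# Each curve point is [formatted timestamp, size in KB, triggering event name],
# sorted by raw timestamp, e.g. (hypothetical) [12.5, 1024.0, 'conv2d'].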
def get_memory_events(self,
device_type,
min_size=0,
max_size=float('inf'),
search_name=None,
time_unit='ms'):
data = {}
data['column_name'] = [
'MemoryAddr', 'MemoryType', 'AllocatedEvent', 'AllocatedTimestamp',
'FreeEvent', 'FreeTimestamp', 'Duration', 'Size'
]
data['data'] = []
paired_event_list = self.paired_events[device_type]
def filter_func(item):
nonlocal min_size
nonlocal max_size
nonlocal search_name
size = format_memory(item[-1], 'KB')
if not search_name:
if size >= min_size and size <= max_size:
return True
else:
if size >= min_size and size <= max_size:
if item[2]:
if search_name in item[2]:
return True
if item[4]:
if search_name in item[4]:
return True
return False
paired_event_list = filter(filter_func, paired_event_list)
paired_event_list = sorted(paired_event_list, key=lambda x: x[-1])
if not paired_event_list:
return data
duration = None
for item in paired_event_list:
if item[3] and item[5]:
duration = item[5] - item[3]
else:
duration = None
data['data'].append({
"MemoryAddr":
item[0],
"MemoryType":
item[1],
"AllocatedEvent":
item[2],
"AllocatedTimestamp":
format_time(item[3], time_unit) if item[3] else None,
"FreeEvent":
item[4],
"FreeTimestamp":
format_time(item[5], time_unit) if item[5] else None,
"Duration":
format_time(duration, time_unit)
if duration is not None else None,
"Size":
format_memory(item[6], 'KB')
})
return data
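# Note: sizes are converted to KB by format_memory before filtering, so the
# min_size/max_size thresholds are interpreted in KB as well.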
def get_op_memory_events(self, device_type, search_name=None):
data = {}
data['column_name'] = [
'EventName', 'MemoryType', 'AllocationCount', 'FreeCount',
'AllocationSize', 'FreeSize', 'IncreasedSize'
]
data['data'] = []
allocated_events = self.allocated_items[device_type]
def filter_func(item):
nonlocal search_name
if not search_name:
return True
else:
if search_name in item[0]:
return True
return False
reserved_events = self.reserved_items[device_type]
all_events = [(key, item) for key, item in allocated_events.items()]
all_events.extend(
[(key, item) for key, item in reserved_events.items()])
if search_name:
all_events = filter(filter_func, all_events)
sorted_items = sorted(
all_events, key=lambda x: x[1].increase_size, reverse=True)
if not sorted_items:
return data
for event_name, item in sorted_items:
data['data'].append({
'EventName':
event_name,
'MemoryType':
item.memory_type,
'AllocationCount':
item.allocation_count,
'FreeCount':
item.free_count,
'AllocationSize':
format_memory(item.allocation_size, 'KB'),
'FreeSize':
format_memory(item.free_size, 'KB'),
'IncreasedSize':
format_memory(item.increase_size, 'KB')
})
return data
class DistributedProfilerData:
'''
Hold data for the distributed view.
Aggregate the ProfilerData objects of all workers so they can be viewed together.
'''
def __init__(self, run, span, profile_datas):
self.run = run
self.span = span
self.profile_datas = profile_datas
def get_distributed_info(self):
data = []
for profile_data in self.profile_datas:
device_infos = profile_data.device_infos
gpu_id = int(next(iter(profile_data.gpu_ids)))
data.append({
'worker_name':
profile_data.worker_name,
'process_id':
'pid: {}'.format(profile_data.process_id),
'device_id':
'GPU{}'.format(gpu_id),
'name':
device_infos[gpu_id]['name'],
'memory':
"{} GB".format(
format_memory(device_infos[gpu_id]['totalGlobalMem'],
'GB')),
'computeCapability':
'{}.{}'.format(device_infos[gpu_id]['computeMajor'],
device_infos[gpu_id]['computeMinor']),
'utilization':
'{}%'.format(format_ratio(profile_data.gpu_ulitization))
})
return data
def get_distributed_histogram(self, step, time_unit='ms'):
data = {}
data['order'] = [
"ProfileStep", "Communication", "Computation", "Overlap", "Others"
]
data['worker_name'] = []
data['data'] = []
new_data = defaultdict(list)
for profile_data in self.profile_datas:
data['worker_name'].append(profile_data.worker_name)
if step != 'All':
new_data['ProfileStep'].append(
format_time(
profile_data.model_perspective_items['ProfileStep'].
cpu_times[step], time_unit))
else:
new_data['ProfileStep'].append(
format_time(
profile_data.model_perspective_items['ProfileStep'].
cpu_time, time_unit))
new_data['Communication'].append(
format_time(
profile_data.distributed_time[step]['communication_time'],
time_unit))
new_data['Computation'].append(
format_time(
profile_data.distributed_time[step]['computation_time'],
time_unit))
new_data['Overlap'].append(
format_time(
profile_data.distributed_time[step]['overlap_time'],
time_unit))
new_data['Others'].append(
format_time(profile_data.distributed_time[step]['others_time'],
time_unit))
for order in data['order']:
data['data'].append(new_data[order])
return data
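# data['data'] holds one row per metric in data['order']; each row lists that
# metric across workers in the same order as data['worker_name'], e.g.
# (hypothetical, two workers) 'Communication' -> [1.2, 1.5].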
def get_distributed_steps(self):
for profile_data in self.profile_datas:
steps = list(profile_data.distributed_time.keys())
final_steps = ['All'] + sorted(
[int(step) for step in steps if step != 'All'])
return final_steps
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import os
import re
from threading import Thread
from multiprocess import Process
from multiprocess import Queue
from .parser.const_description import * # noqa: F403
from .parser.event_node import load_profiler_json
from .run_manager import RunManager
from visualdl.io import bfile
_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")
def is_VDLProfiler_file(path):
"""Determine whether it is a paddle profile file that can be read by vdl according to the file name.
File name of a paddle profile file must contain `paddle_trace`.
Args:
path: File name to determine.
Returns:
True if the file is a paddle profile file, otherwise false.
"""
if "paddle_trace" not in path:
return False
return True
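# Hypothetical file names accepted by _name_pattern above:
#   'worker0_time_1587375595.paddle_trace.json' -> worker name 'worker0'
#   'node1.gpu0_time_2.paddle_trace.pb'         -> worker name 'node1.gpu0'
# e.g. _name_pattern.match('worker0_time_1587375595.paddle_trace.json').group(1)
# evaluates to 'worker0'.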
class ProfilerReader(object):
"""Profile reader to read paddle profile files, support for frontend api in lib.py.
"""
def __init__(self, logdir=''):
"""Instance of ProfileReader
Args:
logdir: The dir include paddle profile files, multiple subfolders allowed.
"""
if isinstance(logdir, str):
self.dir = [logdir]
else:
self.dir = logdir
self.walks = {}
self.displayname2runs = {}
self.runs2displayname = {}
self.run_managers = {}
self.profile_result_queue = Queue()
self.tempfile = None
self.runs()
Thread(target=self._get_data_from_queue, args=(), daemon=True).start()
@property
def logdir(self):
return self.dir
def get_all_walk(self):
flush_walks = {}
for dir in self.dir:
for root, dirs, files in bfile.walk(dir):
flush_walks.update({root: files})
return flush_walks
def get_run_manager(self, run):
if run in self.run_managers:
self.run_managers[run].join()
return self.run_managers[run]
else:
return None
def profile_runs(self, update=False):
"""Get profile run files.
Every dir (called a `run` in vdl) may contain more than one profiler file.
Returns:
walks: A dict like {"exp1": ["1587375595_paddle_trace.json", "1587375685_paddle_trace.json"],
"exp2": ["1587375686_paddle_trace.json"]}
"""
if not self.walks or update is True:
flush_walks = self.get_all_walk()
walks_temp = {}
for run, filenames in flush_walks.items():
tags_temp = [
filename for filename in filenames
if is_VDLProfiler_file(filename)
]
if len(tags_temp) > 0:
walks_temp.update({run: tags_temp})
self.walks = walks_temp
return self.walks
def runs(self, update=True):
self.profile_runs(update=update)
for run, filenames in self.walks.items():
if run not in self.run_managers:
self.run_managers[run] = RunManager(run)
self.run_managers[run].set_all_filenames(filenames)
for filename in filenames:
if self.run_managers[run].has_handled(filename):
continue
self._read_data(run, filename)
return list(self.walks.keys())
def get_descriptions(self, lang):
if lang == 'zh':
return {
"overview_environment": TOOLTIP_DEVICE_INFO_CN, # noqa: F405
"overview_model_perspective":
TOOLTIP_MODEL_PERSPECTIVE_CN, # noqa: F405
"overview_model_perspective_perstep":
TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN, # noqa: F405
"overview_event_type_perspective":
TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN, # noqa: F405
"overview_event_type_model_perspective":
TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN, # noqa: F405
"distributed_histogram":
TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN # noqa: F405
}
else:
return {
"overview_environment": TOOLTIP_DEVICE_INFO_EN, # noqa: F405
"overview_model_perspective":
TOOLTIP_MODEL_PERSPECTIVE_EN, # noqa: F405
"overview_model_perspective_perstep":
TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN, # noqa: F405
"overview_event_type_perspective":
TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN, # noqa: F405
"overview_event_type_model_perspective":
TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN, # noqa: F405
"distributed_histogram":
TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN # noqa: F405
}
def set_displayname(self, log_reader):
self.displayname2runs = log_reader.name2tags
self.runs2displayname = log_reader.tags2name
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def _get_data_from_queue(self):
while True:
try:
run, filename, worker_name, profile_result = self.profile_result_queue.get(
)
self.run_managers[run].add_profile_result(
filename, worker_name, profile_result)
except Exception as e:
print('Error reading profiler data in subprocess: {}'.format(e))
def _read_data(self, run, filename):
match = _name_pattern.match(filename)
if match:
worker_name = match.group(1)
if '.pb' in filename:
try:
from paddle.profiler import load_profiler_result
except Exception:
print(
'Failed to import paddle.profiler. Please make sure paddle >= 2.3.0 is installed.'
)
exit(0)
profile_result = load_profiler_result(
os.path.join(run, filename))
self.run_managers[run].add_profile_result(
filename, worker_name, profile_result)
else:
def _load_profiler_json(run, filename, worker_name):
profile_result = load_profiler_json(
os.path.join(run, filename))
self.profile_result_queue.put((run, filename, worker_name,
profile_result))
Process(
target=_load_profiler_json,
args=(run, filename, worker_name)).start()
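# A minimal usage sketch (hypothetical logdir):
#   reader = ProfilerReader('./log')
#   runs = reader.runs()                           # e.g. ['./log/exp1']
#   run_manager = reader.get_run_manager(runs[0])  # joins parsing work first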
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json
from .profiler_reader import ProfilerReader
from visualdl.server.api import gen_result
from visualdl.server.api import result
class ProfilerApi(object):
def __init__(self, logdir):
self._reader = ProfilerReader(logdir)
@result()
def runs(self):
return self._reader.runs()
@result()
def views(self, run):
run_manager = self._reader.get_run_manager(run)
if run_manager is None:
return []
return list(run_manager.get_views())
@result()
def workers(self, run, view):
if view == 'Distributed':
return ['All']
run_manager = self._reader.get_run_manager(run)
return run_manager.get_workers(view)
@result()
def spans(self, run, worker):
run_manager = self._reader.get_run_manager(run)
if worker == 'All':
return run_manager.get_distributed_spans()
return run_manager.get_spans(worker)
@result()
def timeunits(self):
return ['ns', 'us', 'ms', 's']
@result()
def descriptions(self, lang):
if lang == 'undefined' or lang is None:
lang = 'zh'
lang = lang.lower()
return self._reader.get_descriptions(lang)
@result()
def overview_environment(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
span = str(span)
profiler_data = run_manager.get_profiler_data(worker, span)
result = profiler_data.get_device_infos()
num_workers = len(run_manager.get_workers('Overview'))
result['num_workers'] = num_workers
return result
@result()
def model_perspective(self, run, worker, span, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_model_perspective(time_unit)
@result()
def model_perspective_perstep(self,
run,
worker,
span,
device_type,
time_unit='ms'):
device_type = device_type.lower()
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_model_perspective_perstep(
device_type, time_unit)
@result()
def event_type_perspective(self,
run,
worker,
span,
device_type,
time_unit='ms'):
device_type = device_type.lower()
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_event_type_perspective(device_type, time_unit)
@result()
def event_type_model_perspective(self, run, worker, span, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_event_type_model_perspective(time_unit)
@result()
def userdefined_perspective(self, run, worker, span, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_userdefined_perspective(time_unit)
@result()
def operator_pie(self, run, worker, span, topk, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
topk = int(topk)
return profiler_data.get_operator_pie(topk, time_unit)
@result()
def operator_pie_expand(self, run, worker, span, topk, device_type,
time_unit):
device_type = device_type.lower()
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
topk = int(topk)
return profiler_data.get_operator_pie_expand(topk, device_type,
time_unit)
@result()
def operator_table(self,
run,
worker,
span,
group_by,
search_name,
time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_operator_table(group_by, search_name,
time_unit)
@result()
def operator_stack_table(self,
run,
worker,
span,
op_name,
group_by,
input_shape,
time_unit='ms'):
pass
@result()
def kernel_pie(self, run, worker, span, topk, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
topk = int(topk)
return profiler_data.get_kernel_pie(topk, time_unit)
@result()
def kernel_table(self,
run,
worker,
span,
group_by,
search_name,
time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_kernel_table(group_by, search_name, time_unit)
@result()
def kernel_tc_pie(self, run, worker, span, topk, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
topk = int(topk)
return profiler_data.get_kernel_tc_pie(topk, time_unit)
@result()
def distributed_info(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
if distributed_profiler_data is None:
return
return distributed_profiler_data.get_distributed_info()
@result()
def distributed_steps(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
return distributed_profiler_data.get_distributed_steps()
@result()
def distributed_histogram(self, run, worker, span, step, time_unit='ms'):
run_manager = self._reader.get_run_manager(run)
distributed_profiler_data = run_manager.get_distributed_profiler_data(
span)
return distributed_profiler_data.get_distributed_histogram(
step, time_unit)
@result(headers={'content-encoding': 'gzip'})
def trace(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_trace_data()
@result()
def memory_devices(self, run, worker, span):
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_memory_devices()
@result(headers={'content-encoding': 'gzip'})
def memory_curve(self, run, worker, span, device_type, time_unit='ms'):
if device_type == 'undefined':
return
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_memory_curve(device_type, time_unit)
@result(headers={'content-encoding': 'gzip'})
def memory_events(self,
run,
worker,
span,
device_type,
min_size=0,
max_size=float('inf'),
search_name=None,
time_unit='ms'):
if device_type == 'undefined':
return
try:
min_size = float(min_size)
except Exception:
min_size = 0
try:
max_size = float(max_size)
except Exception:
max_size = float('inf')
if search_name == 'undefined' or not search_name:
search_name = None
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_memory_events(device_type, min_size, max_size,
search_name, time_unit)
@result(headers={'content-encoding': 'gzip'})
def op_memory_events(self,
run,
worker,
span,
device_type,
search_name=None):
if search_name == 'undefined' or not search_name:
search_name = None
if device_type == 'undefined':
return
run_manager = self._reader.get_run_manager(run)
profiler_data = run_manager.get_profiler_data(worker, span)
return profiler_data.get_op_memory_events(device_type, search_name)
@result()
def comparison_phase(self, base_run, base_worker, base_span, exp_run,
exp_worker, exp_span):
pass
@result()
def comparison_phase_diff(self, base_run, base_worker, base_span, exp_run,
exp_worker, exp_span):
pass
@result()
def comparison_phase_table(self, base_run, base_worker, base_span, exp_run,
exp_worker, exp_span):
pass
@result()
def comparison_phase_inner(self, base_run, base_worker, base_span, exp_run,
exp_worker, exp_span, phase_name):
pass
@result()
def comparison_phase_diff_inner(self, base_run, base_worker, base_span,
exp_run, exp_worker, exp_span, phase_name):
pass
@result()
def comparison_phase_table_inner(self, base_run, base_worker, base_span,
exp_run, exp_worker, exp_span,
phase_name):
pass
def create_profiler_api_call(logdir):
api = ProfilerApi(logdir)
routes = {
'runs': (api.runs, []),
'views': (api.views, ["run"]),
'workers': (api.workers, ["run", "view"]),
'spans': (api.spans, ["run", "worker"]),
'timeunits': (api.timeunits, []),
'descriptions': (api.descriptions, ["lang"]),
'overview/environment': (api.overview_environment,
["run", "worker", "span"]),
'overview/model_perspective': (api.model_perspective,
["run", "worker", "span", "time_unit"]),
'overview/model_perspective_perstep': (api.model_perspective_perstep, [
"run", "worker", "span", "device_type", "time_unit"
]),
'overview/event_type_perspective': (api.event_type_perspective, [
"run", "worker", "span", "device_type", "time_unit"
]),
'overview/event_type_model_perspective':
(api.event_type_model_perspective,
["run", "worker", "span", "time_unit"]),
'overview/userdefined_perspective':
(api.userdefined_perspective, ["run", "worker", "span", "time_unit"]),
'operator/pie': (api.operator_pie,
["run", "worker", "span", "topk", "time_unit"]),
'operator/pie_expand': (api.operator_pie_expand, [
"run", "worker", "span", "topk", "device_type", "time_unit"
]),
'operator/table': (api.operator_table, [
"run", "worker", "span", "group_by", "search_name", "time_unit"
]),
'operator/stack_table': (api.operator_stack_table, [
"run", "worker", "span", "op_name", "group_by", "input_shape",
"time_unit"
]),
'kernel/pie': (api.kernel_pie,
["run", "worker", "span", "topk", "time_unit"]),
'kernel/tensorcore_pie':
(api.kernel_tc_pie, ["run", "worker", "span", "topk", "time_unit"]),
'kernel/table': (api.kernel_table, [
"run", "worker", "span", "group_by", "search_name", "time_unit"
]),
'distributed/info': (api.distributed_info, ["run", "worker", "span"]),
'distributed/steps': (api.distributed_steps, ["run", "worker",
"span"]),
'distributed/histogram': (api.distributed_histogram, [
"run", "worker", "span", "step", "time_unit"
]),
'trace': (api.trace, ["run", "worker", "span"]),
'memory/devices': (api.memory_devices, ["run", "worker", "span"]),
'memory/curve': (api.memory_curve,
["run", "worker", "span", "device_type",
"time_unit"]),
'memory/memory_events': (api.memory_events, [
"run", "worker", "span", "device_type", "min_size", "max_size",
"search_name", "time_unit"
]),
'memory/op_memory_events': (api.op_memory_events, [
"run", "worker", "span", "device_type", "search_name"
]),
'comparison/phase': (api.comparison_phase, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span"
]),
'comparison/phase_diff': (api.comparison_phase_diff, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span"
]),
'comparison/phase_table': (api.comparison_phase_table, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span"
]),
'comparison/phase_inner': (api.comparison_phase_inner, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span", "phase_name"
]),
'comparison/phase_diff_inner': (api.comparison_phase_diff_inner, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span", "phase_name"
]),
'comparison/phase_table_inner': (api.comparison_phase_table_inner, [
"base_run", "base_worker", "base_span", "exp_run", "exp_worker",
"exp_span", "phase_name"
])
}
def call(path: str, args):
route = routes.get(path)
if not route:
return json.dumps(gen_result(
status=1, msg='api not found')), 'application/json', None
method, call_arg_names = route
call_args = [args.get(name) for name in call_arg_names]
return method(*call_args)
return call
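# Dispatch sketch (hypothetical requests): given the routes above,
#   call('runs', {}) invokes api.runs() with no arguments, and
#   call('views', {'run': 'exp1'}) invokes api.views('exp1');
# an unknown path returns a JSON error payload with status=1.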
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict
from threading import Thread
from .profiler_data import DistributedProfilerData
from .profiler_data import ProfilerData
class RunManager:
'''
Manage profile data for one run; a run may have multiple workers and spans,
so profile data is managed per (worker, span) unit.
Besides, a special worker "all" is created to merge all profile data for the distributed view.
'''
def __init__(self, run):
self.run = run
# worker:
# span:
# ProfileData
self.profiler_data = defaultdict(dict)
self.all_filenames = set()
self.handled_filenames = set()
# span:
# DistributedProfileData
self.distributed_data = {}
self.threads = {}
self.has_join = False
def get_profiler_data(self, worker, span):
if worker in self.profiler_data:
if span in self.profiler_data[worker]:
return self.profiler_data[worker][span]
def get_distributed_profiler_data(self, span):
if span in self.distributed_data:
return self.distributed_data[span]
def get_views(self):
'''
Return all views supported in current run data.
'''
all_views = set()
for worker, span_data in self.profiler_data.items():
for span, profiler_data in span_data.items():
all_views.update(profiler_data.get_views())
ordered_views = [
'Overview', 'Operator', 'GPU Kernel', 'Distributed', 'Trace',
'Memory'
]
final_views = []
for view in ordered_views:
if view in all_views:
final_views.append(view)
return final_views
def get_workers(self, view_name):
'''
Return all workers(processes) in current run data.
'''
workers = []
for worker, span_data in self.profiler_data.items():
for span, profiler_data in span_data.items():
if view_name in profiler_data.get_views():
workers.append(worker)
break
return workers
def get_spans(self, worker_name):
'''
Return all spans in current run data.
spans: Profile data collection during training can be split into several ranges, as supported by the\
paddle.profiler api; for example, you may profile steps 2-4 and 6-8. Each such range is called a\
span here, and spans are indexed in order.
'''
spans = list(self.profiler_data[worker_name].keys())
spans = sorted([int(span) for span in spans])
spans = [str(span) for span in spans]
return spans
def get_distributed_spans(self):
spans = list(self.distributed_data.keys())
spans = sorted([int(span) for span in spans])
spans = [str(span) for span in spans]
return spans
def _parse_file(self, worker_name, result):
span = result.get_span_idx()
self.profiler_data[worker_name][span] = ProfilerData(
self.run, worker_name, span, result)
return
def join(self):
if self.has_join:
return
for thread in self.threads.values():
thread.join()
self.has_join = True
distributed_profiler_data = defaultdict(list)
for worker_name, span_data in self.profiler_data.items():
for span_idx, profiler_data in span_data.items():
distributed_profiler_data[span_idx].append(profiler_data)
for span_idx, profiler_datas in distributed_profiler_data.items():
self.distributed_data[span_idx] = DistributedProfilerData(
self.run, span_idx, profiler_datas)
def add_profile_result(self, filename, worker_name, profile_result):
thread = Thread(
target=self._parse_file, args=(worker_name, profile_result))
thread.start()
self.handled_filenames.add(filename)
self.threads[filename] = thread
def set_all_filenames(self, filenames):
self.all_filenames.update(filenames)
def has_handled(self, filename):
if filename in self.handled_filenames:
return True
else:
return False
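# Typical lifecycle sketch (hypothetical values):
#   rm = RunManager('exp1')
#   rm.set_all_filenames(['worker0_time_1.paddle_trace.json'])
#   rm.add_profile_result('worker0_time_1.paddle_trace.json', 'worker0', result)
#   rm.join()                    # wait for parsing, then build distributed data
#   data = rm.get_profiler_data('worker0', '1')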
...@@ -14,8 +14,10 @@
 # limitations under the License.
 # =======================================================================
 import functools
+import gzip
 import json
 import os
+from io import BytesIO

 from flask import request
...@@ -52,6 +54,13 @@ def result(mimetype='application/json', headers=None):
             headers_output = headers(self)
         else:
             headers_output = headers
+        if headers is not None:
+            if 'content-encoding' in headers:
+                buf = BytesIO()
+                with gzip.GzipFile(mode='wb', fileobj=buf) as fp:
+                    gzip_value = data.encode()
+                    fp.write(gzip_value)
+                data = buf.getvalue()
         return data, mimetype, headers_output
     return wrapper
......
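A minimal standalone sketch of the gzip path added above, assuming `data` is the
JSON string produced by the decorated handler (`gzip_payload` is a hypothetical
helper name):

import gzip
from io import BytesIO

def gzip_payload(data: str) -> bytes:
    # Mirror the decorator: compress the serialized response body in memory.
    buf = BytesIO()
    with gzip.GzipFile(mode='wb', fileobj=buf) as fp:
        fp.write(data.encode())
    return buf.getvalue()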
...@@ -32,6 +32,7 @@ from flask_babel import Babel
 import visualdl.server
 from visualdl import __version__
+from visualdl.component.profiler.profiler_server import create_profiler_api_call
 from visualdl.server.api import create_api_call
 from visualdl.server.args import parse_args
 from visualdl.server.args import ParseArgs
...@@ -52,7 +53,7 @@ mock_data_path = os.path.join(SERVER_DIR, "./mock_data/")
 check_live_path = '/alive'

-def create_app(args):
+def create_app(args):  # noqa: C901
     # disable warning from flask
     cli = sys.modules['flask.cli']
     cli.show_server_banner = lambda *x: None
...@@ -66,7 +67,7 @@ def create_app(args):
     app.config['BABEL_DEFAULT_LOCALE'] = default_language
     babel = Babel(app)
     api_call = create_api_call(args.logdir, args.model, args.cache_timeout)
+    profiler_api_call = create_profiler_api_call(args.logdir)

     if args.telemetry:
         update_util.PbUpdater(args.product).start()
...@@ -134,6 +135,12 @@ def create_app(args):
         return make_response(
             Response(data, mimetype=mimetype, headers=headers))

+    @app.route(api_path + '/profiler/<path:method>', methods=["GET", "POST"])
+    def serve_profiler_api(method):
+        data, mimetype, headers = profiler_api_call(method, request.args)
+        return make_response(
+            Response(data, mimetype=mimetype, headers=headers))

     @app.route(check_live_path)
     def check_live():
         return '', 204
......
...@@ -13,4 +13,4 @@
 # limitations under the License.
 # =======================================================================

-vdl_version = '2.3.0'
+vdl_version = '2.4.0'