Commit d251028d (unverified)
Add profiler backend

Authored by chenjian on Aug 23, 2022; committed via GitHub on Aug 23, 2022.
Parent: 8b1815f3
Showing 19 changed files with 4848 additions and 54 deletions (+4848 −54).
visualdl/component/base_component.py                        +55    −51
visualdl/component/profiler/__init__.py                     +14    −0
visualdl/component/profiler/parser/__init__.py              +14    −0
visualdl/component/profiler/parser/const_description.py     +159   −0
visualdl/component/profiler/parser/distributed_parser.py    +163   −0
visualdl/component/profiler/parser/event_node.py            +434   −0
visualdl/component/profiler/parser/kernel_parser.py         +175   −0
visualdl/component/profiler/parser/memory_parser.py         +180   −0
visualdl/component/profiler/parser/operator_parser.py       +133   −0
visualdl/component/profiler/parser/overview_parser.py       +456   −0
visualdl/component/profiler/parser/trace_parser.py          +20    −0
visualdl/component/profiler/parser/utils.py                 +494   −0
visualdl/component/profiler/profiler_data.py                +1802  −0
visualdl/component/profiler/profiler_reader.py              +199   −0
visualdl/component/profiler/profiler_server.py              +397   −0
visualdl/component/profiler/run_manager.py                  +134   −0
visualdl/server/api.py                                      +9     −0
visualdl/server/app.py                                      +9     −2
visualdl/version.py                                         +1     −1
visualdl/component/base_component.py
@@ -12,15 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import numpy as np
from PIL import Image
from visualdl.proto.record_pb2 import Record


def scalar(tag, value, step, walltime=None):
    """Package data to one scalar.

    Args:
        tag (string): Data identifier
        value (float): Value of scalar
@@ -52,8 +51,7 @@ def meta_data(tag='meta_data_tag', display_name="", step=0, walltime=None):
    """
    meta = Record.MetaData(display_name=display_name)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, meta_data=meta)
    ])
@@ -82,8 +80,8 @@ def imgarray2bytes(np_array):


def make_grid(I, ncols=8):  # noqa: E741
    assert isinstance(
        I, np.ndarray), 'plugin error, should pass numpy array here'
    if I.shape[1] == 1:
        I = np.concatenate([I, I, I], 1)  # noqa: E741
    assert I.ndim == 4 and I.shape[1] == 3 or I.shape[1] == 4
@@ -113,9 +111,11 @@ def convert_to_HWC(tensor, input_format):
    Return:
        Image of format `HWC`.
    """
    assert (len(set(input_format)) == len(input_format)
            ), "You can not use the same dimension shordhand twice. \
                input_format: {}".format(input_format)
    assert (len(tensor.shape) == len(input_format)
            ), "size of input tensor and input format are different. \
                tensor shape: {}, input_format: {}".format(
                    tensor.shape, input_format)
    input_format = input_format.upper()
@@ -129,7 +129,8 @@ def convert_to_HWC(tensor, input_format):
        index = [input_format.find(c) for c in 'HWC']
        tensor_HWC = tensor.transpose(index)
        if tensor_HWC.shape[2] == 1:
            tensor_HWC = np.concatenate(
                [tensor_HWC, tensor_HWC, tensor_HWC], 2)
        return tensor_HWC

    if len(input_format) == 2:
@@ -202,7 +203,8 @@ def embedding(tag, labels, hot_vectors, step, labels_meta=None, walltime=None):
    for label, hot_vector in zip(labels, hot_vectors):
        if not isinstance(label, list):
            label = [label]
        embeddings.embeddings.append(
            Record.Embedding(label=label, vectors=hot_vector))
    return Record(values=[
        Record.Value(
@@ -325,7 +327,8 @@ def hparam(name, hparam_dict, metric_list, walltime):
            hparamInfo.string_value = v
            hm.hparamInfos.append(hparamInfo)
        else:
            print("The value of %s must be int, float or str, not %s" %
                  (k, str(type(v))))
    for metric in metric_list:
        metricInfo = Record.HParam.HparamInfo()
        metricInfo.name = metric
@@ -333,8 +336,7 @@ def hparam(name, hparam_dict, metric_list, walltime):
        hm.metricInfos.append(metricInfo)

    return Record(values=[
        Record.Value(id=1, tag="hparam", timestamp=walltime, hparam=hm)
    ])
@@ -389,7 +391,12 @@ def compute_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def pr_curve(tag,
             labels,
             predictions,
             step,
             walltime,
             num_thresholds=127,
             weights=None):
    """Package data to one pr_curve.
@@ -409,7 +416,8 @@ def pr_curve(tag, labels, predictions, step, walltime, num_thresholds=127,
    num_thresholds = min(num_thresholds, 127)
    prcurve_map = compute_curve(labels, predictions, num_thresholds, weights)
    return pr_curve_raw(
        tag=tag,
        tp=prcurve_map['tp'],
        fp=prcurve_map['fp'],
        tn=prcurve_map['tn'],
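A usage sketch, not part of this diff: pr_curve packages raw labels and scores into a protobuf Record. The tag and values below are illustrative.

import time
# Illustrative call: package a PR curve for step 10 with 33 thresholds.
record = pr_curve(tag='train/pr', labels=[0, 1, 1, 0],
                  predictions=[0.1, 0.9, 0.4, 0.3],
                  step=10, walltime=int(time.time() * 1000), num_thresholds=33)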
@@ -441,7 +449,6 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    Return:
        Package with format of record_pb2.Record

    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -456,15 +463,10 @@ def pr_curve_raw(tag, tp, fp, tn, fn, precision, recall, step, walltime):
    if isinstance(recall, np.ndarray):
        recall = recall.astype(int).tolist()
    """
    prcurve = Record.PRCurve(
        TP=tp, FP=fp, TN=tn, FN=fn, precision=precision, recall=recall)
    return Record(values=[
        Record.Value(id=step, tag=tag, timestamp=walltime, pr_curve=prcurve)
    ])
@@ -518,7 +520,13 @@ def compute_roc_curve(labels, predictions, num_thresholds=None, weights=None):
    return data


def roc_curve(tag,
              labels,
              predictions,
              step,
              walltime,
              num_thresholds=127,
              weights=None):
    """Package data to one roc_curve.

    Args:
        tag (string): Data identifier
@@ -533,9 +541,11 @@ def roc_curve(tag, labels, predictions, step, walltime, num_thresholds=127, weights=None):
        Package with format of record_pb2.Record
    """
    num_thresholds = min(num_thresholds, 127)
    roc_curve_map = compute_roc_curve(labels, predictions, num_thresholds,
                                      weights)
    return roc_curve_raw(
        tag=tag,
        tp=roc_curve_map['tp'],
        fp=roc_curve_map['fp'],
        tn=roc_curve_map['tn'],
@@ -563,7 +573,6 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    Return:
        Package with format of record_pb2.Record

    if isinstance(tp, np.ndarray):
        tp = tp.astype(int).tolist()
@@ -578,12 +587,7 @@ def roc_curve_raw(tag, tp, fp, tn, fn, tpr, fpr, step, walltime):
    if isinstance(fpr, np.ndarray):
        fpr = fpr.astype(int).tolist()
    """
    roc_curve = Record.ROC_Curve(
        TP=tp, FP=fp, TN=tn, FN=fn, tpr=tpr, fpr=fpr)
    return Record(values=[
        Record.Value(
            id=step, tag=tag, timestamp=walltime, roc_curve=roc_curve)
visualdl/component/profiler/__init__.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
visualdl/component/profiler/parser/__init__.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
visualdl/component/profiler/parser/const_description.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
__ALL__ = [
    'TOOLTIP_DEVICE_INFO_CN', 'TOOLTIP_MODEL_PERSPECTIVE_CN',
    'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN', 'TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN',
    'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN', 'TOOLTIP_DEVICE_INFO_EN',
    'TOOLTIP_MODEL_PERSPECTIVE_EN', 'TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN',
    'TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN', 'TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN'
]

TOOLTIP_DEVICE_INFO_CN = \
    '<b class="bold">CPU进程利用率:</b><br>' \
    '进程所利用到的CPU的时间 / ProfileStep的时间(即性能分析的时间跨度)<br>' \
    '<b class="bold">CPU系统利用率:</b><br>' \
    '整个系统所有进程利用到的CPU时间 / CPU总时间(ProfileStep的时间*CPU核心数)<br>' \
    '<b class="bold">GPU利用率:</b><br>' \
    '进程利用GPU计算的时间 / ProfileStep的时间,进程利用GPU计算的时间即是GPU Kernel计算的时间,越高越好<br>' \
    '<b class="bold">流处理器效率:</b><br>' \
    '对于流处理器处理某个GPU Kernel, 其效率为SM_Eff_i = min(Kernel所用的Blocks数量 / GPU的流处理器数量, 100%)。' \
    '流处理器效率为SM_Eff_i关于每个Kernel的执行时间加权和 / ProfileStep的时间<br>' \
    '<b class="bold">流处理器占用率:</b><br>' \
    '对于流处理器处理某个GPU Kernel, 其占用率Occu_i = 为活跃的warp数 / 能支持的最大warp数。流处理器占用率为Occu_i关于每个Kernel执行时间的加权平均<br>' \
    '<b class="bold">Tensor cores使用时间占比:</b><br>' \
    '使用Tensor Cores的GPU Kernel的计算时间 / 所有Kernel的计算时间<br>'

TOOLTIP_MODEL_PERSPECTIVE_CN = \
    '展示模型各阶段DataLoader, Forward, Backward, Optimization以及Other的总CPU和GPU时间。<br>' \
    'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>' \
    '<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>' \
    '<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>' \
    '<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>' \
    '<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>' \
    '<b class="bold">Other:</b> 其它时间<br>'

TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN = \
    '展示每一个ProfileStep内模型各阶段DataLoader, Forward, Backward, Optimization以及Other的CPU和GPU时间。<br>' \
    'CPU时间即是各阶段代码执行的时间,GPU时间是各阶段所调用的GPU Kernel在GPU上的计算时间。<br>' \
    '<b class="bold">DataLoader:</b> 表示使用paddle.io.DataLoader从数据集中取数据的阶段<br>' \
    '<b class="bold">Forward:</b> 表示模型前向计算的阶段<br>' \
    '<b class="bold">Backward:</b> 表示模型反向梯度计算的阶段<br>' \
    '<b class="bold">Optimization:</b> 表示模型优化更新参数的阶段<br>' \
    '<b class="bold">Other:</b> 其它时间<br>'

TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN = \
    '展示不同类型的事件在模型各阶段DataLoader, Forward, Backward, Optimization以及Other的分布。<br>' \
    '<b class="bold">Operator:</b> 表示框架内的算子执行<br>' \
    '<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>' \
    '<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>' \
    '<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>' \
    '<b class="bold">Memset:</b> 表示GPU的显存值设置<br>' \
    '<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>' \
    '<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>' \
    '<b class="bold">Communication:</b> 表示分布式通信有关的事件<br>'

TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN = \
    '展示在模型各阶段DataLoader, Forward, Backward, Optimization以及Other所包含的各种事件的时间。<br>' \
    '<b class="bold">Operator:</b> 表示框架内的算子执行<br>' \
    '<b class="bold">CudaRuntime:</b> 表示cuda runtime的函数执行<br>' \
    '<b class="bold">Kernel:</b> 表示GPU上计算的Kernel函数执行<br>' \
    '<b class="bold">Memcpy:</b> 表示CPU和GPU之间的数据传输<br>' \
    '<b class="bold">Memset:</b> 表示GPU的显存值设置<br>' \
    '<b class="bold">UserDefined:</b> 表示用户在python脚本中自定义的事件<br>' \
    '<b class="bold">OperatorInner:</b> 表示框架内算子的执行子过程<br>' \
    '<b class="bold">Communication:</b> 表示分布式通信有关的数据通信和计算事件<br>'

TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN = \
    '展示模型在每个迭代过程中通信、计算以及两者重叠部分的时间。<br>' \
    '<b class="bold">ProfileStep:</b> 表示某一步迭代的总时间<br>' \
    '<b class="bold">Communication:</b> 表示和通信相关的时间,包括框架内打的Communication事件、和通信有关的算子和Kernel(nccl)执行的时间<br>' \
    '<b class="bold">Computation:</b> 表示GPU Kernel计算的时间,但是去除了和通信有关的Kernel(nccl)<br>' \
    '<b class="bold">Overlap:</b> 表示通信和计算过程并行执行时候时间相互重叠的部分<br>' \
    '<b class="bold">Others:</b> 表示通信和计算之外的时间<br>'

TOOLTIP_DEVICE_INFO_EN = \
    '<b class="bold">CPU Process Utilization:</b><br>' \
    'Process CPU time / ProfileStep time(total time of profiling)<br>' \
    '<b class="bold">CPU System Utilization:</b><br>' \
    'Sum of system\'s all processes CPU time/ CPU total time(ProfileStep time* #CPU Core)<br>' \
    '<b class="bold">GPU Utilization:</b><br>' \
    'GPU busy time / ProfileStep time,GPU busy time is the time during in which at least one GPU kernel is running on it.<br>' \
    '<b class="bold">Est. SM Efficiency:</b><br>' \
    'The SM efficiency for one kernel can be denoted as SM_Eff_i = min(blocks of this kernel / SM number of this GPU, 100%).' \
    'Est. SM efficiency of GPU is the weighted sum of SM_Eff_i across all kernels / ProfileStep time<br>' \
    '<b class="bold">Est. Achieved Occupancy:</b><br>' \
    'The SM occupancy for one kernel can be denoted as Occu_i = active warps on an SM / maximum number of active warps supported by the SM. ' \
    'Est. SM occupancy of GPU is the weighted average of Occu_i across all kernels<br>' \
    '<b class="bold">Tensor cores ratio:</b><br>' \
    'Sum of kernel time using Tensor Cores / Sum of total kernel time<br>'

TOOLTIP_MODEL_PERSPECTIVE_EN = \
    'Present CPU and GPU time for each stage of a model, i.e. DataLoader, Forward, Backward, Optimization and Other.<br>' \
    'CPU time is the execution time for code,GPU time is the calculation time of kernels launched in the stage.<br>' \
    '<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>' \
    '<b class="bold">Forward:</b> denote model forward<br>' \
    '<b class="bold">Backward:</b> denote gradient back-propagate<br>' \
    '<b class="bold">Optimization:</b> denote parameters update<br>' \
    '<b class="bold">Other:</b> other time out of above range'

TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN = \
    'Present CPU and GPU time in each ProfileStep for each stage of a model, i.e. DataLoader, Forward, Backward, Optimization and Other.<br>' \
    'CPU time is the execution time for code,GPU time is the calculation time of kernels launched in the stage.<br>' \
    '<b class="bold">DataLoader:</b> denote data fetching using paddle.io.DataLoader<br>' \
    '<b class="bold">Forward:</b> denote model forward<br>' \
    '<b class="bold">Backward:</b> denote gradient back-propagate<br>' \
    '<b class="bold">Optimization:</b> denote parameters update<br>' \
    '<b class="bold">Other:</b> other time out of above range'

TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN = \
    'Present the distribution of each kind of events across DataLoader, Forward, Backward, Optimization and Other stage.<br>' \
    '<b class="bold">Operator:</b> denote operator execution<br>' \
    '<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>' \
    '<b class="bold">Kernel:</b> denote kernel execution on GPU<br>' \
    '<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>' \
    '<b class="bold">Memset:</b> denote memory data set on GPU<br>' \
    '<b class="bold">UserDefined:</b> denote events defined by users in python script<br>' \
    '<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>' \
    '<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'

TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN = \
    'Present the time of each kind of events included in DataLoader, Forward, Backward, Optimization and Other stage.<br>' \
    '<b class="bold">Operator:</b> denote operator execution<br>' \
    '<b class="bold">CudaRuntime:</b> denote cuda runtime function execution<br>' \
    '<b class="bold">Kernel:</b> denote kernel execution on GPU<br>' \
    '<b class="bold">Memcpy:</b> denote data transfer between CPU and GPU<br>' \
    '<b class="bold">Memset:</b> denote memory data set on GPU<br>' \
    '<b class="bold">UserDefined:</b> denote events defined by users in python script<br>' \
    '<b class="bold">OperatorInner:</b> denote operator\'s subprocess execution<br>' \
    '<b class="bold">Communication:</b> denote events associated with distributed data transfer and computation.<br>'

TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN = \
    'Present the time of communication, computation and their overlap in program.<br>' \
    '<b class="bold">ProfileStep:</b> denote an iteration step of training process<br>' \
    '<b class="bold">Communication:</b> denote the time related to communication, including events of communication type in paddle framework、communication-related operators and GPU Kernels(nccl)<br>' \
    '<b class="bold">Computation:</b> denote the computation time of GPU Kernels,except communication-related Kernels(nccl)<br>' \
    '<b class="bold">Overlap:</b> denote the overlap time between Communication and Computation when they are executed parallelly.<br>' \
    '<b class="bold">Others:</b> denote the time out of Communication and Computation<br>'
visualdl/component/profiler/parser/distributed_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict

from .utils import get_device_nodes
from .utils import intersection_ranges
from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree

_CommunicationOpName = ['allreduce', 'broadcast', 'rpc']


class DistributedParser:
    r"""
    Analysis communication and computation time range, and their overlap.
    The computation time is all kernel except kernels for communication like nccl.
    """

    def __init__(self):
        self.steps_data = defaultdict(lambda: defaultdict(list))
        self.calls = defaultdict(lambda: defaultdict(int))
        self.steps_time = defaultdict(lambda: defaultdict(float))
        self.profile_steps_time = {}

    def parse(self, nodetrees):
        '''
        Collect all communication and computation time ranges.
        '''
        total_time = 0.0
        nodetrees = rebuild_node_trees(nodetrees)
        thread2hostnodes = traverse_tree(nodetrees)
        thread_count = 0
        for threadid, hostnodes in thread2hostnodes.items():
            for hostnode in hostnodes[1:]:  # skip root node
                # case 1: TracerEventType is Communication
                if hostnode.type == 'ProfileStep':
                    if thread_count == 0:
                        total_time += (hostnode.end_ns - hostnode.start_ns)
                    self._parse_step(hostnode)
                    continue
            thread_count += 1
        new_steps_data = defaultdict(lambda: defaultdict(list))
        self.profile_steps_time['All'] = total_time
        for step, step_data in self.steps_data.items():
            self.calls[step]['cpu_communication_range'] = len(
                step_data['cpu_communication_range'])
            self.calls[step]['gpu_communication_range'] = len(
                step_data['gpu_communication_range'])
            new_steps_data[step]['cpu_communication_range'] = merge_self_ranges(
                step_data['cpu_communication_range'], is_sorted=False)
            new_steps_data[step]['gpu_communication_range'] = merge_self_ranges(
                step_data['gpu_communication_range'], is_sorted=False)
            new_steps_data[step]['communication_range'] = merge_ranges(
                new_steps_data[step]['cpu_communication_range'],
                new_steps_data[step]['gpu_communication_range'],
                is_sorted=True)
            new_steps_data[step]['computation_range'] = merge_self_ranges(
                step_data['computation_range'], is_sorted=False)
            new_steps_data[step]['overlap_range'] = intersection_ranges(
                new_steps_data[step]['communication_range'],
                new_steps_data[step]['computation_range'],
                is_sorted=True)
            self.steps_time[step]['communication_time'] = sum_ranges(
                new_steps_data[step]['communication_range'])
            self.steps_time[step]['computation_time'] = sum_ranges(
                new_steps_data[step]['computation_range'])
            self.steps_time[step]['overlap_time'] = sum_ranges(
                new_steps_data[step]['overlap_range'])
            self.steps_time[step]['others_time'] = \
                self.profile_steps_time[step] - \
                self.steps_time[step]['communication_time'] - \
                self.steps_time[step]['computation_time'] + \
                self.steps_time[step]['overlap_time']
        self.steps_data = new_steps_data

    def _parse_step(self, profile_step_node):
        step = profile_step_node.name.split('#')[1]
        self.profile_steps_time[step] = \
            profile_step_node.end_ns - profile_step_node.start_ns
        nodes = []
        stack = []
        stack.append(profile_step_node)
        while stack:
            current_node = stack.pop()
            nodes.append(current_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
        for hostnode in nodes:
            if hostnode.type == 'Communication':
                self.steps_data[step]['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                self.steps_data['All']['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                device_nodes = get_device_nodes(hostnode)
                for device_node in device_nodes:
                    if device_node.type == 'Kernel':
                        self.steps_data[step]['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
                        self.steps_data['All']['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
            # case 2: TracerEventType is Operator but is communication op
            elif hostnode.type == 'Operator' and any([
                    name in hostnode.name.lower()
                    for name in _CommunicationOpName
            ]):
                self.steps_data[step]['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                self.steps_data['All']['cpu_communication_range'].append(
                    (hostnode.start_ns, hostnode.end_ns))
                device_nodes = get_device_nodes(hostnode)
                for device_node in device_nodes:
                    if device_node.type == 'Kernel':
                        self.steps_data[step]['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
                        self.steps_data['All']['gpu_communication_range'].append(
                            (device_node.start_ns, device_node.end_ns))
            # case 3: Others, filter kernels named with nccl
            else:
                for runtimenode in hostnode.runtime_node:
                    for devicenode in runtimenode.device_node:
                        if devicenode.type == 'Kernel':
                            if 'nccl' in devicenode.name.lower():
                                self.steps_data[step]['gpu_communication_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                                self.steps_data['All']['gpu_communication_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                            else:
                                self.steps_data[step]['computation_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
                                self.steps_data['All']['computation_range'].append(
                                    (devicenode.start_ns, devicenode.end_ns))
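DistributedParser leans on range helpers (merge_self_ranges, intersection_ranges, sum_ranges) from the utils module added elsewhere in this commit. A toy illustration of the overlap accounting, assuming half-open (start_ns, end_ns) ranges that are already merged and sorted; the intersect helper below is a stand-in, not the commit's implementation:

def intersect(a, b):
    # Intersection of two sorted, merged range lists.
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        lo, hi = max(a[i][0], b[j][0]), min(a[i][1], b[j][1])
        if lo < hi:
            out.append((lo, hi))
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out

comm = [(0, 40), (90, 120)]                 # merged communication ranges (ns)
comp = [(30, 100)]                          # merged computation ranges (ns)
overlap = intersect(comm, comp)             # [(30, 40), (90, 100)]
comm_t = sum(e - s for s, e in comm)        # 70
comp_t = sum(e - s for s, e in comp)        # 70
over_t = sum(e - s for s, e in overlap)     # 20
others_t = 120 - comm_t - comp_t + over_t   # same formula as others_time above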
visualdl/component/profiler/parser/event_node.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections
import functools
import json
import re
import sys

_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')

host_node_type_map = {
    "Operator", "Dataloader", "ProfileStep", "CudaRuntime", "UserDefined",
    "OperatorInner", "Forward", "Backward", "Optimization", "Communication",
    "PythonOp", "PythonUserDefined"
}
device_node_type_map = {"Kernel", "Memcpy", "Memset"}
memory_node_event_map = {"Allocate", "Free", "ReservedAllocate", "ReservedFree"}


class HostNode:
    def __init__(self):
        self.name = None
        self.type = None
        self.start_ns = 0
        self.end_ns = 0
        self.process_id = 0
        self.thread_id = 0
        self.correlation_id = -1
        self.input_shapes = {}
        self.dtypes = {}
        self.callstack = ""
        self.children_node = []
        self.runtime_node = []
        self.device_node = []
        self.mem_node = []

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.name = json_obj['name'].replace(
            _show_name_pattern.match(json_obj['name']).group(2), "")
        self.type = json_obj['cat']
        self.start_ns = int(
            float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
        self.end_ns = int(
            float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
        self.process_id = json_obj['pid']
        self.thread_id = json_obj['tid'].replace(
            _show_tid_pattern.match(json_obj['tid']).group(1), "")
        self.correlation_id = json_obj['args'][
            'correlation id'] if 'correlation id' in json_obj['args'] else -1
        self.input_shapes = json_obj['args'][
            'input_shapes'] if 'input_shapes' in json_obj['args'] else {}
        self.dtypes = json_obj['args'][
            'input_dtypes'] if 'input_dtypes' in json_obj['args'] else {}
        self.callstack = json_obj['args'][
            'callstack'] if 'callstack' in json_obj['args'] else ""
        self.children_node = []
        self.runtime_node = []
        self.device_node = []
        self.mem_node = []
        return self
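The two regular expressions above strip display decorations from trace strings: _show_name_pattern removes a trailing bracketed block from event names, and _show_tid_pattern removes a parenthesized suffix from thread ids. A quick illustration with made-up strings:

import re

_show_name_pattern = re.compile(r'(.+)(\[.+\])')
_show_tid_pattern = re.compile(r'\w+(\(.+\))')

name = 'conv2d[64]'      # hypothetical event name with a bracketed suffix
tid = 'thread(140224)'   # hypothetical thread id with a parenthesized suffix
print(name.replace(_show_name_pattern.match(name).group(2), ''))  # conv2d
print(tid.replace(_show_tid_pattern.match(tid).group(1), ''))     # thread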
class MemNode:
    def __init__(self):
        self.type = None
        self.timestamp_ns = 0
        self.addr = 0
        self.process_id = 0
        self.thread_id = 0
        self.increase_bytes = 0
        self.place = None
        self.current_allocated = 0
        self.current_reserved = 0
        self.peak_allocated = 0
        self.peak_reserved = 0

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.type = json_obj['cat']
        self.timestamp_ns = json_obj['ts'] * 1000
        self.addr = hex(int(
            json_obj['args']['addr'])) if 'addr' in json_obj['args'] else 0
        self.process_id = json_obj['pid']
        self.thread_id = json_obj['tid'].replace(
            _show_tid_pattern.match(json_obj['tid']).group(1), "")
        self.increase_bytes = json_obj['args'][
            'increase_bytes'] if 'increase_bytes' in json_obj['args'] else 0
        self.place = json_obj['args'][
            'place'] if 'place' in json_obj['args'] else "Place(cpu)"
        self.current_allocated = json_obj['args'][
            'current_allocated'] if 'current_allocated' in json_obj['args'] else 0
        self.current_reserved = json_obj['args'][
            'current_reserved'] if 'current_reserved' in json_obj['args'] else 0
        self.peak_allocated = json_obj['args'][
            'peak_allocated'] if 'peak_allocated' in json_obj['args'] else 0
        self.peak_reserved = json_obj['args'][
            'peak_reserved'] if 'peak_reserved' in json_obj['args'] else 0
        return self


class DeviceNode:
    def __init__(self):
        self.name = None
        self.type = None
        self.start_ns = 0
        self.end_ns = 0
        self.device_id = 0
        self.stream_id = 0
        self.context_id = 0
        self.correlation_id = 0
        self.block_x, self.block_y, self.block_z = [0, 0, 0]
        self.grid_x, self.grid_y, self.grid_z = [0, 0, 0]
        self.shared_memory = 0
        self.registers_per_thread = 0
        self.num_bytes = 0
        self.value = 0
        self.occupancy = 0
        self.blocks_per_sm = 0
        self.warps_per_sm = 0

    @classmethod
    def from_json(cls, json_obj):
        self = cls()
        self.name = json_obj['name'].replace(
            _show_name_pattern.match(json_obj['name']).group(2), "")
        self.type = json_obj['cat']
        self.start_ns = int(
            float(json_obj['args']['start_time'].split(' ')[0]) * 1000)
        self.end_ns = int(
            float(json_obj['args']['end_time'].split(' ')[0]) * 1000)
        self.device_id = json_obj['pid']
        self.stream_id = json_obj['tid']
        self.context_id = json_obj['args'][
            'context'] if 'context' in json_obj['args'] else 0
        self.correlation_id = json_obj['args']['correlation id']
        self.block_x, self.block_y, self.block_z = json_obj['args'][
            'block'] if 'block' in json_obj['args'] else [0, 0, 0]
        self.grid_x, self.grid_y, self.grid_z = json_obj['args'][
            'grid'] if 'grid' in json_obj['args'] else [0, 0, 0]
        self.shared_memory = json_obj['args'][
            'shared memory'] if 'shared memory' in json_obj['args'] else 0
        self.registers_per_thread = json_obj['args'][
            'registers per thread'] if 'registers per thread' in json_obj['args'] else 0
        self.num_bytes = json_obj['args'][
            'bytes'] if 'bytes' in json_obj['args'] else 0
        self.value = json_obj['args'][
            'value'] if 'value' in json_obj['args'] else 0
        self.occupancy = json_obj['args'][
            'theoretical achieved occupancy %'] \
            if 'theoretical achieved occupancy %' in json_obj['args'] else 0
        self.blocks_per_sm = json_obj['args'][
            "blocks per SM"] if "blocks per SM" in json_obj['args'] else 0
        self.warps_per_sm = json_obj['args'][
            "warps per SM"] if "warps per SM" in json_obj['args'] else 0
        return self


class ProfilerResult:
    def __init__(self, json_data):
        self.device_infos = None
        self.span_idx = None
        self.data = None
        self.extra_info = None
        self.schema_version = None
        self.has_hostnodes = True
        self.has_devicenodes = True
        self.has_memnodes = True
        self.parse(json_data)
        self.content = json_data
        self.start_in_timeline_ns = None

    def parse(self, json_data):
        self.schema_version = json_data['schemaVersion']
        self.span_idx = json_data['span_indx']
        self.device_infos = {
            device_info['id']: device_info
            for device_info in json_data['deviceProperties']
        }
        hostnodes = []
        runtimenodes = []
        devicenodes = []
        memnodes = []
        for event in json_data['traceEvents']:
            if not event or (event['ph'] != 'X' and event['ph'] != 'i'):
                continue
            if event['cat'] in host_node_type_map:
                if event['cat'] == 'CudaRuntime' or event['cat'] == 'MluRuntime':
                    runtimenodes.append(HostNode.from_json(event))
                else:
                    hostnodes.append(HostNode.from_json(event))
                    if hostnodes[-1].start_ns == 0:
                        self.start_in_timeline_ns = int(event['ts']) * 1000
            elif event['cat'] in device_node_type_map:
                devicenodes.append(DeviceNode.from_json(event))
            elif event['cat'] in memory_node_event_map:
                memnodes.append(MemNode.from_json(event))
        if memnodes:
            for memnode in memnodes:
                assert self.start_in_timeline_ns is not None
                memnode.timestamp_ns = memnode.timestamp_ns - self.start_in_timeline_ns
        if not hostnodes:
            self.has_hostnodes = False
        if not devicenodes:
            self.has_devicenodes = False
        if not memnodes:
            self.has_memnodes = False
        self.data = self.build_tree(hostnodes, runtimenodes, devicenodes,
                                    memnodes)
        self.extra_info = json_data['ExtraInfo']

    def build_tree(  # noqa: C901
            self, hostnodes, runtimenodes, devicenodes, memnodes):
        thread2host_event_nodes = collections.defaultdict(list)
        thread2runtime_event_nodes = collections.defaultdict(list)
        thread2mem_event_nodes = collections.defaultdict(list)
        correlation_id2runtime_event_node = {}
        thread_event_trees = {}
        thread_ids = set()
        for hostnode in hostnodes:
            thread2host_event_nodes[hostnode.thread_id].append(hostnode)
            thread_ids.add(hostnode.thread_id)
        # construct thread2runtime_event_nodes and correlation_id2runtime_event_node
        for runtimenode in runtimenodes:
            thread2runtime_event_nodes[runtimenode.thread_id].append(runtimenode)
            thread_ids.add(runtimenode.thread_id)
            correlation_id2runtime_event_node[
                runtimenode.correlation_id] = runtimenode
        # associate CudaRuntimeTraceEventNode and DeviceTraceEventNode
        # construct correlation_id2device_event_nodes
        for devicenode in devicenodes:
            if devicenode.correlation_id not in correlation_id2runtime_event_node:
                continue
            runtimenode = correlation_id2runtime_event_node[
                devicenode.correlation_id]
            runtimenode.device_node.append(devicenode)
        # construct thread2mem_event_nodes
        for memnode in memnodes:
            thread2mem_event_nodes[memnode.thread_id].append(memnode)

        # sort host event nodes and runtime event nodes according to start_ns and
        # end_ns
        # the smaller start_ns is, the further ahead position is.
        # when start_ns of two nodes are equal, the one with bigger end_ns should be
        # ahead.
        def compare_hostnode_func(hostnode1, hostnode2):
            if hostnode1.start_ns < hostnode2.start_ns:
                return -1
            if hostnode1.start_ns == hostnode2.start_ns:
                if hostnode1.end_ns > hostnode2.end_ns:
                    return -1
            return 1

        def compare_memnode_func(memnode1, memnode2):
            if memnode1.timestamp_ns <= memnode2.timestamp_ns:
                return -1
            return 1

        for threadid, hostnodes in thread2host_event_nodes.items():
            thread2host_event_nodes[threadid] = sorted(
                hostnodes, key=functools.cmp_to_key(compare_hostnode_func))
        for threadid, runtimenodes in thread2runtime_event_nodes.items():
            thread2runtime_event_nodes[threadid] = sorted(
                runtimenodes, key=functools.cmp_to_key(compare_hostnode_func))
        for threadid, memnodes in thread2mem_event_nodes.items():
            thread2mem_event_nodes[threadid] = sorted(
                memnodes, key=functools.cmp_to_key(compare_memnode_func))
        # construct trees
        for threadid in thread_ids:
            thread_event_trees[threadid] = self._build_tree_relationship(
                thread2host_event_nodes[threadid],
                thread2runtime_event_nodes[threadid],
                thread2mem_event_nodes[threadid])
        return thread_event_trees

    def _build_tree_relationship(  # noqa: C901
            self, host_event_nodes, runtime_event_nodes, mem_event_nodes):
        # root node
        root_node = HostNode()
        root_node.name, root_node.type, root_node.start_ns, root_node.end_ns = \
            "root node", "UserDefined", 0, sys.maxsize
        # push root node into node_stack
        node_stack = []
        node_stack.append(root_node)
        # handle host_event_nodes
        for host_node in host_event_nodes:
            while True:
                stack_top_node = node_stack[-1]
                if host_node.start_ns < stack_top_node.end_ns:
                    stack_top_node.children_node.append(host_node)
                    node_stack.append(host_node)
                    break
                else:
                    node_stack.pop()
                    # insert runtime node
                    # select runtime nodes which time range within stack_top_node
                    hasenter = False
                    firstposition = 0
                    lastposition = len(runtime_event_nodes)
                    for i, runtimenode in enumerate(runtime_event_nodes):
                        if runtimenode.start_ns >= stack_top_node.start_ns and \
                                runtimenode.end_ns <= stack_top_node.end_ns:
                            if not hasenter:
                                firstposition = i
                                hasenter = True
                            stack_top_node.runtime_node.append(runtimenode)
                        else:
                            # from this runtime node, not within stack_top_node, erase the
                            # nodes from runtime_event_nodes
                            if runtimenode.start_ns > stack_top_node.end_ns:
                                lastposition = i
                                break
                    if hasenter:
                        del runtime_event_nodes[firstposition:lastposition]
        # to insert left runtimenode into host_event_nodes
        while node_stack:
            stack_top_node = node_stack.pop()
            # insert runtime node
            # select runtime nodes which time range within stack_top_node
            firstposition = 0
            lastposition = len(runtime_event_nodes)
            hasenter = False
            for i, runtimenode in enumerate(runtime_event_nodes):
                if runtimenode.start_ns >= stack_top_node.start_ns and \
                        runtimenode.end_ns <= stack_top_node.end_ns:
                    if not hasenter:
                        firstposition = i
                        hasenter = True
                    stack_top_node.runtime_node.append(runtimenode)
                else:
                    # from this runtime node, not within stack_top_node, erase the
                    # nodes from runtime_event_nodes
                    if runtimenode.start_ns > stack_top_node.end_ns:
                        lastposition = i
                        break
            if hasenter:
                del runtime_event_nodes[firstposition:lastposition]

        # build relationship between host event node and mem event node
        # First, post-order traverse the tree. Then, insert the memory and op
        # supplement node into correct host nodes.
        stack = []
        flag_stack = []
        post_order_nodes = []
        stack.append(root_node)
        flag_stack.append(0)
        while stack:
            current_node = stack.pop()
            flag = flag_stack.pop()
            if flag == 0:
                stack.append(current_node)
                flag_stack.append(1)
                for child in current_node.children_node[::-1]:
                    stack.append(child)
                    flag_stack.append(0)
            else:
                post_order_nodes.append(current_node)
        for node in post_order_nodes:
            hasenter = False
            firstposition = 0
            lastposition = len(mem_event_nodes)
            for i, mem_node in enumerate(mem_event_nodes):
                if mem_node.timestamp_ns >= node.start_ns and \
                        mem_node.timestamp_ns <= node.end_ns:
                    node.mem_node.append(mem_node)
                    if not hasenter:
                        firstposition = i
                        hasenter = True
                else:
                    if mem_node.timestamp_ns > node.end_ns:
                        lastposition = i
                        break
            if hasenter:
                del mem_event_nodes[firstposition:lastposition]
        return root_node
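The node-stack loop above nests host events by interval containment: events arrive sorted by (start ascending, end descending), and each event becomes a child of the deepest interval that is still open. A toy version of that invariant, with made-up events:

# Toy events: (name, start_ns, end_ns), pre-sorted like compare_hostnode_func orders them.
events = [('step', 0, 100), ('forward', 10, 60), ('conv', 20, 30), ('backward', 60, 90)]
stack, children = [], {name: [] for name, _, _ in events}
for name, start, end in events:
    while stack and start >= stack[-1][2]:  # pop intervals that already closed
        stack.pop()
    if stack:
        children[stack[-1][0]].append(name)
    stack.append((name, start, end))
print(children)  # {'step': ['forward', 'backward'], 'forward': ['conv'], ...}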
    def get_data(self):
        return self.data

    def get_extra_info(self):
        return self.extra_info

    def get_schema_version(self):
        return self.schema_version

    def get_device_infos(self):
        return self.device_infos

    def get_span_idx(self):
        return self.span_idx

    def has_device(self):
        return self.has_devicenodes

    def has_host(self):
        return self.has_hostnodes

    def has_memory(self):
        return self.has_memnodes

    def save(self, path, format):
        pass


def load_profiler_json(file_name):
    content = json.load(open(file_name, 'r'))
    return ProfilerResult(content)
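A hypothetical usage sketch (the file path is made up; profiler logs are Chrome-trace style JSON files):

result = load_profiler_json('./profiler_log/worker.1661223000.json')  # illustrative path
trees = result.get_data()  # thread id -> root HostNode of that thread's event tree
print(result.get_schema_version(), result.has_device(), result.has_memory())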
visualdl/component/profiler/parser/kernel_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections


class TC_Allowlist(dict):
    # Refer to https://github.com/NVIDIA/PyProf/blob/fd1b2902e3306119eee40ba6b6e8b2f816920c29/pyprof/prof/tc.py#L19
    allowlist = [
        'h884', 's884', 'h1688', 's1688', 'hmma', 'i8816', '16816',
        'dgrad_1x1_stride_2x2', 'first_layer_wgrad_kernel', 'conv1x1',
        'conv2d_c1_k1', 'direct_group', 'xmma_implicit_gemm',
        'xmma_sparse_conv', 'xmma_warp_specialized_implicit_gemm', 'xmma_gemm',
        'xmma_sparse_gemm', 'c1688'
    ]

    def __init__(self):
        pass

    def __contains__(self, item):
        # If kernel name contains substring equal to any one in allowlist,
        # then it uses tensor core.
        for pattern in self.allowlist:
            if pattern in item:
                return True
        return False


_allow_list = TC_Allowlist()
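Membership is substring-based, so any kernel whose mangled name embeds an allowlisted token counts as tensor-core work. For example (kernel names are illustrative):

print('volta_h884gemm_128x128_ldg8_nn' in _allow_list)  # True: contains 'h884'
print('elementwise_add_kernel' in _allow_list)          # False: no allowlisted token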
class DeviceItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.tensorcore_used = True if name in _allow_list else False
        self.sum_blocks_per_sm = 0.0
        self.sum_occupancy = 0.0

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_item(self, node):
        self.call += 1
        self.add_gpu_time(node.end_ns - node.start_ns)
        self.sum_blocks_per_sm += node.blocks_per_sm
        self.sum_occupancy += node.occupancy


class KernelParser:
    def __init__(self):
        self.kernel_items = {}  # for kernel summary
        self.kernel_items_with_op_name_attributes = collections.defaultdict(dict)
        self.gpu_ids = set()
        self.occupancy = 0.0
        self.sm_efficiency = 0.0
        self.tensor_core_ratio = 0.0

    def parse(self, nodelists):  # noqa: C901
        total_duration = 0.0
        weighted_occupancy = 0.0
        weighted_sm_efficiency = 0.0
        for threadid, nodes in nodelists.items():
            for node in nodes:
                if node.type == 'Operator':
                    op_name = node.name
                    for children in node.children_node:
                        if children.type == 'OperatorInner':
                            for runtime_node in children.runtime_node:
                                for device_node in runtime_node.device_node:
                                    if device_node.type == 'Kernel':
                                        op_attribute_name = \
                                            self._translate_op_name_attributes_to_string(
                                                op_name, device_node)
                                        if op_attribute_name not in \
                                                self.kernel_items_with_op_name_attributes[
                                                    device_node.name]:
                                            self.kernel_items_with_op_name_attributes[
                                                device_node.name][
                                                    op_attribute_name] = DeviceItem(
                                                        device_node.name)
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name][
                                                op_attribute_name].add_item(device_node)
                    for runtime_node in node.runtime_node:
                        for device_node in runtime_node.device_node:
                            if device_node.type == 'Kernel':
                                op_attribute_name = \
                                    self._translate_op_name_attributes_to_string(
                                        op_name, device_node)
                                if op_attribute_name not in \
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name]:
                                    self.kernel_items_with_op_name_attributes[
                                        device_node.name][
                                            op_attribute_name] = DeviceItem(
                                                device_node.name)
                                self.kernel_items_with_op_name_attributes[
                                    device_node.name][
                                        op_attribute_name].add_item(device_node)
                elif node.type == 'OperatorInner':
                    continue
                else:
                    op_name = node.name
                    for runtime_node in node.runtime_node:
                        for device_node in runtime_node.device_node:
                            if device_node.type == 'Kernel':
                                op_attribute_name = \
                                    self._translate_op_name_attributes_to_string(
                                        op_name, device_node)
                                if op_attribute_name not in \
                                        self.kernel_items_with_op_name_attributes[
                                            device_node.name]:
                                    self.kernel_items_with_op_name_attributes[
                                        device_node.name][
                                            op_attribute_name] = DeviceItem(
                                                device_node.name)
                                self.kernel_items_with_op_name_attributes[
                                    device_node.name][
                                        op_attribute_name].add_item(device_node)
        for threadid, nodes in nodelists.items():
            for node in nodes:
                for runtime_node in node.runtime_node:
                    for device_node in runtime_node.device_node:
                        if device_node.type == 'Kernel':
                            name = device_node.name
                            if name not in self.kernel_items:
                                self.kernel_items[name] = DeviceItem(name)
                            self.kernel_items[name].add_item(device_node)
                            weighted_occupancy += (device_node.occupancy / 100) * (
                                device_node.end_ns - device_node.start_ns)
                            if device_node.blocks_per_sm > 1:
                                sm_efficiency = 1
                            else:
                                sm_efficiency = device_node.blocks_per_sm
                            weighted_sm_efficiency += sm_efficiency * (
                                device_node.end_ns - device_node.start_ns)
                            total_duration += (
                                device_node.end_ns - device_node.start_ns)
                            self.gpu_ids.add(device_node.device_id)
        self.occupancy = weighted_occupancy / total_duration \
            if total_duration != 0 else 0.0
        self.sm_efficiency = weighted_sm_efficiency
        # to divide ProfileStep time in ProfileData

        total_time = 0
        total_tensorcore_time = 0
        for name, node in self.kernel_items.items():
            if node.tensorcore_used:
                total_tensorcore_time += node.gpu_time
            total_time += node.gpu_time
        self.tensor_core_ratio = total_tensorcore_time / total_time \
            if total_time != 0 else 0.0

    def _translate_op_name_attributes_to_string(self, op_name, event):
        result = '{}-[{},{},{}]-[{},{},{}]-{}-{}'.format(
            op_name, event.grid_x, event.grid_y, event.grid_z, event.block_x,
            event.block_y, event.block_z, event.registers_per_thread,
            event.shared_memory)
        return result
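The attribute key groups kernels by launching op plus launch configuration. A sketch with made-up launch parameters (DeviceNode comes from event_node.py, added above in this commit):

ev = DeviceNode()
ev.grid_x, ev.grid_y, ev.grid_z = 128, 1, 1
ev.block_x, ev.block_y, ev.block_z = 256, 1, 1
ev.registers_per_thread, ev.shared_memory = 32, 4096
print(KernelParser()._translate_op_name_attributes_to_string('matmul', ev))
# matmul-[128,1,1]-[256,1,1]-32-4096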
visualdl/component/profiler/parser/memory_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .utils import traverse_tree


class MemoryItem:
    def __init__(self, event_name, place, memory_type='Allocated'):
        self.event_name = event_name
        self.place = place
        self.allocation_count = 0
        self.free_count = 0
        self.allocation_size = 0
        self.free_size = 0
        self.increase_size = 0
        self.memory_type = memory_type

    def add_memory_record(self, size, allocation_type):
        if allocation_type == 'Allocate' or allocation_type == 'ReservedAllocate':
            self.allocation_count += 1
            self.allocation_size += size
        elif allocation_type == 'Free' or allocation_type == 'ReservedFree':
            self.free_count += 1
            self.free_size -= size  # size is sign(-) when free.
        else:
            print("No corresponding type.")
        self.increase_size = self.allocation_size - self.free_size
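A small check of the accounting: free events carry a negative size, so free_size accumulates positively through the subtraction, and increase_size nets out. The event name and place below are illustrative:

item = MemoryItem('conv2d', 'Place(gpu:0)')
item.add_memory_record(1024, 'Allocate')
item.add_memory_record(-1024, 'Free')   # frees arrive with negative sizes
print(item.allocation_size, item.free_size, item.increase_size)  # 1024 1024 0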
class MemoryParser:
    def __init__(self):
        self.allocated_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        self.reserved_items = collections.defaultdict(
            dict)  # for memory summary, device type: event
        self.peak_allocation_values = collections.defaultdict(int)
        self.peak_reserved_values = collections.defaultdict(int)
        self.memory_events = collections.defaultdict(
            lambda: collections.defaultdict(list))
        self.memory_curve = collections.defaultdict(
            lambda: collections.defaultdict(list))
        self.paired_events = collections.defaultdict(list)
        self.size_ranges = {}

    def parse(self, nodetrees):  # noqa: C901
        r"""
        Analyse memory event in the nodetress.
        """
        thread2hostnodes = traverse_tree(nodetrees)
        for threadid, host_nodes in thread2hostnodes.items():
            for host_node in host_nodes[1:]:  # skip root node
                if host_node.type == 'OperatorInner':
                    continue
                if host_node.type == 'Operator':
                    for child in host_node.children_node:
                        self._analyse_node_memory(host_node.name, child)
                self._analyse_node_memory(host_node.name, host_node)
        # pair for memory events
        for device_type, memory_events in self.memory_events.items():
            max_size = 0
            for (addr, memory_type), memory_lists in memory_events.items():
                memory_lists = sorted(memory_lists, key=lambda x: x[0])
                paired_results = []
                for memory_list in memory_lists:
                    timestamp, memory_type, hostnodename, size = memory_list
                    if memory_type == 'Allocate' or memory_type == 'ReservedAllocate':
                        if size > max_size:
                            max_size = size
                        if memory_type == 'Allocate':
                            paired_results.append([
                                addr, 'Allocated', hostnodename, timestamp,
                                None, None, size
                            ])
                        else:
                            paired_results.append([
                                addr, 'ReservedAllocate', hostnodename,
                                timestamp, None, None, size
                            ])
                    elif memory_type == 'Free' or memory_type == 'ReservedFree':
                        if -size > max_size:
                            max_size = -size
                        if paired_results:
                            if paired_results[-1][-3] is None:
                                paired_results[-1][-3] = hostnodename
                                paired_results[-1][-2] = timestamp
                                self.paired_events[device_type].append(
                                    paired_results.pop())
                            else:
                                if memory_type == 'Free':
                                    paired_results.append([
                                        addr, 'Allocated', None, None,
                                        hostnodename, timestamp, -size
                                    ])
                                else:
                                    paired_results.append([
                                        addr, 'ReservedAllocate', None, None,
                                        hostnodename, timestamp, -size
                                    ])
                                self.paired_events[device_type].append(
                                    paired_results.pop())
                        else:
                            if memory_type == 'Free':
                                paired_results.append([
                                    addr, 'Allocated', None, None,
                                    hostnodename, timestamp, -size
                                ])
                            else:
                                paired_results.append([
                                    addr, 'ReservedAllocate', None, None,
                                    hostnodename, timestamp, -size
                                ])
                            self.paired_events[device_type].append(
                                paired_results.pop())
                self.paired_events[device_type].extend(paired_results)
            self.size_ranges[device_type] = (0, max_size)

    def _analyse_node_memory(self, event_name, node):
        for memnode in node.mem_node:  # self mem node
            if memnode.type == 'Allocate' or memnode.type == 'Free':
                if event_name not in self.allocated_items[memnode.place]:
                    self.allocated_items[memnode.place][event_name] = \
                        MemoryItem(event_name, memnode.place, 'Allocated')
                self.allocated_items[memnode.place][event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
                self.memory_events[memnode.place][(memnode.addr,
                                                   'Allocated')].append([
                    memnode.timestamp_ns, memnode.type, event_name,
                    memnode.increase_bytes
                ])
            elif memnode.type == 'ReservedAllocate' or memnode.type == 'ReservedFree':
                if event_name not in self.reserved_items[memnode.place]:
                    self.reserved_items[memnode.place][event_name] = \
                        MemoryItem(event_name, memnode.place, 'Reserved')
                self.reserved_items[memnode.place][event_name].add_memory_record(
                    memnode.increase_bytes, memnode.type)
                self.memory_events[memnode.place][(memnode.addr,
                                                   "Reserved")].append([
                    memnode.timestamp_ns, memnode.type, event_name,
                    memnode.increase_bytes
                ])
            self.memory_curve[memnode.place]['Allocated'].append(
                (memnode.timestamp_ns, memnode.current_allocated, event_name))
            self.memory_curve[memnode.place]['Reserved'].append(
                (memnode.timestamp_ns, memnode.current_reserved, event_name))
            self.memory_curve[memnode.place]['PeakAllocated'].append(
                (memnode.timestamp_ns, memnode.peak_allocated, event_name))
            self.memory_curve[memnode.place]['PeakReserved'].append(
                (memnode.timestamp_ns, memnode.peak_reserved, event_name))
            self.peak_allocation_values[memnode.place] = max(
                self.peak_allocation_values[memnode.place],
                memnode.peak_allocated)
            self.peak_reserved_values[memnode.place] = max(
                self.peak_reserved_values[memnode.place],
                memnode.peak_reserved)
visualdl/component/profiler/parser/operator_parser.py (new file, mode 100644)
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .kernel_parser import DeviceItem
from .utils import wrap_tree


class OperatorItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.gpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.devices = {}
        self.operator_inners = {}
        self.general_gpu_time = 0
        self.min_general_gpu_time = float('inf')
        self.max_general_gpu_time = 0

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    @property
    def avg_general_gpu_time(self):
        return self.general_gpu_time / self.call

    def add_cpu_time(self, time):
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_general_gpu_time(self, time):
        if time > self.max_general_gpu_time:
            self.max_general_gpu_time = time
        if time < self.min_general_gpu_time:
            self.min_general_gpu_time = time
        self.general_gpu_time += time

    def add_call(self):
        self.call += 1

    def add_item(self, node):
        self.add_call()
        self.add_cpu_time(node.cpu_time)
        self.add_gpu_time(node.gpu_time)
        self.add_general_gpu_time(node.general_gpu_time)
        for child in node.children_node:
            if child.type != 'Operator':
                if child.name not in self.operator_inners:
                    self.operator_inners[child.name] = OperatorItem(child.name)
                self.operator_inners[child.name].add_item(child)
        for runtimenode in node.runtime_node:
            for devicenode in runtimenode.device_node:
                name = devicenode.name
                if name not in self.devices:
                    self.devices[name] = DeviceItem(name)
                self.devices[name].add_item(devicenode)


class OperatorParser:
    r"""
    Analyse operator event in profiling data, correlate with its device event.
    """

    def __init__(self):
        self.items = {}  # for operator summary
        self.items_with_input_shape = collections.defaultdict(dict)

    def parse(self, nodetrees):
        r"""
        Analyse operator events in the node trees.
        """
        node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees)
        for threadid, host_statistic_nodes in thread2host_statistic_nodes.items():
            for host_statistic_node in host_statistic_nodes[1:]:  # skip root node
                if host_statistic_node.type == 'Operator':
                    self.add_operator_item(host_statistic_node)

    def add_operator_item(self, operator_node):
        if operator_node.name not in self.items:
            self.items[operator_node.name] = OperatorItem(operator_node.name)
        input_shape_str = self._translate_op_input_shape_to_string(
            operator_node.input_shapes)
        if input_shape_str not in self.items_with_input_shape[operator_node.name]:
            self.items_with_input_shape[operator_node.name][
                input_shape_str] = OperatorItem(operator_node.name)
        self.items[operator_node.name].add_item(operator_node)
        self.items_with_input_shape[operator_node.name][
            input_shape_str].add_item(operator_node)

    def _translate_op_input_shape_to_string(self, input_shape):
        result = ''
        for arg, shape in input_shape.items():
            result += '{}-{}\t'.format(arg, shape)
        return result
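To see what these counters do in practice, here is a minimal, standalone sketch (not part of this commit). FakeNode is a hypothetical stand-in carrying only the fields that add_item() reads; real callers pass HostStatisticNode-wrapped profiler events.

from visualdl.component.profiler.parser.operator_parser import OperatorItem


class FakeNode:
    """Hypothetical stand-in with just the attributes add_item() touches."""

    def __init__(self, cpu_time, gpu_time):
        self.cpu_time = cpu_time
        self.gpu_time = gpu_time
        self.general_gpu_time = gpu_time  # no memcpy/memset in this sketch
        self.children_node = []  # no inner operator events
        self.runtime_node = []  # no runtime/device events


item = OperatorItem('conv2d')
item.add_item(FakeNode(cpu_time=120, gpu_time=80))
item.add_item(FakeNode(cpu_time=200, gpu_time=90))
print(item.call, item.avg_cpu_time, item.min_cpu_time, item.max_gpu_time)
# prints: 2 160.0 120 90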
visualdl/component/profiler/parser/overview_parser.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import collections

from .utils import merge_ranges
from .utils import merge_self_ranges
from .utils import rebuild_node_trees
from .utils import sum_ranges
from .utils import traverse_tree

StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']
CPUType = [
    'Operator', 'CudaRuntime', 'UserDefined', 'OperatorInner',
    'Communication', 'PythonOp', 'PythonUserDefined', 'MluRuntime'
]
GPUType = ['Kernel', 'Memcpy', 'Memset']


class GeneralItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.general_gpu_time = 0
        self.min_general_gpu_time = float('inf')
        self.max_general_gpu_time = 0

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    @property
    def avg_general_gpu_time(self):
        return self.general_gpu_time / self.call

    def add_cpu_time(self, time):
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time
        self.gpu_time += time

    def add_general_gpu_time(self, time):
        if time > self.max_general_gpu_time:
            self.max_general_gpu_time = time
        if time < self.min_general_gpu_time:
            self.min_general_gpu_time = time
        self.general_gpu_time += time

    def add_call(self):
        self.call += 1

    def add_item(self, node):
        self.add_call()
        self.add_cpu_time(node.cpu_time)
        self.add_gpu_time(node.gpu_time)
        self.add_general_gpu_time(node.general_gpu_time)


class ModelPerspectiveItem:
    def __init__(self, name):
        self.name = name
        self.call = 0
        self.cpu_time = 0
        self.max_cpu_time = 0
        self.min_cpu_time = float('inf')
        self.gpu_time = 0
        self.max_gpu_time = 0
        self.min_gpu_time = float('inf')
        self.cpu_times = {}
        self.gpu_times = {}

    @property
    def avg_cpu_time(self):
        return self.cpu_time / self.call

    @property
    def avg_gpu_time(self):
        return self.gpu_time / self.call

    def add_call(self):
        self.call += 1

    def add_cpu_time(self, time):
        self.add_call()
        if time > self.max_cpu_time:
            self.max_cpu_time = time
        if time < self.min_cpu_time:
            self.min_cpu_time = time
        self.cpu_time += time

    def add_gpu_time(self, time):
        if time > self.max_gpu_time:
            self.max_gpu_time = time
        if time < self.min_gpu_time:
            self.min_gpu_time = time

    def set_gpu_time(self, time):
        '''
        Use this to set the total gpu time, in case the gpu time accumulated
        by add_gpu_time includes overlap.
        '''
        self.gpu_time = time


class OverviewParser:
    r"""
    Analyse time ranges for each TracerEventType, and summarize the time.
    """

    def __init__(self):
        # event name: GeneralItem
        self.memory_manipulation_items = {}  # for memory manipulation summary
        self.userdefined_items = {}  # for userdefined summary
        self.model_perspective_items = {}
        # phase name:
        #   device name:
        #     stage idx:
        #       thread name:
        #         event type:
        #           {"events": [], "times": []}
        self.events_per_stage = collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(
                        lambda: collections.defaultdict(
                            lambda: collections.defaultdict(list))))))
        # phase name:
        #   device name:
        #     stage idx:
        #       event type:
        #         {"calls": [], "times": [], "total_time": 0}
        self.merged_events_per_stage = collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(
                        lambda: collections.defaultdict(list)))))
        self.stage_nums = 0
        self.gpu_ulitization = 0.0
        self.has_forward = False
        self.has_device = False

    def parse(self, nodetrees):  # noqa: C901
        r"""
        Analyse node trees in the profiler result, and get time ranges for
        different tracer event types.
        """
        self._parse_events(nodetrees)
        # statistic calling times
        # merge time, get time summarization
        for stage_name, stage_data in self.events_per_stage.items():
            for device_name, steps_data in stage_data.items():
                for step_idx, thread_data in steps_data.items():
                    for thread_id, events in thread_data.items():
                        for event_type, events_data in events.items():
                            if 'calls' not in self.merged_events_per_stage[
                                    stage_name][device_name][step_idx][event_type]:
                                self.merged_events_per_stage[stage_name][
                                    device_name][step_idx][event_type]['calls'] = 0
                            if 'total_time' not in self.merged_events_per_stage[
                                    stage_name][device_name][step_idx][event_type]:
                                self.merged_events_per_stage[stage_name][
                                    device_name][step_idx][event_type]['total_time'] = 0
                            events_data['times'] = merge_self_ranges(
                                events_data['times'], is_sorted=False)
                            self.merged_events_per_stage[stage_name][device_name][
                                step_idx][event_type]['calls'] += len(events_data['events'])
                            self.merged_events_per_stage[stage_name][device_name][
                                step_idx][event_type]['times'] = \
                                merge_ranges(
                                    self.merged_events_per_stage[stage_name][
                                        device_name][step_idx][event_type]['times'],
                                    events_data['times'], is_sorted=True)
        # merge different stages into profile step
        stage_names = list(self.merged_events_per_stage.keys())
        self.merged_events_per_stage['ProfileStep']
        for stage_name in stage_names:
            stage_data = self.merged_events_per_stage[stage_name]
            for device_name, steps_data in stage_data.items():
                for step_idx, events in steps_data.items():
                    for event_type, events_data in events.items():
                        events_data['total_time'] = sum_ranges(events_data['times'])
                        if 'calls' not in self.merged_events_per_stage[
                                'ProfileStep'][device_name][step_idx][event_type]:
                            self.merged_events_per_stage['ProfileStep'][
                                device_name][step_idx][event_type]['calls'] = 0
                        if 'total_time' not in self.merged_events_per_stage[
                                'ProfileStep'][device_name][step_idx][event_type]:
                            self.merged_events_per_stage['ProfileStep'][
                                device_name][step_idx][event_type]['total_time'] = 0
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['calls'] += events_data['calls']
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['total_time'] += events_data['total_time']
                        self.merged_events_per_stage['ProfileStep'][device_name][
                            step_idx][event_type]['times'] = merge_ranges(
                                self.merged_events_per_stage['ProfileStep']
                                [device_name][step_idx][event_type]['times'],
                                events_data['times'], is_sorted=True)
        # add gpu time for model perspective summary
        for stage_name, stage_data in self.merged_events_per_stage.items():
            for device_name, steps_data in stage_data.items():
                for step_idx, events in steps_data.items():
                    if 'Kernel' in events:
                        if step_idx == 'ALL':
                            self.model_perspective_items[stage_name].set_gpu_time(
                                events['Kernel']['total_time'])
                            continue
                        self.model_perspective_items[stage_name].add_gpu_time(
                            events['Kernel']['total_time'])
                        self.model_perspective_items[stage_name].gpu_times[
                            step_idx] = events['Kernel']['total_time']
        if self.has_device:
            self.gpu_ulitization = self.merged_events_per_stage['ProfileStep'][
                'GPU']['ALL']['Kernel']['total_time'] / \
                self.model_perspective_items['ProfileStep'].cpu_time

    def _fill_stage_events(  # noqa: C901
            self, node, stage_idx, should_recursive=True):
        if node.type == 'Forward':
            stage_name = 'Forward'
            self.has_forward = True
        elif node.type == 'Backward':
            stage_name = 'Backward'
        elif node.type == 'Optimization':
            stage_name = 'Optimization'
        elif node.type == 'Dataloader':
            stage_name = 'Dataloader'
        else:
            stage_name = 'Other'
        if should_recursive:
            stack = []
            if node.type in StageType:
                for children in node.children_node:
                    stack.append(children)
            else:
                stack.append(node)
            while stack:
                current_node = stack.pop()
                for childnode in current_node.children_node:
                    stack.append(childnode)
                for runtimenode in current_node.runtime_node:
                    self.events_per_stage[stage_name]["CPU"][stage_idx][
                        runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                    self.events_per_stage[stage_name]["CPU"][stage_idx][
                        runtimenode.thread_id][runtimenode.type]['times'].append(
                            (runtimenode.start_ns, runtimenode.end_ns))
                    self.events_per_stage[stage_name]["CPU"]['ALL'][
                        runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                    self.events_per_stage[stage_name]["CPU"]['ALL'][
                        runtimenode.thread_id][runtimenode.type]['times'].append(
                            (runtimenode.start_ns, runtimenode.end_ns))
                    for devicenode in runtimenode.device_node:
                        self.has_device = True
                        self.events_per_stage[stage_name]["GPU"][stage_idx][
                            devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                        self.events_per_stage[stage_name]["GPU"][stage_idx][
                            devicenode.stream_id][devicenode.type]['times'].append(
                                (devicenode.start_ns, devicenode.end_ns))
                        self.events_per_stage[stage_name]["GPU"]['ALL'][
                            devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                        self.events_per_stage[stage_name]["GPU"]['ALL'][
                            devicenode.stream_id][devicenode.type]['times'].append(
                                (devicenode.start_ns, devicenode.end_ns))
                if current_node.type == 'Forward' or current_node.type == 'UserDefined':
                    continue
                node_type = current_node.type
                if node_type == 'PythonUserDefined':
                    node_type = 'UserDefined'
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    current_node.thread_id][node_type]['events'].append(current_node)
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    current_node.thread_id][node_type]['times'].append(
                        (current_node.start_ns, current_node.end_ns))
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    current_node.thread_id][node_type]['events'].append(current_node)
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    current_node.thread_id][node_type]['times'].append(
                        (current_node.start_ns, current_node.end_ns))
        else:
            for runtimenode in node.runtime_node:
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                self.events_per_stage[stage_name]["CPU"][stage_idx][
                    runtimenode.thread_id][runtimenode.type]['times'].append(
                        (runtimenode.start_ns, runtimenode.end_ns))
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    runtimenode.thread_id][runtimenode.type]['events'].append(runtimenode)
                self.events_per_stage[stage_name]["CPU"]['ALL'][
                    runtimenode.thread_id][runtimenode.type]['times'].append(
                        (runtimenode.start_ns, runtimenode.end_ns))
                for devicenode in runtimenode.device_node:
                    self.has_device = True
                    self.events_per_stage[stage_name]["GPU"][stage_idx][
                        devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                    self.events_per_stage[stage_name]["GPU"][stage_idx][
                        devicenode.stream_id][devicenode.type]['times'].append(
                            (devicenode.start_ns, devicenode.end_ns))
                    self.events_per_stage[stage_name]["GPU"]['ALL'][
                        devicenode.stream_id][devicenode.type]['events'].append(devicenode)
                    self.events_per_stage[stage_name]["GPU"]['ALL'][
                        devicenode.stream_id][devicenode.type]['times'].append(
                            (devicenode.start_ns, devicenode.end_ns))

    def _parse_events(self, nodetrees):
        node_wrapped_trees = rebuild_node_trees(nodetrees)
        node_wrapped_threadlist = traverse_tree(node_wrapped_trees)
        # analyse user-defined summary
        for threadid, wrapped_nodes in node_wrapped_threadlist.items():
            for wrapped_node in wrapped_nodes[1:]:  # skip root node
                if wrapped_node.type == 'PythonUserDefined':
                    self.add_userdefined_item(wrapped_node)
        # analyse all events in per stage
        thread_count = 0
        for threadid, root_wrapped_node in node_wrapped_trees.items():
            thread_count += 1
            wrapped_profiler_step_nodes = []
            for wrapped_node in root_wrapped_node.children_node:
                wrapped_profiler_step_nodes.append(wrapped_node)
            self.stage_nums = 0
            current_stage_idx = None
            for wrapped_profiler_step_node in wrapped_profiler_step_nodes:
                if wrapped_profiler_step_node.type == 'ProfileStep':
                    self.process_id = wrapped_profiler_step_node.process_id
                    stage_idx = wrapped_profiler_step_node.name.split('#')[1]
                    total_time = 0
                    accumulated_stage_time = 0
                    if thread_count == 1:
                        self.add_model_perspective_item(wrapped_profiler_step_node)
                        self.model_perspective_items['ProfileStep'].cpu_times[
                            stage_idx] = wrapped_profiler_step_node.cpu_time
                        total_time = wrapped_profiler_step_node.cpu_time
                    self.stage_nums += 1
                    for stage_wrapped_node in wrapped_profiler_step_node.children_node:
                        if thread_count == 1:
                            self.add_model_perspective_item(stage_wrapped_node)
                            if stage_wrapped_node.type in StageType:
                                self.model_perspective_items[
                                    stage_wrapped_node.type].cpu_times[
                                        stage_idx] = stage_wrapped_node.cpu_time
                        if stage_wrapped_node.type in StageType:
                            accumulated_stage_time += stage_wrapped_node.cpu_time
                        self._fill_stage_events(stage_wrapped_node, stage_idx)
                    if 'Other' not in self.model_perspective_items:
                        self.model_perspective_items['Other'] = ModelPerspectiveItem('Other')
                    if thread_count == 1:
                        self.model_perspective_items['Other'].add_cpu_time(
                            total_time - accumulated_stage_time)
                        self.model_perspective_items['Other'].cpu_times[
                            stage_idx] = total_time - accumulated_stage_time
                    self._fill_stage_events(
                        wrapped_profiler_step_node, stage_idx, should_recursive=False)
                else:
                    self._fill_stage_events(
                        wrapped_profiler_step_node, current_stage_idx)
            self._fill_stage_events(
                root_wrapped_node, current_stage_idx, should_recursive=False)

    def add_userdefined_item(self, userdefined_node):
        if userdefined_node.name not in self.userdefined_items:
            self.userdefined_items[userdefined_node.name] = GeneralItem(
                userdefined_node.name)
        self.userdefined_items[userdefined_node.name].add_item(userdefined_node)

    def add_memory_manipulation_item(self, memory_manipulation_node):
        if memory_manipulation_node.name not in self.memory_manipulation_items:
            self.memory_manipulation_items[
                memory_manipulation_node.name] = GeneralItem(
                    memory_manipulation_node.name)
        self.memory_manipulation_items[
            memory_manipulation_node.name].add_item(memory_manipulation_node)

    def add_model_perspective_item(self, model_perspective_node):
        if model_perspective_node.type == 'Forward':
            name = 'Forward'
        elif model_perspective_node.type == 'Backward':
            name = 'Backward'
        elif model_perspective_node.type == 'Optimization':
            name = 'Optimization'
        elif model_perspective_node.type == 'Dataloader':
            name = 'Dataloader'
        elif model_perspective_node.type == 'ProfileStep':
            name = 'ProfileStep'
        else:
            return
        if name not in self.model_perspective_items:
            self.model_perspective_items[name] = ModelPerspectiveItem(name)
        self.model_perspective_items[name].add_cpu_time(
            model_perspective_node.cpu_time)
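The six-level defaultdict chain built in OverviewParser.__init__ is what lets _fill_stage_events append into events_per_stage without ever initializing intermediate keys. A standalone sketch of the same pattern (the numbers are made up) may make that clearer:

import collections

events_per_stage = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(
                lambda: collections.defaultdict(
                    lambda: collections.defaultdict(list))))))

# phase -> device -> stage idx -> thread/stream -> event type -> field;
# every level springs into existence on first access.
events_per_stage['Forward']['GPU']['0'][2]['Kernel']['times'].append((100, 250))
print(events_per_stage['Forward']['GPU']['0'][2]['Kernel']['times'])
# prints: [(100, 250)]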
visualdl/component/profiler/parser/trace_parser.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
class TraceParser:
    def __init__(self):
        pass

    def parse(self, content):
        self.content = content
visualdl/component/profiler/parser/utils.py
0 → 100644
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections

StageType = ['Dataloader', 'Forward', 'Backward', 'Optimization']


def sum_ranges(ranges):
    result = 0
    for time_range in ranges:
        result += (time_range[1] - time_range[0])
    return result


def merge_self_ranges(src_ranges, is_sorted=False):
    merged_ranges = []
    if len(src_ranges) > 0:
        if not is_sorted:
            src_ranges.sort(key=lambda x: x[0])
        cur_indx = 0
        merged_ranges.append((src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
        for cur_indx in range(1, len(src_ranges)):
            if src_ranges[cur_indx][1] > merged_ranges[-1][1]:
                if src_ranges[cur_indx][0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0],
                                         src_ranges[cur_indx][1])
                else:
                    merged_ranges.append(
                        (src_ranges[cur_indx][0], src_ranges[cur_indx][1]))
    return merged_ranges


def merge_ranges(range_list1, range_list2, is_sorted=False):  # noqa:C901
    merged_ranges = []
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    len1 = len(range_list1)
    len2 = len(range_list2)
    if len1 == 0 and len2 == 0:
        return merged_ranges
    elif len1 == 0:
        return range_list2
    elif len2 == 0:
        return range_list1
    else:
        indx1 = 0
        indx2 = 0
        range1 = range_list1[indx1]
        range2 = range_list2[indx2]
        if range1[0] < range2[0]:
            merged_ranges.append(range1)
            indx1 += 1
        else:
            merged_ranges.append(range2)
            indx2 += 1
        while indx1 < len1 and indx2 < len2:
            range1 = range_list1[indx1]
            range2 = range_list2[indx2]
            if range1[0] < range2[0]:
                if range1[1] > merged_ranges[-1][1]:
                    if range1[0] <= merged_ranges[-1][1]:
                        merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
                    else:
                        merged_ranges.append((range1[0], range1[1]))
                    indx1 += 1
                else:
                    indx1 += 1
            else:
                if range2[1] > merged_ranges[-1][1]:
                    if range2[0] <= merged_ranges[-1][1]:
                        merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
                    else:
                        merged_ranges.append((range2[0], range2[1]))
                    indx2 += 1
                else:
                    indx2 += 1
        while indx1 < len1:
            range1 = range_list1[indx1]
            if range1[1] > merged_ranges[-1][1]:
                if range1[0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0], range1[1])
                else:
                    merged_ranges.append((range1[0], range1[1]))
                indx1 += 1
            else:
                indx1 += 1
        while indx2 < len2:
            range2 = range_list2[indx2]
            if range2[1] > merged_ranges[-1][1]:
                if range2[0] <= merged_ranges[-1][1]:
                    merged_ranges[-1] = (merged_ranges[-1][0], range2[1])
                else:
                    merged_ranges.append((range2[0], range2[1]))
                indx2 += 1
            else:
                indx2 += 1
    return merged_ranges


def intersection_ranges(range_list1, range_list2, is_sorted=False):
    result_range = []
    if len(range_list1) == 0 or len(range_list2) == 0:
        return result_range
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    len1 = len(range_list1)
    len2 = len(range_list2)
    indx1 = 0
    indx2 = 0
    range1 = range_list1[indx1]
    range2 = range_list2[indx2]
    while indx1 < len1 and indx2 < len2:
        if range2[1] <= range1[0]:
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] <= range1[0] and range2[1] < range1[1]:
            assert (range2[1] > range1[0])
            result_range.append((range1[0], range2[1]))
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] <= range1[0]:
            assert (range2[1] >= range1[1])
            result_range.append(range1)
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        elif range2[1] < range1[1]:
            assert (range2[0] > range1[0])
            result_range.append(range2)
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 == len2:
                break
            range2 = range_list2[indx2]
        elif range2[0] < range1[1]:
            assert (range2[1] >= range1[1])
            result_range.append((range2[0], range1[1]))
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        else:
            assert (range2[0] >= range1[1])
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
    return result_range


def subtract_ranges(range_list1, range_list2, is_sorted=False):
    result_range = []
    if not is_sorted:
        range_list1 = merge_self_ranges(range_list1)
        range_list2 = merge_self_ranges(range_list2)
    if len(range_list1) == 0:
        return result_range
    if len(range_list2) == 0:
        return range_list1
    len1 = len(range_list1)
    len2 = len(range_list2)
    indx1 = 0
    indx2 = 0
    range1 = range_list1[indx1]
    range2 = range_list2[indx2]
    while indx1 < len(range_list1):
        if indx2 == len(range_list2):
            result_range.append(range1)
            indx1 += 1
            if indx1 == len1:
                break
            range1 = range_list1[indx1]
        elif range2[1] <= range1[0]:
            indx2 += 1
            if indx2 != len2:
                range2 = range_list2[indx2]
        elif range2[0] <= range1[0] and range2[1] < range1[1]:
            range1 = (range2[1], range1[1])
            indx2 += 1
            if indx2 != len2:
                range2 = range_list2[indx2]
        elif range2[0] <= range1[0]:
            assert (range2[1] >= range1[1])
            range2 = (range1[1], range2[1])
            indx1 += 1
            if indx1 != len1:
                range1 = range_list1[indx1]
        elif range2[0] < range1[1]:
            assert (range2[0] > range1[0])
            result_range.append((range1[0], range2[0]))
            range1 = (range2[0], range1[1])
        else:
            assert (range2[0] >= range1[1])
            result_range.append(range1)
            indx1 += 1
            if indx1 != len1:
                range1 = range_list1[indx1]
    return result_range


class HostStatisticNode:
    r'''
    Wrap original node for calculating statistic metrics.
    '''

    def __init__(self, hostnode):
        self.hostnode = hostnode
        self.children_node = []
        self.runtime_node = []
        self.cpu_time = 0
        self.self_cpu_time = 0
        self.gpu_time = 0  # kernel time
        self.self_gpu_time = 0
        # besides kernel, include time of gpu events like memcpy and memset
        self.general_gpu_time = 0
        self.self_general_gpu_time = 0
        self.is_terminal_operator_node = True

    def cal_statistic(self):
        for child in self.children_node:
            child.cal_statistic()
            if child.is_terminal_operator_node is False:
                self.is_terminal_operator_node = False
        for rt in self.runtime_node:
            rt.cal_statistic()
        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
        self.self_cpu_time = self.cpu_time
        for child in self.children_node:
            if child.type == 'Operator':
                self.is_terminal_operator_node = False
            self.gpu_time += child.gpu_time
            self.general_gpu_time += child.general_gpu_time
            self.self_cpu_time -= (child.end_ns - child.start_ns)
        for rt in self.runtime_node:
            self.self_cpu_time -= (rt.end_ns - rt.start_ns)
            self.gpu_time += rt.gpu_time
            self.self_gpu_time += rt.gpu_time
            self.general_gpu_time += rt.general_gpu_time
            self.self_general_gpu_time += rt.general_gpu_time
        for device in self.hostnode.device_node:
            if device.type == 'Kernel':
                self.gpu_time += (device.end_ns - device.start_ns)
                self.self_gpu_time += (device.end_ns - device.start_ns)
            self.general_gpu_time += (device.end_ns - device.start_ns)
            self.self_general_gpu_time += (device.end_ns - device.start_ns)

    @property
    def end_ns(self):
        return self.hostnode.end_ns

    @property
    def start_ns(self):
        return self.hostnode.start_ns

    def __getattr__(self, name):
        return getattr(self.hostnode, name)


def traverse_tree(nodetrees):
    results = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        stack = []
        stack.append(rootnode)
        threadlist = results[thread_id]
        while stack:
            current_node = stack.pop()
            threadlist.append(current_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
    return results


def get_device_nodes(hostnode):
    '''
    Get all device nodes called in the time range of hostnode.
    '''
    stack = []
    device_nodes = []
    stack.append(hostnode)
    while stack:
        current_node = stack.pop()
        for childnode in current_node.children_node:
            stack.append(childnode)
        for runtimenode in current_node.runtime_node:
            for devicenode in runtimenode.device_node:
                device_nodes.append(devicenode)
    return device_nodes


def wrap_tree(nodetrees):
    '''
    Using HostStatisticNode to wrap original profiler result tree, and
    calculate node statistic metrics.
    '''
    node_statistic_tree = {}
    results = collections.defaultdict(list)
    newresults = collections.defaultdict(list)
    for thread_id, rootnode in nodetrees.items():
        stack = []
        stack.append(rootnode)
        root_statistic_node = HostStatisticNode(rootnode)
        newstack = []
        newstack.append(root_statistic_node)
        node_statistic_tree[thread_id] = root_statistic_node
        threadlist = results[thread_id]
        newthreadlist = newresults[thread_id]
        while stack:
            current_node = stack.pop()
            threadlist.append(current_node)
            current_statistic_node = newstack.pop()
            newthreadlist.append(current_statistic_node)
            for childnode in current_node.children_node:
                stack.append(childnode)
                child_statistic_node = HostStatisticNode(childnode)
                current_statistic_node.children_node.append(child_statistic_node)
                newstack.append(child_statistic_node)
            for runtimenode in current_node.runtime_node:
                runtime_statistic_node = HostStatisticNode(runtimenode)
                current_statistic_node.runtime_node.append(runtime_statistic_node)
    # recursive calculate node statistic values
    for thread_id, root_statistic_node in node_statistic_tree.items():
        root_statistic_node.cal_statistic()
    return node_statistic_tree, newresults


def rebuild_node_trees(nodetrees):  # noqa:C901
    template_root = None
    # First, we find the tree which includes Forward event.
    for threadid, root in nodetrees.items():
        has_find_template_root = False
        template_root = HostStatisticNode(root)
        for children in root.children_node:
            if children.type == 'ProfileStep':
                profiler_step_node = HostStatisticNode(children)
                template_root.children_node.append(profiler_step_node)
                has_find_template_root = True
                for stage_node in children.children_node:
                    if stage_node.type in StageType:
                        profiler_step_node.children_node.append(
                            HostStatisticNode(stage_node))
            else:
                break
        if has_find_template_root is True:
            break
    if template_root is None:
        print('No profiler steps found, overview page will have no data.')
    wrapped_tree = {}
    for thread_id, rootnode in nodetrees.items():
        has_find_template_root = False
        for children in rootnode.children_node:
            if children.type == 'ProfileStep':
                has_find_template_root = True
                break
        unwrapped_stack = []
        warpped_stack = []
        root_statistic_node = HostStatisticNode(rootnode)
        wrapped_tree[thread_id] = root_statistic_node
        if has_find_template_root is False:
            for profiler_step_node in template_root.children_node:
                profiler_step_wrap_node = HostStatisticNode(
                    profiler_step_node.hostnode)
                root_statistic_node.children_node.append(profiler_step_wrap_node)
                for stage_node in profiler_step_node.children_node:
                    stage_wrap_node = HostStatisticNode(stage_node.hostnode)
                    profiler_step_wrap_node.children_node.append(stage_wrap_node)
            # insert nodes in original root into new stage nodes
            # algorithm: post order traversal the tree
            stack = []
            flag_stack = []
            post_order_nodes = []
            stack.append(root_statistic_node)
            flag_stack.append(0)
            while stack:
                current_node = stack.pop()
                flag = flag_stack.pop()
                if flag == 0:
                    stack.append(current_node)
                    flag_stack.append(1)
                    for children_node in reversed(current_node.children_node):
                        stack.append(children_node)
                        flag_stack.append(0)
                else:
                    post_order_nodes.append(current_node)
            # traverse post_order_nodes and insert right position
            for runtimenode in rootnode.runtime_node:
                runtime_wrapped_node = HostStatisticNode(runtimenode)
                root_statistic_node.runtime_node.append(runtime_wrapped_node)
            for node in rootnode.children_node:
                unwrapped_stack.append(node)
                for wrapped_node in post_order_nodes:
                    if node.start_ns >= wrapped_node.start_ns and \
                            node.end_ns <= wrapped_node.end_ns:
                        child_wrapped_node = HostStatisticNode(node)
                        warpped_stack.append(child_wrapped_node)
                        wrapped_node.children_node.append(child_wrapped_node)
                        break
        else:
            unwrapped_stack.append(rootnode)
            warpped_stack.append(root_statistic_node)
        while unwrapped_stack:
            current_node = unwrapped_stack.pop()
            current_wrapped_node = warpped_stack.pop()
            for childnode in current_node.children_node:
                unwrapped_stack.append(childnode)
                child_wrapped_node = HostStatisticNode(childnode)
                current_wrapped_node.children_node.append(child_wrapped_node)
                warpped_stack.append(child_wrapped_node)
            for runtimenode in current_node.runtime_node:
                runtime_wrapped_node = HostStatisticNode(runtimenode)
                current_wrapped_node.runtime_node.append(runtime_wrapped_node)
    # recursive calculate node statistic values
    for thread_id, root_wrapped_node in wrapped_tree.items():
        root_wrapped_node.cal_statistic()
    return wrapped_tree


def format_time(time, unit='ms', inf_subs='-'):
    r"""
    Transform time in ns to time in unit.
    """
    if time == float('inf'):
        return inf_subs
    else:
        result = float(time)
        if unit == 's':
            result /= 1e9
        elif unit == 'ms':
            result /= 1e6
        elif unit == 'us':
            result /= 1e3
        # return '{:.2f}'.format(result)
        return round(result, 2)


def format_ratio(ratio):
    r"""
    Transform ratio within [0, 1] to percentage presentation.
    """
    # return '{:.2f}'.format(ratio * 100)
    return round(ratio * 100, 2)


def format_float(float_data):
    return round(float_data, 2)


def format_memory(memory, memory_unit='KB'):
    result = float(memory)
    if memory_unit == 'GB':
        result /= (1024 * 1024 * 1024)
    elif memory_unit == 'MB':
        result /= (1024 * 1024)
    elif memory_unit == 'KB':
        result /= 1024
    # return '{:.2f}'.format(result)
    return round(result, 2)
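Since this interval arithmetic is behind every "total_time" figure in the overview page, a short standalone sketch (not part of this commit, with made-up nanosecond ranges) shows how the helpers compose; expected outputs are in the comments.

from visualdl.component.profiler.parser.utils import (
    intersection_ranges, merge_ranges, merge_self_ranges, subtract_ranges,
    sum_ranges)

print(merge_self_ranges([(5, 8), (1, 3), (2, 6)]))  # [(1, 8)]
merged = merge_ranges([(1, 3), (6, 9)], [(2, 4)])
print(merged)                                       # [(1, 4), (6, 9)]
print(sum_ranges(merged))                           # 6
print(intersection_ranges([(1, 5)], [(3, 8)]))      # [(3, 5)]
print(subtract_ranges([(1, 10)], [(3, 5)]))         # [(1, 3), (5, 10)]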
visualdl/component/profiler/profiler_data.py
0 → 100644
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from
collections
import
defaultdict
from
collections
import
OrderedDict
from
.parser.distributed_parser
import
DistributedParser
from
.parser.kernel_parser
import
KernelParser
from
.parser.memory_parser
import
MemoryParser
from
.parser.operator_parser
import
OperatorParser
from
.parser.overview_parser
import
CPUType
from
.parser.overview_parser
import
GPUType
from
.parser.overview_parser
import
OverviewParser
from
.parser.trace_parser
import
TraceParser
from
.parser.utils
import
format_float
from
.parser.utils
import
format_memory
from
.parser.utils
import
format_ratio
from
.parser.utils
import
format_time
from
.parser.utils
import
traverse_tree
def
filter_type
(
node_trees
):
nodelists
=
traverse_tree
(
node_trees
)
for
thread_id
,
nodelist
in
nodelists
.
items
():
for
node
in
nodelist
:
if
not
isinstance
(
node
.
type
,
str
):
node
.
type
=
str
(
node
.
type
).
split
(
'.'
)[
1
]
class
ProfilerData
:
'''
Hold all parsed data to serve for user requests.
'''
def
__init__
(
self
,
run
,
worker_name
,
span_indx
,
profiler_result
):
self
.
run
=
run
self
.
worker_name
=
worker_name
self
.
span_indx
=
span_indx
self
.
node_trees
=
profiler_result
.
get_data
()
filter_type
(
self
.
node_trees
)
self
.
extra_infos
=
profiler_result
.
get_extra_info
()
self
.
span_idx
=
profiler_result
.
get_span_idx
()
self
.
device_infos
=
profiler_result
.
get_device_infos
()
self
.
overview_parser
=
None
self
.
operator_parser
=
None
self
.
distributed_parser
=
None
self
.
memory_parser
=
None
self
.
kernel_parser
=
None
self
.
trace_parser
=
None
self
.
has_gpu
=
profiler_result
.
has_device
()
if
profiler_result
.
has_host
():
# overview parser
self
.
overview_parser
=
OverviewParser
()
self
.
overview_parser
.
parse
(
self
.
node_trees
)
self
.
merged_events_per_stage
=
self
.
overview_parser
.
merged_events_per_stage
self
.
model_perspective_items
=
self
.
overview_parser
.
model_perspective_items
self
.
userdefined_items
=
self
.
overview_parser
.
userdefined_items
self
.
gpu_ulitization
=
self
.
overview_parser
.
gpu_ulitization
self
.
process_id
=
self
.
overview_parser
.
process_id
# operator parser
self
.
operator_parser
=
OperatorParser
()
self
.
operator_parser
.
parse
(
self
.
node_trees
)
self
.
operator_items
=
self
.
operator_parser
.
items
self
.
operator_items_with_input_shape
=
self
.
operator_parser
.
items_with_input_shape
# distributed parser
if
profiler_result
.
has_device
():
self
.
distributed_parser
=
DistributedParser
()
self
.
distributed_parser
.
parse
(
self
.
node_trees
)
self
.
distributed_time
=
self
.
distributed_parser
.
steps_time
if
profiler_result
.
has_memory
():
# memory parser
self
.
memory_parser
=
MemoryParser
()
self
.
memory_parser
.
parse
(
self
.
node_trees
)
self
.
memory_curve
=
self
.
memory_parser
.
memory_curve
self
.
allocated_items
=
self
.
memory_parser
.
allocated_items
self
.
reserved_items
=
self
.
memory_parser
.
reserved_items
self
.
paired_events
=
self
.
memory_parser
.
paired_events
self
.
size_ranges
=
self
.
memory_parser
.
size_ranges
self
.
peak_allocation_values
=
self
.
memory_parser
.
peak_allocation_values
if
profiler_result
.
has_device
():
# kernel parser
self
.
kernel_parser
=
KernelParser
()
self
.
kernel_parser
.
parse
(
traverse_tree
(
self
.
node_trees
))
self
.
kernel_items
=
self
.
kernel_parser
.
kernel_items
self
.
kernel_items_with_op_name_attributes
=
self
.
kernel_parser
.
kernel_items_with_op_name_attributes
self
.
occupancy
=
self
.
kernel_parser
.
occupancy
self
.
sm_efficiency
=
self
.
kernel_parser
.
sm_efficiency
self
.
tensorcore_ratio
=
self
.
kernel_parser
.
tensor_core_ratio
self
.
gpu_ids
=
self
.
kernel_parser
.
gpu_ids
# trace parser
self
.
trace_parser
=
TraceParser
()
self
.
trace_parser
.
parse
(
profiler_result
.
content
)
def
get_views
(
self
):
'''
Return available views this profile data can provide.
'''
views
=
[]
if
self
.
overview_parser
:
if
self
.
overview_parser
.
has_forward
:
views
.
append
(
'Overview'
)
if
self
.
operator_parser
:
if
self
.
operator_items
:
views
.
append
(
'Operator'
)
if
self
.
kernel_parser
:
if
self
.
kernel_items
:
views
.
append
(
'GPU Kernel'
)
if
self
.
memory_parser
:
if
self
.
memory_curve
:
views
.
append
(
'Memory'
)
if
self
.
distributed_parser
:
if
self
.
distributed_time
:
views
.
append
(
'Distributed'
)
views
.
append
(
'Trace'
)
return
views
def
get_device_infos
(
self
):
if
not
self
.
overview_parser
.
has_device
:
device_type
=
'CPU'
return
{
"device_type"
:
device_type
,
"CPU"
:
{
"process_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"Process Cpu Utilization"
])),
"system_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"System Cpu Utilization"
]))
}
}
else
:
device_type
=
'GPU'
gpu_id
=
int
(
next
(
iter
(
self
.
gpu_ids
)))
return
{
"device_type"
:
device_type
,
"CPU"
:
{
"process_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"Process Cpu Utilization"
])),
"system_utilization"
:
format_ratio
(
float
(
self
.
extra_infos
[
"System Cpu Utilization"
]))
},
"GPU"
:
{
"name"
:
self
.
device_infos
[
gpu_id
][
'name'
],
"memory"
:
"{} GB"
.
format
(
format_memory
(
self
.
device_infos
[
gpu_id
][
'totalGlobalMem'
],
'GB'
)),
"compute_capability"
:
'{}.{}'
.
format
(
self
.
device_infos
[
gpu_id
][
'computeMajor'
],
self
.
device_infos
[
gpu_id
][
'computeMinor'
]),
"utilization"
:
format_ratio
(
self
.
gpu_ulitization
),
"sm_efficiency"
:
format_ratio
(
self
.
sm_efficiency
/
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_time
),
"achieved_occupancy"
:
format_ratio
(
self
.
occupancy
),
"tensor_core_percentage"
:
format_ratio
(
self
.
tensorcore_ratio
)
}
}
def
get_model_perspective
(
self
,
time_unit
):
'''
Get total cpu and gpu statistics for model perspective of each profiler step.
'''
data
=
OrderedDict
()
data
[
'column_name'
]
=
[
"name"
,
"calls"
,
"total_time"
,
"avg_time"
,
"max_time"
,
"min_time"
,
"ratio"
]
data
[
'cpu'
]
=
[]
if
self
.
overview_parser
.
has_device
:
data
[
'gpu'
]
=
[]
total_cpu_time
=
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_time
total_gpu_time
=
self
.
model_perspective_items
[
'ProfileStep'
].
gpu_time
for
stage_name
in
[
'ProfileStep'
,
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
model_perspective_items
:
cpu_stage_data
=
OrderedDict
()
cpu_stage_data
[
'name'
]
=
stage_name
cpu_stage_data
[
'calls'
]
=
self
.
model_perspective_items
[
stage_name
].
call
cpu_stage_data
[
'total_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
cpu_time
,
time_unit
)
cpu_stage_data
[
'avg_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
avg_cpu_time
,
time_unit
)
cpu_stage_data
[
'max_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
max_cpu_time
,
time_unit
)
cpu_stage_data
[
'min_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
min_cpu_time
,
time_unit
,
inf_subs
=
0
)
cpu_stage_data
[
'ratio'
]
=
format_ratio
(
self
.
model_perspective_items
[
stage_name
].
cpu_time
/
total_cpu_time
)
if
self
.
overview_parser
.
has_device
:
gpu_stage_data
=
OrderedDict
()
gpu_stage_data
[
'name'
]
=
stage_name
gpu_stage_data
[
'calls'
]
=
self
.
model_perspective_items
[
stage_name
].
call
gpu_stage_data
[
'total_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
gpu_time
,
time_unit
)
gpu_stage_data
[
'avg_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
avg_gpu_time
,
time_unit
)
gpu_stage_data
[
'max_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
max_gpu_time
,
time_unit
)
gpu_stage_data
[
'min_time'
]
=
format_time
(
self
.
model_perspective_items
[
stage_name
].
min_gpu_time
,
time_unit
,
inf_subs
=
0
)
gpu_stage_data
[
'ratio'
]
=
format_ratio
(
self
.
model_perspective_items
[
stage_name
].
gpu_time
/
total_gpu_time
)
data
[
'cpu'
].
append
(
cpu_stage_data
)
if
self
.
overview_parser
.
has_device
:
data
[
'gpu'
].
append
(
gpu_stage_data
)
return
data
def
get_model_perspective_perstep
(
self
,
device_type
,
time_unit
):
try
:
data
=
OrderedDict
()
data
[
'order'
]
=
[]
steps
=
[
int
(
step_id
)
for
step_id
in
self
.
model_perspective_items
[
'ProfileStep'
].
cpu_times
.
keys
()
]
steps
=
sorted
(
steps
)
data
[
'steps'
]
=
steps
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
not
in
self
.
model_perspective_items
:
continue
data
[
'order'
].
append
(
stage_name
)
data
[
stage_name
]
=
[]
for
stage_idx
in
steps
:
stage_idx
=
str
(
stage_idx
)
if
device_type
==
'cpu'
:
if
stage_idx
in
self
.
model_perspective_items
[
stage_name
].
cpu_times
:
data
[
stage_name
].
append
(
format_time
(
self
.
model_perspective_items
[
stage_name
].
cpu_times
[
stage_idx
],
time_unit
))
else
:
data
[
stage_name
].
append
(
0
)
else
:
if
stage_idx
in
self
.
model_perspective_items
[
stage_name
].
gpu_times
:
data
[
stage_name
].
append
(
format_time
(
self
.
model_perspective_items
[
stage_name
].
gpu_times
[
stage_idx
],
time_unit
))
else
:
data
[
stage_name
].
append
(
0
)
except
Exception
as
e
:
print
(
'error in get_model_perspective_perstep'
,
e
)
new_data
=
{}
new_data
[
'order'
]
=
data
[
'order'
]
new_data
[
'steps'
]
=
data
[
'steps'
]
new_data
[
'data'
]
=
[]
for
name
in
new_data
[
'order'
]:
new_data
[
'data'
].
append
(
data
[
name
])
return
new_data
def
get_event_type_perspective
(
self
,
device_type
,
time_unit
):
data
=
OrderedDict
()
data
[
'order'
]
=
[]
if
device_type
==
'cpu'
:
for
event_type
in
CPUType
:
event_type_data
=
{}
event_type_data
[
'calling_times'
]
=
{}
event_type_data
[
'calling_times'
][
'key'
]
=
[]
event_type_data
[
'calling_times'
][
'value'
]
=
[]
event_type_data
[
'durations'
]
=
{}
event_type_data
[
'durations'
][
'key'
]
=
[]
event_type_data
[
'durations'
][
'value'
]
=
[]
event_type_data
[
'ratios'
]
=
{}
event_type_data
[
'ratios'
][
'key'
]
=
[]
event_type_data
[
'ratios'
][
'value'
]
=
[]
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]:
event_type_data
[
'calling_times'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'durations'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'ratios'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'calling_times'
][
'value'
].
append
(
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
]
[
'ALL'
][
event_type
][
'calls'
])
event_type_data
[
'durations'
][
'value'
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
],
time_unit
))
event_type_data
[
'ratios'
][
'value'
].
append
(
format_ratio
(
self
.
merged_events_per_stage
[
stage_name
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
]
/
self
.
merged_events_per_stage
[
'ProfileStep'
]
[
'CPU'
][
'ALL'
][
event_type
][
'total_time'
]))
if
event_type_data
[
'calling_times'
][
'key'
]:
data
[
event_type
]
=
event_type_data
data
[
'order'
].
append
(
event_type
)
else
:
for
event_type
in
GPUType
:
event_type_data
=
{}
event_type_data
[
'calling_times'
]
=
{}
event_type_data
[
'calling_times'
][
'key'
]
=
[]
event_type_data
[
'calling_times'
][
'value'
]
=
[]
event_type_data
[
'durations'
]
=
{}
event_type_data
[
'durations'
][
'key'
]
=
[]
event_type_data
[
'durations'
][
'value'
]
=
[]
event_type_data
[
'ratios'
]
=
{}
event_type_data
[
'ratios'
][
'key'
]
=
[]
event_type_data
[
'ratios'
][
'value'
]
=
[]
for
stage_name
in
[
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]:
event_type_data
[
'calling_times'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'durations'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'ratios'
][
'key'
].
append
(
stage_name
)
event_type_data
[
'calling_times'
][
'value'
].
append
(
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
]
[
'ALL'
][
event_type
][
'calls'
])
event_type_data
[
'durations'
][
'value'
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
],
time_unit
))
event_type_data
[
'ratios'
][
'value'
].
append
(
format_ratio
(
self
.
merged_events_per_stage
[
stage_name
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
]
/
self
.
merged_events_per_stage
[
'ProfileStep'
]
[
'GPU'
][
'ALL'
][
event_type
][
'total_time'
]))
if
event_type_data
[
'calling_times'
][
'key'
]:
data
[
event_type
]
=
event_type_data
data
[
'order'
].
append
(
event_type
)
return
data
def
get_event_type_model_perspective
(
self
,
time_unit
):
# noqa: C901
data
=
OrderedDict
()
data
[
'order'
]
=
[]
data
[
'phase_type'
]
=
[]
try
:
for
event_type
in
CPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
'ProfileStep'
][
'CPU'
][
'ALL'
]:
data
[
'order'
].
append
(
event_type
)
data
[
event_type
]
=
[]
if
self
.
overview_parser
.
has_device
:
for
event_type
in
GPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
'ProfileStep'
][
'GPU'
][
'ALL'
]:
data
[
'order'
].
append
(
event_type
)
data
[
event_type
]
=
[]
for
stage_name
in
[
'ProfileStep'
,
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
,
'Other'
]:
if
stage_name
in
self
.
merged_events_per_stage
:
data
[
'phase_type'
].
append
(
stage_name
)
for
event_type
in
data
[
'order'
]:
if
event_type
in
CPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]:
data
[
event_type
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
][
'CPU'
][
'ALL'
]
[
event_type
][
'total_time'
],
time_unit
))
else
:
data
[
event_type
].
append
(
0
)
elif
event_type
in
GPUType
:
if
event_type
in
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]:
data
[
event_type
].
append
(
format_time
(
self
.
merged_events_per_stage
[
stage_name
][
'GPU'
][
'ALL'
]
[
event_type
][
'total_time'
],
time_unit
))
else
:
data
[
event_type
].
append
(
0
)
newdata
=
OrderedDict
()
newdata
[
'order'
]
=
data
[
'order'
]
newdata
[
'phase_type'
]
=
data
[
'phase_type'
]
newdata
[
'data'
]
=
[]
for
key
in
newdata
[
'order'
]:
newdata
[
'data'
].
append
(
data
[
key
])
except
Exception
as
e
:
print
(
'error in get_event_type_model_perspective'
,
e
)
return
newdata
def
get_userdefined_perspective
(
self
,
time_unit
):
data
=
OrderedDict
()
if
self
.
overview_parser
.
has_device
:
data
[
'column_name'
]
=
[
'name'
,
'calls'
,
'cpu_total_time'
,
'cpu_avg_time'
,
'cpu_max_time'
,
'cpu_min_time'
,
'cpu_ratio'
,
'gpu_total_time'
,
'gpu_avg_time'
,
'gpu_max_time'
,
'gpu_min_time'
,
'gpu_ratio'
]
data
[
'has_gpu'
]
=
True
else
:
data
[
'column_name'
]
=
[
'name'
,
'calls'
,
'cpu_total_time'
,
'cpu_avg_time'
,
'cpu_max_time'
,
'cpu_min_time'
,
'cpu_ratio'
]
data
[
'has_gpu'
]
=
False
data
[
'events'
]
=
[]
total_cpu_time
=
0
total_gpu_time
=
0
for
name
,
event
in
self
.
userdefined_items
.
items
():
total_cpu_time
+=
event
.
cpu_time
total_gpu_time
+=
event
.
general_gpu_time
for
name
,
event
in
self
.
userdefined_items
.
items
():
if
self
.
overview_parser
.
has_device
:
data
[
'events'
].
append
({
"name"
:
name
,
"calls"
:
event
.
call
,
"cpu_total_time"
:
format_time
(
event
.
cpu_time
,
time_unit
),
"cpu_avg_time"
:
format_time
(
event
.
avg_cpu_time
,
time_unit
),
"cpu_max_time"
:
format_time
(
event
.
max_cpu_time
,
time_unit
),
"cpu_min_time"
:
format_time
(
event
.
min_cpu_time
,
time_unit
),
"cpu_ratio"
:
format_ratio
(
event
.
cpu_time
/
total_cpu_time
if
total_cpu_time
!=
0
else
0.0
),
"gpu_total_time"
:
format_time
(
event
.
general_gpu_time
,
time_unit
),
"gpu_avg_time"
:
format_time
(
event
.
avg_general_gpu_time
,
time_unit
),
"gpu_max_time"
:
format_time
(
event
.
max_general_gpu_time
,
time_unit
),
"gpu_min_time"
:
format_time
(
event
.
min_general_gpu_time
,
time_unit
),
"gpu_ratio"
:
format_ratio
(
event
.
general_gpu_time
/
total_gpu_time
if
total_gpu_time
!=
0
else
0.0
)
})
else
:
data
[
'events'
].
append
({
"name"
:
name
,
"calls"
:
event
.
call
,
"cpu_total_time"
:
format_time
(
event
.
cpu_time
,
time_unit
),
"cpu_avg_time"
:
format_time
(
event
.
avg_cpu_time
,
time_unit
),
"cpu_max_time"
:
format_time
(
event
.
max_cpu_time
,
time_unit
),
"cpu_min_time"
:
format_time
(
event
.
min_cpu_time
,
time_unit
),
"cpu_ratio"
:
format_ratio
(
event
.
cpu_time
/
total_cpu_time
if
total_cpu_time
!=
0
else
0.0
),
})
return
data
def
get_operator_pie
(
self
,
topk
,
time_unit
=
'ms'
):
data
=
OrderedDict
()
data
[
'column_name'
]
=
[
"name"
,
"calls"
,
"total_time"
,
"avg_time"
,
"max_time"
,
"min_time"
,
"ratio"
]
data
[
'cpu'
]
=
[]
if
self
.
has_gpu
:
data
[
'gpu'
]
=
[]
gpu_sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
general_gpu_time
,
reverse
=
True
)
cpu_sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
cpu_time
,
reverse
=
True
)
if
topk
<=
0
:
cpu_items
=
cpu_sorted_items
if
self
.
has_gpu
:
gpu_items
=
gpu_sorted_items
else
:
cpu_items
=
cpu_sorted_items
[:
topk
]
if
self
.
has_gpu
:
gpu_items
=
gpu_sorted_items
[:
topk
]
total_cpu_time
=
0.0
total_gpu_time
=
0.0
for
op_name
,
item
in
cpu_items
:
total_cpu_time
+=
item
.
cpu_time
if
self
.
has_gpu
:
for
op_name
,
item
in
gpu_items
:
total_gpu_time
+=
item
.
general_gpu_time
for
op_name
,
item
in
cpu_items
:
cpu_stage_data
=
OrderedDict
()
cpu_stage_data
[
'name'
]
=
op_name
cpu_stage_data
[
'calls'
]
=
item
.
call
cpu_stage_data
[
'total_time'
]
=
format_time
(
item
.
cpu_time
,
time_unit
)
cpu_stage_data
[
'avg_time'
]
=
format_time
(
item
.
avg_cpu_time
,
time_unit
)
cpu_stage_data
[
'max_time'
]
=
format_time
(
item
.
max_cpu_time
,
time_unit
)
cpu_stage_data
[
'min_time'
]
=
format_time
(
item
.
min_cpu_time
,
time_unit
)
cpu_stage_data
[
'ratio'
]
=
format_ratio
(
item
.
cpu_time
/
total_cpu_time
)
data
[
'cpu'
].
append
(
cpu_stage_data
)
if
self
.
has_gpu
:
for
op_name
,
item
in
gpu_items
:
gpu_stage_data
=
OrderedDict
()
gpu_stage_data
[
'name'
]
=
op_name
gpu_stage_data
[
'calls'
]
=
item
.
call
gpu_stage_data
[
'total_time'
]
=
format_time
(
item
.
general_gpu_time
,
time_unit
)
gpu_stage_data
[
'avg_time'
]
=
format_time
(
item
.
avg_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'max_time'
]
=
format_time
(
item
.
max_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'min_time'
]
=
format_time
(
item
.
min_general_gpu_time
,
time_unit
)
gpu_stage_data
[
'ratio'
]
=
format_ratio
(
item
.
general_gpu_time
/
total_gpu_time
)
data
[
'gpu'
].
append
(
gpu_stage_data
)
return
data
def
get_operator_pie_expand
(
# noqa: C901
self
,
topk
,
device_type
,
time_unit
):
data
=
OrderedDict
()
data
[
'order'
]
=
[]
data
[
'phase_type'
]
=
[]
data
[
'data'
]
=
[]
if
device_type
==
'cpu'
:
sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
cpu_time
,
reverse
=
True
)
else
:
sorted_items
=
sorted
(
self
.
operator_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
general_gpu_time
,
reverse
=
True
)
if
topk
<=
0
or
topk
>=
20
:
items
=
sorted_items
[:
20
]
other_items
=
sorted_items
[
20
:]
else
:
items
=
sorted_items
[:
topk
]
other_items
=
[]
data
[
'order'
].
extend
(
[
'infer_shape'
,
'compute'
,
'node_creation'
,
'others'
])
inner_op_data
=
defaultdict
(
list
)
for
op_name
,
event
in
items
:
data
[
'phase_type'
].
append
(
op_name
)
innerop_knownsub_times
=
0
have_innerop_name
=
set
()
for
innerop_name
,
item
in
event
.
operator_inners
.
items
():
if
'infer_shape'
in
innerop_name
or
'infer_meta'
in
innerop_name
:
innerop_name
=
'infer_shape'
elif
'compute'
in
innerop_name
:
innerop_name
=
'compute'
elif
'node_creation'
in
innerop_name
:
innerop_name
=
'node_creation'
else
:
continue
have_innerop_name
.
add
(
innerop_name
)
if
device_type
==
'cpu'
:
inner_op_data
[
innerop_name
].
append
(
format_time
(
item
.
cpu_time
,
time_unit
))
innerop_knownsub_times
+=
item
.
cpu_time
else
:
inner_op_data
[
innerop_name
].
append
(
format_time
(
item
.
general_gpu_time
,
time_unit
))
innerop_knownsub_times
+=
item
.
general_gpu_time
if
device_type
==
'cpu'
:
inner_op_data
[
'others'
].
append
(
format_time
(
event
.
cpu_time
-
innerop_knownsub_times
,
time_unit
))
else
:
inner_op_data
[
'others'
].
append
(
format_time
(
event
.
general_gpu_time
-
innerop_knownsub_times
,
time_unit
))
have_innerop_name
.
add
(
'others'
)
for
innerop_name
in
data
[
'order'
]:
if
innerop_name
in
have_innerop_name
:
continue
else
:
inner_op_data
[
innerop_name
].
append
(
0
)
if
other_items
:
innerop_knownsub_times
=
0
total_event_times
=
0
data
[
'phase_type'
].
append
(
'others'
)
others_time
=
defaultdict
(
float
)
for
op_name
,
event
in
other_items
:
for
innerop_name
,
item
in
event
.
operator_inners
.
items
():
if
'infer_shape'
in
innerop_name
:
innerop_name
=
'infer_shape'
elif
'compute'
in
innerop_name
:
innerop_name
=
'compute'
elif
'node_creation'
in
innerop_name
:
innerop_name
=
'node_creation'
else
:
continue
if
device_type
==
'cpu'
:
others_time
[
innerop_name
]
+=
item
.
cpu_time
innerop_knownsub_times
+=
item
.
cpu_time
else
:
others_time
[
innerop_name
]
+=
item
.
general_gpu_time
innerop_knownsub_times
+=
item
.
general_gpu_time
if
device_type
==
'cpu'
:
total_event_times
+=
event
.
cpu_time
else
:
total_event_times
+=
event
.
general_gpu_time
others_time
[
'others'
]
=
total_event_times
-
innerop_knownsub_times
for
innerop_name
in
data
[
'order'
]:
if
innerop_name
not
in
others_time
:
others_time
[
innerop_name
]
=
0.0
inner_op_data
[
innerop_name
].
append
(
format_time
(
others_time
[
innerop_name
],
time_unit
))
for
innerop_name
in
data
[
'order'
]:
data
[
'data'
].
append
(
inner_op_data
[
innerop_name
])
return
data
    def get_operator_table(  # noqa: C901 Todo: Optimize code
            self,
            group_by='op_name',
            search_name=None,
            time_unit='ms'):
        def get_children_data(event):
            datas = []
            for innerop_name, item in event.operator_inners.items():
                if item.cpu_time == 0:
                    cpu_ratio = 0
                else:
                    cpu_ratio = float(item.cpu_time) / event.cpu_time
                if item.general_gpu_time == 0:
                    gpu_ratio = 0
                else:
                    gpu_ratio = float(
                        item.general_gpu_time) / event.general_gpu_time
                data = {
                    "name": innerop_name,
                    "calls": item.call,
                    "cpu_total_time": format_time(item.cpu_time, time_unit),
                    "cpu_avg_time": format_time(item.avg_cpu_time, time_unit),
                    "cpu_max_time": format_time(item.max_cpu_time, time_unit),
                    "cpu_min_time": format_time(item.min_cpu_time, time_unit),
                    "cpu_ratio": format_ratio(cpu_ratio),
                    "gpu_total_time": format_time(item.general_gpu_time,
                                                  time_unit),
                    "gpu_avg_time": format_time(item.avg_general_gpu_time,
                                                time_unit),
                    "gpu_max_time": format_time(item.max_general_gpu_time,
                                                time_unit),
                    "gpu_min_time": format_time(item.min_general_gpu_time,
                                                time_unit),
                    "gpu_ratio": format_ratio(gpu_ratio)
                }
                datas.append(data)
            return datas

        data = OrderedDict()
        data['events'] = []
        total_cpu_time = 0
        total_gpu_time = 0
        for name, event in self.operator_items.items():
            total_cpu_time += event.cpu_time
            total_gpu_time += event.general_gpu_time

        # The table layout is identical across the search/grouping branches,
        # so the column list and the per-row dict are built in one place.
        column_name = ['name', 'calls']
        if group_by != 'op_name':
            column_name.append('input_shape')
        column_name += [
            'cpu_total_time', 'cpu_avg_time', 'cpu_max_time', 'cpu_min_time',
            'cpu_ratio'
        ]
        if self.has_gpu:
            column_name += [
                'gpu_total_time', 'gpu_avg_time', 'gpu_max_time',
                'gpu_min_time', 'gpu_ratio'
            ]
        data['column_name'] = column_name
        data['has_gpu'] = self.has_gpu

        def make_row(name, event, shape_string=None):
            row = OrderedDict()
            row["name"] = name
            row["calls"] = event.call
            children_events = get_children_data(event)
            if children_events:
                row["children"] = children_events
            if shape_string is not None:
                row["input_shape"] = shape_string
            row["cpu_total_time"] = format_time(event.cpu_time, time_unit)
            row["cpu_avg_time"] = format_time(event.avg_cpu_time, time_unit)
            row["cpu_max_time"] = format_time(event.max_cpu_time, time_unit)
            row["cpu_min_time"] = format_time(event.min_cpu_time, time_unit)
            row["cpu_ratio"] = format_ratio(
                event.cpu_time / total_cpu_time
                if total_cpu_time != 0 else 0.0)
            if self.has_gpu:
                row["gpu_total_time"] = format_time(event.general_gpu_time,
                                                    time_unit)
                row["gpu_avg_time"] = format_time(event.avg_general_gpu_time,
                                                  time_unit)
                row["gpu_max_time"] = format_time(event.max_general_gpu_time,
                                                  time_unit)
                row["gpu_min_time"] = format_time(event.min_general_gpu_time,
                                                  time_unit)
                row["gpu_ratio"] = format_ratio(
                    event.general_gpu_time / total_gpu_time
                    if total_gpu_time != 0 else 0.0)
            return row

        def shape_to_strings(input_shape):
            # input_shape keys are tab-separated 'name-shape' pairs with a
            # trailing tab, e.g. 'X-[4, 8]\tY-[4]\t'.
            if not input_shape:
                return []
            shapes = input_shape.split('\t')[:-1]
            return ['{}:{}'.format(*shape.split('-')) for shape in shapes]

        sort_key = (lambda x: x[1].general_gpu_time) if self.has_gpu \
            else (lambda x: x[1].cpu_time)
        if group_by == 'op_name':
            sorted_items = sorted(
                self.operator_items.items(), key=sort_key, reverse=True)
            for name, event in sorted_items:
                if search_name and search_name not in name:
                    continue
                data['events'].append(make_row(name, event))
        elif not search_name:
            # group by (op_name, input_shape), globally sorted by time
            new_arrange_data = {}
            for op_name, items_with_input_shape in \
                    self.operator_items_with_input_shape.items():
                for input_shape, item in items_with_input_shape.items():
                    new_arrange_data[(op_name, input_shape)] = item
            sorted_items = sorted(
                new_arrange_data.items(), key=sort_key, reverse=True)
            for (name, input_shape), event in sorted_items:
                data['events'].append(
                    make_row(name, event, shape_to_strings(input_shape)))
        else:
            # group by (op_name, input_shape) restricted to matching ops;
            # ops are ordered by time, shapes kept in recorded order
            sorted_ops = sorted(
                self.operator_items.items(), key=sort_key, reverse=True)
            results = [
                op_name for op_name, item in sorted_ops
                if search_name in op_name
            ]
            for op_name in results:
                shape_items = self.operator_items_with_input_shape[op_name]
                for input_shape, event in shape_items.items():
                    data['events'].append(
                        make_row(op_name, event,
                                 shape_to_strings(input_shape)))
        return data
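The `input_shape` keys consumed above are tab-separated `name-shape` records with a trailing tab. A quick sketch of the parsing, using a hypothetical key (the real keys are produced by the event parser):

input_shape = 'X-[4, 8]\tFilter-[16, 4, 3, 3]\t'
shapes = input_shape.split('\t')[:-1]  # drop the empty trailing field
shape_string = ['{}:{}'.format(*shape.split('-')) for shape in shapes]
print(shape_string)  # ['X:[4, 8]', 'Filter:[16, 4, 3, 3]']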
    def get_kernel_pie(self, topk, time_unit='ms'):
        data = OrderedDict()
        data['column_name'] = [
            "name", "calls", "total_time", "avg_time", "max_time",
            "min_time", "mean blocks per sm", "mean est achieved occupancy",
            "tensor core used", "ratio"
        ]
        data['events'] = []
        sorted_items = sorted(
            self.kernel_items.items(),
            key=lambda x: x[1].gpu_time,
            reverse=True)
        if topk <= 0:
            items = sorted_items
        else:
            items = sorted_items[:topk]
        total_gpu_time = 0.0
        for kernel_name, item in items:
            total_gpu_time += item.gpu_time
        for kernel_name, item in items:
            gpu_stage_data = OrderedDict()
            gpu_stage_data['name'] = kernel_name
            gpu_stage_data['calls'] = item.call
            gpu_stage_data['total_time'] = format_time(item.gpu_time,
                                                       time_unit)
            gpu_stage_data['avg_time'] = format_time(item.avg_gpu_time,
                                                     time_unit)
            gpu_stage_data['max_time'] = format_time(item.max_gpu_time,
                                                     time_unit)
            gpu_stage_data['min_time'] = format_time(item.min_gpu_time,
                                                     time_unit)
            gpu_stage_data['mean blocks per sm'] = format_float(
                item.sum_blocks_per_sm / item.call)
            gpu_stage_data['mean est achieved occupancy'] = format_float(
                item.sum_occupancy / item.call)
            gpu_stage_data['tensor core used'] = item.tensorcore_used
            gpu_stage_data['ratio'] = format_ratio(item.gpu_time /
                                                   total_gpu_time)
            data['events'].append(gpu_stage_data)
        return data
    def get_kernel_table(self, group_by='', search_name=None,
                         time_unit='ms'):
        data = OrderedDict()
        data['events'] = []
        total_gpu_time = 0
        for name, event in self.kernel_items.items():
            total_gpu_time += event.gpu_time

        base_columns = [
            "name", "calls", "total_time", "avg_time", "max_time",
            "min_time", "mean blocks per sm", "mean est achieved occupancy",
            "tensor core used", "ratio"
        ]
        attribute_columns = [
            "operator", "grid", "block", "register per thread",
            "shared memory"
        ]

        def make_kernel_row(name, item, attributes=None):
            row = OrderedDict()
            row['name'] = name
            row['calls'] = item.call
            if attributes is not None:
                # attributes are serialized as
                # 'operator-grid-block-registers-shared_memory'
                (operator, grid, block, register_per_thread,
                 shared_memory) = attributes.split('-')
                row['operator'] = operator
                row['grid'] = grid
                row['block'] = block
                row['register per thread'] = register_per_thread
                row['shared memory'] = shared_memory
            row['total_time'] = format_time(item.gpu_time, time_unit)
            row['avg_time'] = format_time(item.avg_gpu_time, time_unit)
            row['max_time'] = format_time(item.max_gpu_time, time_unit)
            row['min_time'] = format_time(item.min_gpu_time, time_unit)
            row['mean blocks per sm'] = format_float(
                item.sum_blocks_per_sm / item.call)
            row['mean est achieved occupancy'] = format_float(
                item.sum_occupancy / item.call)
            row['tensor core used'] = item.tensorcore_used
            row['ratio'] = format_ratio(item.gpu_time / total_gpu_time)
            return row

        if group_by == 'kernel_name':
            data['column_name'] = base_columns
            sorted_items = sorted(
                self.kernel_items.items(),
                key=lambda x: x[1].gpu_time,
                reverse=True)
            for name, item in sorted_items:
                if search_name and search_name not in name:
                    continue
                data['events'].append(make_kernel_row(name, item))
        else:
            data['column_name'] = base_columns[:2] + attribute_columns + \
                base_columns[2:]
            if not search_name:
                new_arrange_data = {}
                for name, items_with_attributes in \
                        self.kernel_items_with_op_name_attributes.items():
                    for attributes, item in items_with_attributes.items():
                        new_arrange_data[(name, attributes)] = item
                sorted_items = sorted(
                    new_arrange_data.items(),
                    key=lambda x: x[1].gpu_time,
                    reverse=True)
                for (name, attributes), item in sorted_items:
                    data['events'].append(
                        make_kernel_row(name, item, attributes))
            else:
                sorted_kernels = sorted(
                    self.kernel_items.items(),
                    key=lambda x: x[1].gpu_time,
                    reverse=True)
                results = [
                    kernel_name for kernel_name, item in sorted_kernels
                    if search_name in kernel_name
                ]
                for kernel_name in results:
                    for attributes, item in \
                            self.kernel_items_with_op_name_attributes[
                                kernel_name].items():
                        data['events'].append(
                            make_kernel_row(kernel_name, item, attributes))
        return data
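The per-kernel `attributes` key packs five launch properties into one dash-separated string; a sketch with a hypothetical key:

attributes = 'conv2d-[64, 1, 1]-[256, 1, 1]-32-0'
operator, grid, block, registers, shared_memory = attributes.split('-')
print(operator, grid, block, registers, shared_memory)
# conv2d [64, 1, 1] [256, 1, 1] 32 0

Note that the unpacking relies on none of the five fields containing a dash of its own.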
    def get_kernel_tc_pie(self, topk, time_unit='ms'):
        data = OrderedDict()
        data['column_name'] = ["name", "calls", "ratio"]
        data['events'] = []
        sorted_items = sorted(
            self.kernel_items.items(),
            key=lambda x: x[1].gpu_time,
            reverse=True)
        if topk <= 0:
            items = sorted_items
        else:
            items = sorted_items[:topk]
        total_calls = 0.0
        tensorcore_calls = 0.0
        for kernel_name, item in items:
            if item.tensorcore_used:
                tensorcore_calls += item.call
            total_calls += item.call
        data['events'].append({
            "name": "Tensor core used",
            "calls": tensorcore_calls,
            "ratio": format_ratio(tensorcore_calls / total_calls)
        })
        data['events'].append({
            "name": "Tensor core unused",
            "calls": total_calls - tensorcore_calls,
            "ratio": format_ratio(
                (total_calls - tensorcore_calls) / total_calls)
        })
        return data
    def get_trace_data(self):
        return self.trace_parser.content

    def get_memory_devices(self):
        data = []
        for device in self.memory_curve.keys():
            data.append({
                "device": device,
                "min_size": format_memory(self.size_ranges[device][0],
                                          'KB'),
                "max_size": format_memory(self.size_ranges[device][1],
                                          'KB'),
                "max_allocation_size": format_memory(
                    self.peak_allocation_values[device], 'KB'),
            })
        return data
    def get_memory_curve(self, device_type, time_unit='ms'):
        curves = self.memory_curve[device_type]
        data = {}
        data['name'] = {
            'Allocated': '已分配',
            'Reserved': '已预留',
            'PeakAllocated': '最大已分配',
            'PeakReserved': '最大已预留'
        }
        for key, events in curves.items():
            data[key] = []
            sorted_events = sorted(events, key=lambda x: x[0])
            for item in sorted_events:
                timestamp = item[0]
                size = item[1]
                event_name = item[2]
                data[key].append([
                    format_time(timestamp, time_unit),
                    format_memory(size, 'KB'), event_name
                ])
        return data
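Each raw curve point is a `(timestamp, size, event_name)` triple and the formatted payload keeps that order. A hypothetical example of one formatted series (values invented; the exact units come from format_time and format_memory, which are defined elsewhere in this module):

# hypothetical formatted payload from get_memory_curve('gpu:0')
curve = {
    'name': {'Allocated': '已分配', 'Reserved': '已预留',
             'PeakAllocated': '最大已分配', 'PeakReserved': '最大已预留'},
    # each point: [timestamp, size, name of the event that changed the size]
    'Allocated': [[12.5, 4.0, 'conv2d'], [13.1, 8.0, 'matmul']],
}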
    def get_memory_events(self,
                          device_type,
                          min_size=0,
                          max_size=float('inf'),
                          search_name=None,
                          time_unit='ms'):
        data = {}
        data['column_name'] = [
            'MemoryAddr', 'MemoryType', 'AllocatedEvent',
            'AllocatedTimestamp', 'FreeEvent', 'FreeTimestamp', 'Duration',
            'Size'
        ]
        data['data'] = []
        paired_event_list = self.paired_events[device_type]

        def filter_func(item):
            nonlocal min_size
            nonlocal max_size
            nonlocal search_name
            size = format_memory(item[-1], 'KB')
            if not search_name:
                if size >= min_size and size <= max_size:
                    return True
            else:
                if size >= min_size and size <= max_size:
                    if item[2]:
                        if search_name in item[2]:
                            return True
                    if item[4]:
                        if search_name in item[4]:
                            return True
            return False

        paired_event_list = filter(filter_func, paired_event_list)
        paired_event_list = sorted(paired_event_list, key=lambda x: x[-1])
        if not paired_event_list:
            return data
        duration = None
        for item in paired_event_list:
            if item[3] and item[5]:
                duration = item[5] - item[3]
            else:
                duration = None
            data['data'].append({
                "MemoryAddr": item[0],
                "MemoryType": item[1],
                "AllocatedEvent": item[2],
                "AllocatedTimestamp": format_time(item[3], time_unit)
                if item[3] else None,
                "FreeEvent": item[4],
                "FreeTimestamp": format_time(item[5], time_unit)
                if item[5] else None,
                "Duration": format_time(duration, time_unit)
                if duration is not None else None,
                "Size": format_memory(item[6], 'KB')
            })
        return data
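Each paired event above is a 7-tuple whose indices line up with the table columns; a hypothetical example of the layout:

paired_event = (
    '0x7f3a4c000000',  # [0] MemoryAddr
    'Allocated',       # [1] MemoryType
    'conv2d',          # [2] AllocatedEvent
    1000.0,            # [3] AllocatedTimestamp
    'conv2d_grad',     # [4] FreeEvent
    1800.0,            # [5] FreeTimestamp
    4096.0,            # [6] Size, also reached as item[-1] in filter_func
)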
    def get_op_memory_events(self, device_type, search_name=None):
        data = {}
        data['column_name'] = [
            'EventName', 'MemoryType', 'AllocationCount', 'FreeCount',
            'AllocationSize', 'FreeSize', 'IncreasedSize'
        ]
        data['data'] = []
        allocated_events = self.allocated_items[device_type]

        def filter_func(item):
            nonlocal search_name
            if not search_name:
                return True
            else:
                if search_name in item[0]:
                    return True
            return False

        reserved_events = self.reserved_items[device_type]
        all_events = [(key, item) for key, item in allocated_events.items()]
        all_events.extend(
            [(key, item) for key, item in reserved_events.items()])
        if search_name:
            all_events = filter(filter_func, all_events)
        sorted_items = sorted(
            all_events, key=lambda x: x[1].increase_size, reverse=True)
        if not sorted_items:
            return data
        for event_name, item in sorted_items:
            data['data'].append({
                'EventName': event_name,
                'MemoryType': item.memory_type,
                'AllocationCount': item.allocation_count,
                'FreeCount': item.free_count,
                'AllocationSize': format_memory(item.allocation_size, 'KB'),
                'FreeSize': format_memory(item.free_size, 'KB'),
                'IncreasedSize': format_memory(item.increase_size, 'KB')
            })
        return data
class DistributedProfilerData:
    '''
    Hold data for distributed view.
    Aggregate all data for distributed in ProfileData object.
    '''

    def __init__(self, run, span, profile_datas):
        self.run = run
        self.span = span
        self.profile_datas = profile_datas

    def get_distributed_info(self):
        data = []
        for profile_data in self.profile_datas:
            device_infos = profile_data.device_infos
            gpu_id = int(next(iter(profile_data.gpu_ids)))
            data.append({
                'worker_name': profile_data.worker_name,
                'process_id': 'pid: {}'.format(profile_data.process_id),
                'device_id': 'GPU{}'.format(gpu_id),
                'name': device_infos[gpu_id]['name'],
                'memory': "{} GB".format(
                    format_memory(device_infos[gpu_id]['totalGlobalMem'],
                                  'GB')),
                'computeCapability': '{}.{}'.format(
                    device_infos[gpu_id]['computeMajor'],
                    device_infos[gpu_id]['computeMinor']),
                'utilization': '{}%'.format(
                    format_ratio(profile_data.gpu_ulitization))
            })
        return data

    def get_distributed_histogram(self, step, time_unit='ms'):
        data = {}
        data['order'] = [
            "ProfileStep", "Communication", "Computation", "Overlap",
            "Others"
        ]
        data['worker_name'] = []
        data['data'] = []
        new_data = defaultdict(list)
        for profile_data in self.profile_datas:
            data['worker_name'].append(profile_data.worker_name)
            if step != 'All':
                new_data['ProfileStep'].append(
                    format_time(
                        profile_data.model_perspective_items['ProfileStep']
                        .cpu_times[step], time_unit))
            else:
                new_data['ProfileStep'].append(
                    format_time(
                        profile_data.model_perspective_items['ProfileStep']
                        .cpu_time, time_unit))
            new_data['Communication'].append(
                format_time(
                    profile_data.distributed_time[step]
                    ['communication_time'], time_unit))
            new_data['Computation'].append(
                format_time(
                    profile_data.distributed_time[step]['computation_time'],
                    time_unit))
            new_data['Overlap'].append(
                format_time(
                    profile_data.distributed_time[step]['overlap_time'],
                    time_unit))
            new_data['Others'].append(
                format_time(
                    profile_data.distributed_time[step]['others_time'],
                    time_unit))
        for order in data['order']:
            data['data'].append(new_data[order])
        return data

    def get_distributed_steps(self):
        for profile_data in self.profile_datas:
            steps = list(profile_data.distributed_time.keys())
        final_steps = ['All'] + sorted(
            [int(step) for step in steps if step != 'All'])
        return final_steps
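A hypothetical example of the histogram payload for a two-worker run, matching the lists built above (all numbers invented):

histogram = {
    'order': ['ProfileStep', 'Communication', 'Computation', 'Overlap',
              'Others'],
    'worker_name': ['worker0', 'worker1'],
    # data[i] holds one value per worker for the category order[i]
    'data': [[120.0, 118.5], [30.2, 28.9], [80.1, 79.4], [12.0, 11.3],
             [9.7, 9.9]],
}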
visualdl/component/profiler/profiler_reader.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import os
import re
from threading import Thread

from multiprocess import Process
from multiprocess import Queue

from .parser.const_description import *  # noqa: F403
from .parser.event_node import load_profiler_json
from .run_manager import RunManager
from visualdl.io import bfile

_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")


def is_VDLProfiler_file(path):
    """Determine whether it is a paddle profile file that can be read by vdl according to the file name.

    File name of a paddle profile file must contain `paddle_trace`.

    Args:
        path: File name to determine.
    Returns:
        True if the file is a paddle profile file, otherwise false.
    """
    if "paddle_trace" not in path:
        return False
    return True


class ProfilerReader(object):
    """Profile reader to read paddle profile files, support for frontend api in lib.py.
    """

    def __init__(self, logdir=''):
        """Instance of ProfileReader

        Args:
            logdir: The dir including paddle profile files, multiple subfolders allowed.
        """
        if isinstance(logdir, str):
            self.dir = [logdir]
        else:
            self.dir = logdir

        self.walks = {}
        self.displayname2runs = {}
        self.runs2displayname = {}
        self.run_managers = {}
        self.profile_result_queue = Queue()
        self.tempfile = None
        self.runs()
        Thread(target=self._get_data_from_queue, args=()).start()

    @property
    def logdir(self):
        return self.dir

    def get_all_walk(self):
        flush_walks = {}
        for dir in self.dir:
            for root, dirs, files in bfile.walk(dir):
                flush_walks.update({root: files})
        return flush_walks

    def get_run_manager(self, run):
        if run in self.run_managers:
            self.run_managers[run].join()
            return self.run_managers[run]
        else:
            return None

    def profile_runs(self, update=False):
        """Get profile run files.

        Every dir (a `run` in vdl) may have more than one profiler file.

        Returns:
            walks: A dict like {"exp1": ["1587375595_paddle_trace.json", "1587375685_paddle_trace.json"],
                                "exp2": ["1587375686_paddle_trace.json"]}
        """
        if not self.walks or update is True:
            flush_walks = self.get_all_walk()
            walks_temp = {}
            for run, filenames in flush_walks.items():
                tags_temp = [
                    filename for filename in filenames
                    if is_VDLProfiler_file(filename)
                ]
                if len(tags_temp) > 0:
                    walks_temp.update({run: tags_temp})
            self.walks = walks_temp
        return self.walks

    def runs(self, update=True):
        self.profile_runs(update=update)
        for run, filenames in self.walks.items():
            if run not in self.run_managers:
                self.run_managers[run] = RunManager(run)
            self.run_managers[run].set_all_filenames(filenames)
            for filename in filenames:
                if self.run_managers[run].has_handled(filename):
                    continue
                self._read_data(run, filename)
        return list(self.walks.keys())

    def get_descriptions(self, lang):
        if lang == 'zh':
            return {
                "overview_environment": TOOLTIP_DEVICE_INFO_CN,  # noqa: F405
                "overview_model_perspective":
                    TOOLTIP_MODEL_PERSPECTIVE_CN,  # noqa: F405
                "overview_model_perspective_perstep":
                    TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_CN,  # noqa: F405
                "overview_event_type_perspective":
                    TOOLTIP_EVENT_TYPE_PERSPECTIVE_CN,  # noqa: F405
                "overview_event_type_model_perspective":
                    TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_CN,  # noqa: F405
                "distributed_histogram":
                    TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_CN  # noqa: F405
            }
        else:
            return {
                "overview_environment": TOOLTIP_DEVICE_INFO_EN,  # noqa: F405
                "overview_model_perspective":
                    TOOLTIP_MODEL_PERSPECTIVE_EN,  # noqa: F405
                "overview_model_perspective_perstep":
                    TOOLTIP_MODEL_PERSPECTIVE_PERSTEP_EN,  # noqa: F405
                "overview_event_type_perspective":
                    TOOLTIP_EVENT_TYPE_PERSPECTIVE_EN,  # noqa: F405
                "overview_event_type_model_perspective":
                    TOOLTIP_EVENT_TYPE_MODEL_PERSPECTIVE_EN,  # noqa: F405
                "distributed_histogram":
                    TOOLTIP_EVENT_DISTRIBUTED_HISTOGRAM_EN  # noqa: F405
            }

    def set_displayname(self, log_reader):
        self.displayname2runs = log_reader.name2tags
        self.runs2displayname = log_reader.tags2name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def _get_data_from_queue(self):
        while True:
            try:
                run, filename, worker_name, profile_result = \
                    self.profile_result_queue.get()
                self.run_managers[run].add_profile_result(
                    filename, worker_name, profile_result)
            except Exception as e:
                print('Read profiler data error in multiprocess, error: {}'.
                      format(e))

    def _read_data(self, run, filename):
        match = _name_pattern.match(filename)
        if match:
            worker_name = match.group(1)
            if '.pb' in filename:
                try:
                    from paddle.profiler import load_profiler_result
                except Exception:
                    print('Load paddle.profiler error. '
                          'Please check paddle >= 2.3.0')
                    exit(0)
                profile_result = load_profiler_result(
                    os.path.join(run, filename))
                self.run_managers[run].add_profile_result(
                    filename, worker_name, profile_result)
            else:

                def _load_profiler_json(run, filename, worker_name):
                    profile_result = load_profiler_json(
                        os.path.join(run, filename))
                    self.profile_result_queue.put(
                        (run, filename, worker_name, profile_result))

                Process(
                    target=_load_profiler_json,
                    args=(run, filename, worker_name)).start()
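For reference, a quick check of how `_name_pattern` extracts the worker name; the file name below is a hypothetical instance of the `{worker}_time_{timestamp}.paddle_trace.json` convention:

import re

_name_pattern = re.compile(r"(.+)_time_(.+)\.paddle_trace\.((pb)|(json))")
match = _name_pattern.match('host0_pid1234_time_2022_08_23.paddle_trace.json')
if match:
    print(match.group(1))  # host0_pid1234 -> used as the worker name
    print(match.group(2))  # 2022_08_23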
visualdl/component/profiler/profiler_server.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json

from .profiler_reader import ProfilerReader
from visualdl.server.api import gen_result
from visualdl.server.api import result


class ProfilerApi(object):
    def __init__(self, logdir):
        self._reader = ProfilerReader(logdir)

    @result()
    def runs(self):
        return self._reader.runs()

    @result()
    def views(self, run):
        run_manager = self._reader.get_run_manager(run)
        if run_manager is None:
            return []
        return list(run_manager.get_views())

    @result()
    def workers(self, run, view):
        if view == 'Distributed':
            return ['All']
        run_manager = self._reader.get_run_manager(run)
        return run_manager.get_workers(view)

    @result()
    def spans(self, run, worker):
        run_manager = self._reader.get_run_manager(run)
        if worker == 'All':
            return run_manager.get_distributed_spans()
        return run_manager.get_spans(worker)

    @result()
    def timeunits(self):
        return ['ns', 'us', 'ms', 's']

    @result()
    def descriptions(self, lang):
        if lang == 'undefined' or lang is None:
            lang = 'zh'
        lang = lang.lower()
        return self._reader.get_descriptions(lang)

    @result()
    def overview_environment(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        span = str(span)
        profiler_data = run_manager.get_profiler_data(worker, span)
        result = profiler_data.get_device_infos()
        num_workers = len(run_manager.get_workers('Overview'))
        result['num_workers'] = num_workers
        return result

    @result()
    def model_perspective(self, run, worker, span, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_model_perspective(time_unit)

    @result()
    def model_perspective_perstep(self, run, worker, span, device_type,
                                  time_unit='ms'):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_model_perspective_perstep(
            device_type, time_unit)

    @result()
    def event_type_perspective(self, run, worker, span, device_type,
                               time_unit='ms'):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_event_type_perspective(
            device_type, time_unit)

    @result()
    def event_type_model_perspective(self, run, worker, span,
                                     time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_event_type_model_perspective(time_unit)

    @result()
    def userdefined_perspective(self, run, worker, span, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_userdefined_perspective(time_unit)

    @result()
    def operator_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_operator_pie(topk, time_unit)

    @result()
    def operator_pie_expand(self, run, worker, span, topk, device_type,
                            time_unit):
        device_type = device_type.lower()
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_operator_pie_expand(
            topk, device_type, time_unit)

    @result()
    def operator_table(self, run, worker, span, group_by, search_name,
                       time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_operator_table(group_by, search_name,
                                                time_unit)

    @result()
    def operator_stack_table(self, run, worker, span, op_name, group_by,
                             input_shape, time_unit='ms'):
        # placeholder, not implemented yet
        pass

    @result()
    def kernel_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_kernel_pie(topk, time_unit)

    @result()
    def kernel_table(self, run, worker, span, group_by, search_name,
                     time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_kernel_table(group_by, search_name,
                                              time_unit)

    @result()
    def kernel_tc_pie(self, run, worker, span, topk, time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        topk = int(topk)
        return profiler_data.get_kernel_tc_pie(topk, time_unit)

    @result()
    def distributed_info(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        if distributed_profiler_data is None:
            return
        return distributed_profiler_data.get_distributed_info()

    @result()
    def distributed_steps(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        return distributed_profiler_data.get_distributed_steps()

    @result()
    def distributed_histogram(self, run, worker, span, step,
                              time_unit='ms'):
        run_manager = self._reader.get_run_manager(run)
        distributed_profiler_data = \
            run_manager.get_distributed_profiler_data(span)
        return distributed_profiler_data.get_distributed_histogram(
            step, time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def trace(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_trace_data()

    @result()
    def memory_devices(self, run, worker, span):
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_devices()

    @result(headers={'content-encoding': 'gzip'})
    def memory_curve(self, run, worker, span, device_type, time_unit='ms'):
        if device_type == 'undefined':
            return
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_curve(device_type, time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def memory_events(self,
                      run,
                      worker,
                      span,
                      device_type,
                      min_size=0,
                      max_size=float('inf'),
                      search_name=None,
                      time_unit='ms'):
        if device_type == 'undefined':
            return
        try:
            min_size = float(min_size)
        except Exception:
            min_size = 0
        try:
            max_size = float(max_size)
        except Exception:
            max_size = float('inf')
        if search_name == 'undefined' or not search_name:
            search_name = None
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_memory_events(device_type, min_size,
                                               max_size, search_name,
                                               time_unit)

    @result(headers={'content-encoding': 'gzip'})
    def op_memory_events(self, run, worker, span, device_type,
                         search_name=None):
        if search_name == 'undefined' or not search_name:
            search_name = None
        if device_type == 'undefined':
            return
        run_manager = self._reader.get_run_manager(run)
        profiler_data = run_manager.get_profiler_data(worker, span)
        return profiler_data.get_op_memory_events(device_type, search_name)

    # The comparison endpoints below are placeholders, not implemented yet.
    @result()
    def comparison_phase(self, base_run, base_worker, base_span, exp_run,
                         exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_diff(self, base_run, base_worker, base_span,
                              exp_run, exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_table(self, base_run, base_worker, base_span,
                               exp_run, exp_worker, exp_span):
        pass

    @result()
    def comparison_phase_inner(self, base_run, base_worker, base_span,
                               exp_run, exp_worker, exp_span, phase_name):
        pass

    @result()
    def comparison_phase_diff_inner(self, base_run, base_worker, base_span,
                                    exp_run, exp_worker, exp_span,
                                    phase_name):
        pass

    @result()
    def comparison_phase_table_inner(self, base_run, base_worker, base_span,
                                     exp_run, exp_worker, exp_span,
                                     phase_name):
        pass


def create_profiler_api_call(logdir):
    api = ProfilerApi(logdir)
    routes = {
        'runs': (api.runs, []),
        'views': (api.views, ["run"]),
        'workers': (api.workers, ["run", "view"]),
        'spans': (api.spans, ["run", "worker"]),
        'timeunits': (api.timeunits, []),
        'descriptions': (api.descriptions, ["lang"]),
        'overview/environment': (api.overview_environment,
                                 ["run", "worker", "span"]),
        'overview/model_perspective':
            (api.model_perspective,
             ["run", "worker", "span", "time_unit"]),
        'overview/model_perspective_perstep':
            (api.model_perspective_perstep,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'overview/event_type_perspective':
            (api.event_type_perspective,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'overview/event_type_model_perspective':
            (api.event_type_model_perspective,
             ["run", "worker", "span", "time_unit"]),
        'overview/userdefined_perspective':
            (api.userdefined_perspective,
             ["run", "worker", "span", "time_unit"]),
        'operator/pie': (api.operator_pie,
                         ["run", "worker", "span", "topk", "time_unit"]),
        'operator/pie_expand':
            (api.operator_pie_expand,
             ["run", "worker", "span", "topk", "device_type", "time_unit"]),
        'operator/table':
            (api.operator_table,
             ["run", "worker", "span", "group_by", "search_name",
              "time_unit"]),
        'operator/stack_table':
            (api.operator_stack_table,
             ["run", "worker", "span", "op_name", "group_by", "input_shape",
              "time_unit"]),
        'kernel/pie': (api.kernel_pie,
                       ["run", "worker", "span", "topk", "time_unit"]),
        'kernel/tensorcore_pie':
            (api.kernel_tc_pie,
             ["run", "worker", "span", "topk", "time_unit"]),
        'kernel/table':
            (api.kernel_table,
             ["run", "worker", "span", "group_by", "search_name",
              "time_unit"]),
        'distributed/info': (api.distributed_info,
                             ["run", "worker", "span"]),
        'distributed/steps': (api.distributed_steps,
                              ["run", "worker", "span"]),
        'distributed/histogram':
            (api.distributed_histogram,
             ["run", "worker", "span", "step", "time_unit"]),
        'trace': (api.trace, ["run", "worker", "span"]),
        'memory/devices': (api.memory_devices, ["run", "worker", "span"]),
        'memory/curve':
            (api.memory_curve,
             ["run", "worker", "span", "device_type", "time_unit"]),
        'memory/memory_events':
            (api.memory_events,
             ["run", "worker", "span", "device_type", "min_size",
              "max_size", "search_name", "time_unit"]),
        'memory/op_memory_events':
            (api.op_memory_events,
             ["run", "worker", "span", "device_type", "search_name"]),
        'comparison/phase':
            (api.comparison_phase,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_diff':
            (api.comparison_phase_diff,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_table':
            (api.comparison_phase_table,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span"]),
        'comparison/phase_inner':
            (api.comparison_phase_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"]),
        'comparison/phase_diff_inner':
            (api.comparison_phase_diff_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"]),
        'comparison/phase_table_inner':
            (api.comparison_phase_table_inner,
             ["base_run", "base_worker", "base_span", "exp_run",
              "exp_worker", "exp_span", "phase_name"])
    }

    def call(path: str, args):
        route = routes.get(path)
        if not route:
            return json.dumps(
                gen_result(status=1, msg='api not found')), \
                'application/json', None
        method, call_arg_names = route
        call_args = [args.get(name) for name in call_arg_names]
        return method(*call_args)

    return call
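A minimal sketch of driving the returned closure directly; the logdir below is hypothetical, and app.py passes `request.args` in exactly this position:

call = create_profiler_api_call('./log')
data, mimetype, headers = call('runs', {})           # JSON list of runs
data, mimetype, headers = call('no/such/route', {})  # status 1, 'api not found'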
visualdl/component/profiler/run_manager.py
0 → 100644
View file @ d251028d
# Copyright (c) 2022 VisualDL Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
from collections import defaultdict
from threading import Thread

from .profiler_data import DistributedProfilerData
from .profiler_data import ProfilerData


class RunManager:
    '''
    Manage profile data for each run; each run may have multiple workers and
    spans. We should manage profile data of each (worker, span) unit.
    Besides, a special worker "all" is created to merge all profile data for
    the distributed view.
    '''

    def __init__(self, run):
        self.run = run
        # worker:
        #   span:
        #     ProfileData
        self.profiler_data = defaultdict(dict)
        self.all_filenames = set()
        self.handled_filenames = set()
        # span:
        #   DistributedProfileData
        self.distributed_data = {}
        self.threads = {}
        self.has_join = False

    def get_profiler_data(self, worker, span):
        if worker in self.profiler_data:
            if span in self.profiler_data[worker]:
                return self.profiler_data[worker][span]

    def get_distributed_profiler_data(self, span):
        if span in self.distributed_data:
            return self.distributed_data[span]

    def get_views(self):
        '''
        Return all views supported in current run data.
        '''
        all_views = set()
        for worker, span_data in self.profiler_data.items():
            for span, profiler_data in span_data.items():
                all_views.update(profiler_data.get_views())
        ordered_views = [
            'Overview', 'Operator', 'GPU Kernel', 'Distributed', 'Trace',
            'Memory'
        ]
        final_views = []
        for view in ordered_views:
            if view in all_views:
                final_views.append(view)
        return final_views

    def get_workers(self, view_name):
        '''
        Return all workers (processes) in current run data.
        '''
        workers = []
        for worker, span_data in self.profiler_data.items():
            for span, profiler_data in span_data.items():
                if view_name in profiler_data.get_views():
                    workers.append(worker)
                    break
        return workers

    def get_spans(self, worker_name):
        '''
        Return all spans in current run data.
        spans: Collecting profile data when training your model can be divided into several parts supported by \
          the paddle.profiler api; for example, you may profile steps 2-4 and 6-8. Each range is called a span here, \
          and we index each span by order.
        '''
        spans = list(self.profiler_data[worker_name].keys())
        spans = sorted([int(span) for span in spans])
        spans = [str(span) for span in spans]
        return spans

    def get_distributed_spans(self):
        spans = list(self.distributed_data.keys())
        spans = sorted([int(span) for span in spans])
        spans = [str(span) for span in spans]
        return spans

    def _parse_file(self, worker_name, result):
        span = result.get_span_idx()
        self.profiler_data[worker_name][span] = ProfilerData(
            self.run, worker_name, span, result)
        return

    def join(self):
        if self.has_join:
            return
        for thread in self.threads.values():
            thread.join()
        self.has_join = True
        distributed_profiler_data = defaultdict(list)
        for worker_name, span_data in self.profiler_data.items():
            for span_idx, profiler_data in span_data.items():
                distributed_profiler_data[span_idx].append(profiler_data)
        for span_idx, profiler_datas in distributed_profiler_data.items():
            self.distributed_data[span_idx] = DistributedProfilerData(
                self.run, span_idx, profiler_datas)

    def add_profile_result(self, filename, worker_name, profile_result):
        thread = Thread(
            target=self._parse_file, args=(worker_name, profile_result))
        thread.start()
        self.handled_filenames.add(filename)
        self.threads[filename] = thread

    def set_all_filenames(self, filenames):
        self.all_filenames.update(filenames)

    def has_handled(self, filename):
        if filename in self.handled_filenames:
            return True
        else:
            return False
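Spans are kept as strings but ordered numerically, which is why `get_spans` and `get_distributed_spans` round-trip through `int`; a small sketch of the effect:

spans = ['10', '2', '1']
spans = sorted([int(span) for span in spans])
spans = [str(span) for span in spans]
print(spans)  # ['1', '2', '10'], not the lexicographic ['1', '10', '2']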
visualdl/server/api.py
View file @ d251028d

@@ -14,8 +14,10 @@
 # limitations under the License.
 # =======================================================================
 import functools
+import gzip
 import json
 import os
+from io import BytesIO
 
 from flask import request
@@ -52,6 +54,13 @@ def result(mimetype='application/json', headers=None):
                 headers_output = headers(self)
             else:
                 headers_output = headers
+            if headers is not None:
+                if 'content-encoding' in headers:
+                    buf = BytesIO()
+                    with gzip.GzipFile(mode='wb', fileobj=buf) as fp:
+                        gzip_value = data.encode()
+                        fp.write(gzip_value)
+                    data = buf.getvalue()
             return data, mimetype, headers_output
     return wrapper
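Clients of the gzip-encoded endpoints can invert this step explicitly, although an HTTP client will normally do it transparently when the content-encoding header is set. A minimal sketch with a hypothetical payload:

import gzip
import json

compressed = gzip.compress(json.dumps({'status': 0}).encode())
payload = json.loads(gzip.decompress(compressed).decode())
print(payload)  # {'status': 0}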
visualdl/server/app.py
View file @ d251028d

@@ -32,6 +32,7 @@ from flask_babel import Babel
 
 import visualdl.server
 from visualdl import __version__
+from visualdl.component.profiler.profiler_server import create_profiler_api_call
 from visualdl.server.api import create_api_call
 from visualdl.server.args import parse_args
 from visualdl.server.args import ParseArgs
@@ -52,7 +53,7 @@ mock_data_path = os.path.join(SERVER_DIR, "./mock_data/")
 check_live_path = '/alive'
 
 
-def create_app(args):
+def create_app(args):  # noqa: C901
     # disable warning from flask
     cli = sys.modules['flask.cli']
     cli.show_server_banner = lambda *x: None
@@ -66,7 +67,7 @@ def create_app(args):
     app.config['BABEL_DEFAULT_LOCALE'] = default_language
     babel = Babel(app)
     api_call = create_api_call(args.logdir, args.model, args.cache_timeout)
+    profiler_api_call = create_profiler_api_call(args.logdir)
 
     if args.telemetry:
         update_util.PbUpdater(args.product).start()
@@ -134,6 +135,12 @@ def create_app(args):
         return make_response(
             Response(data, mimetype=mimetype, headers=headers))
 
+    @app.route(api_path + '/profiler/<path:method>', methods=["GET", "POST"])
+    def serve_profiler_api(method):
+        data, mimetype, headers = profiler_api_call(method, request.args)
+        return make_response(
+            Response(data, mimetype=mimetype, headers=headers))
+
     @app.route(check_live_path)
     def check_live():
         return '', 204
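With this route in place, every profiler endpoint hangs off `/profiler/`. A hypothetical request against a local server, assuming the default port 8040 and `/api` prefix:

import requests

resp = requests.get('http://localhost:8040/api/profiler/runs')
print(resp.json())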
visualdl/version.py
View file @ d251028d

@@ -13,4 +13,4 @@
 # limitations under the License.
 # =======================================================================
 
-vdl_version = '2.3.0'
+vdl_version = '2.4.0'