Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
9b54bf93
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9b54bf93
编写于
4月 19, 2022
作者:
C
chenjian
提交者:
GitHub
4月 19, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Rebase for profiler statistic ratio (#41939)
* fix according to suggestion * add kernel summary * improve coverage
上级
2b55290e
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
273 addition
and
45 deletion
+273
-45
python/paddle/fluid/tests/unittests/test_profiler_statistic.py
...n/paddle/fluid/tests/unittests/test_profiler_statistic.py
+92
-10
python/paddle/profiler/profiler_statistic.py
python/paddle/profiler/profiler_statistic.py
+181
-35
未找到文件。
python/paddle/fluid/tests/unittests/test_profiler_statistic.py
浏览文件 @
9b54bf93
...
...
@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase):
profilerstep_node
=
HostPythonNode
(
'ProfileStep#1'
,
profiler
.
TracerEventType
.
ProfileStep
,
0
,
400
,
1000
,
1001
)
dataloader_node
=
HostPythonNode
(
'Dataloader'
,
profiler
.
TracerEventType
.
Forward
,
5
,
15
,
1000
,
1001
)
dataloader_node
=
HostPythonNode
(
'Dataloader'
,
profiler
.
TracerEventType
.
Dataloader
,
5
,
15
,
1000
,
1001
)
mobilenet_node
=
HostPythonNode
(
'MobileNet'
,
profiler
.
TracerEventType
.
Forward
,
20
,
50
,
1000
,
1001
)
yolonet_node
=
HostPythonNode
(
...
...
@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler
.
TracerEventType
.
ProfileStep
),
400
)
self
.
assertEqual
(
time_range_summary
.
get_cpu_range_sum
(
profiler
.
TracerEventType
.
Forward
),
10
0
)
profiler
.
TracerEventType
.
Forward
),
9
0
)
self
.
assertEqual
(
time_range_summary
.
get_cpu_range_sum
(
profiler
.
TracerEventType
.
Backward
),
80
)
...
...
@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase):
profiler
.
TracerEventType
.
Communication
),
5
)
self
.
assertEqual
(
len
(
event_summary
.
items
),
2
)
self
.
assertEqual
(
len
(
event_summary
.
userdefined_items
),
1
)
self
.
assertEqual
(
len
(
event_summary
.
model_perspective_items
),
4
)
self
.
assertEqual
(
len
(
event_summary
.
model_perspective_items
),
5
)
self
.
assertEqual
(
len
(
event_summary
.
memory_manipulation_items
),
1
)
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
cpu_time
,
15
)
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
general_gpu_time
,
25
)
self
.
assertEqual
(
event_summary
.
model_perspective_items
[
'Forward'
].
cpu_time
,
10
0
)
event_summary
.
model_perspective_items
[
'Forward'
].
cpu_time
,
9
0
)
self
.
assertEqual
(
event_summary
.
model_perspective_items
[
'Forward'
].
general_gpu_time
,
135
)
...
...
@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase):
profiler
.
TracerEventType
.
ProfileStep
,
0
,
400
,
1000
,
1001
)
dataloader_node
=
HostPythonNode
(
'Dataloader'
,
profiler
.
TracerEventType
.
Forward
,
5
,
15
,
1000
,
1001
)
dataloader_node
=
HostPythonNode
(
'Dataloader'
,
profiler
.
TracerEventType
.
Dataloader
,
5
,
15
,
1000
,
1001
)
mobilenet_node
=
HostPythonNode
(
'MobileNet'
,
profiler
.
TracerEventType
.
Forward
,
20
,
50
,
1000
,
1001
)
...
...
@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler
.
TracerEventType
.
ProfileStep
),
400
)
self
.
assertEqual
(
time_range_summary
.
get_cpu_range_sum
(
profiler
.
TracerEventType
.
Forward
),
10
0
)
profiler
.
TracerEventType
.
Forward
),
9
0
)
self
.
assertEqual
(
time_range_summary
.
get_cpu_range_sum
(
profiler
.
TracerEventType
.
Backward
),
80
)
...
...
@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase):
distributed_summary
.
overlap_range
),
85
)
self
.
assertEqual
(
len
(
event_summary
.
items
),
4
)
self
.
assertEqual
(
len
(
event_summary
.
userdefined_items
),
1
)
self
.
assertEqual
(
len
(
event_summary
.
model_perspective_items
),
4
)
self
.
assertEqual
(
len
(
event_summary
.
model_perspective_items
),
5
)
self
.
assertEqual
(
len
(
event_summary
.
memory_manipulation_items
),
1
)
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
cpu_time
,
15
)
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
general_gpu_time
,
25
)
self
.
assertEqual
(
event_summary
.
model_perspective_items
[
'Forward'
].
cpu_time
,
10
0
)
event_summary
.
model_perspective_items
[
'Forward'
].
cpu_time
,
9
0
)
self
.
assertEqual
(
event_summary
.
model_perspective_items
[
'Forward'
].
general_gpu_time
,
315
)
...
...
@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase):
thread_sep
=
False
,
time_unit
=
'ms'
))
def
test_statistic_case3
(
self
):
# for coverage, test all time is 0
root_node
=
HostPythonNode
(
'Root Node'
,
profiler
.
TracerEventType
.
UserDefined
,
0
,
float
(
'inf'
),
1000
,
1001
)
profilerstep_node
=
HostPythonNode
(
'ProfileStep#1'
,
profiler
.
TracerEventType
.
ProfileStep
,
0
,
400
,
1000
,
1001
)
dataloader_node
=
HostPythonNode
(
'Dataloader'
,
profiler
.
TracerEventType
.
Dataloader
,
5
,
15
,
1000
,
1001
)
mobilenet_node
=
HostPythonNode
(
'MobileNet'
,
profiler
.
TracerEventType
.
Forward
,
20
,
50
,
1000
,
1001
)
backward_node
=
HostPythonNode
(
'Gradient Backward'
,
profiler
.
TracerEventType
.
Backward
,
120
,
200
,
1000
,
1001
)
optimization_node
=
HostPythonNode
(
'Optimization'
,
profiler
.
TracerEventType
.
Optimization
,
220
,
300
,
1000
,
1001
)
userdefined_node
=
HostPythonNode
(
'Communication Time'
,
profiler
.
TracerEventType
.
UserDefined
,
60
,
70
,
1000
,
1001
)
conv2d_node
=
HostPythonNode
(
'conv2d'
,
profiler
.
TracerEventType
.
Operator
,
25
,
25
,
1000
,
1001
)
conv2d_infer_shape
=
HostPythonNode
(
'conv2d::infer_shape'
,
profiler
.
TracerEventType
.
OperatorInner
,
25
,
25
,
1000
,
1001
)
conv2d_compute
=
HostPythonNode
(
'conv2d::compute'
,
profiler
.
TracerEventType
.
OperatorInner
,
25
,
25
,
1000
,
1001
)
conv2d_launchkernel
=
HostPythonNode
(
'cudalaunchkernel'
,
profiler
.
TracerEventType
.
CudaRuntime
,
25
,
25
,
1000
,
1001
)
conv2d_kernel
=
DevicePythonNode
(
'conv2d_kernel'
,
profiler
.
TracerEventType
.
Kernel
,
35
,
35
,
0
,
0
,
0
)
another_kernel
=
DevicePythonNode
(
'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()'
,
profiler
.
TracerEventType
.
Kernel
,
35
,
35
,
0
,
0
,
0
)
root_node
.
children_node
.
append
(
profilerstep_node
)
profilerstep_node
.
children_node
.
extend
([
dataloader_node
,
mobilenet_node
,
userdefined_node
,
backward_node
,
optimization_node
])
mobilenet_node
.
children_node
.
append
(
conv2d_node
)
conv2d_node
.
children_node
.
extend
([
conv2d_infer_shape
,
conv2d_compute
])
conv2d_compute
.
runtime_node
.
append
(
conv2d_launchkernel
)
conv2d_launchkernel
.
device_node
.
append
(
conv2d_kernel
)
conv2d_launchkernel
.
device_node
.
append
(
another_kernel
)
thread_tree
=
{
'thread1001'
:
root_node
}
extra_info
=
{
'Process Cpu Utilization'
:
'1.02'
,
'System Cpu Utilization'
:
'0.68'
}
statistic_data
=
profiler
.
profiler_statistic
.
StatisticData
(
thread_tree
,
extra_info
)
time_range_summary
=
statistic_data
.
time_range_summary
event_summary
=
statistic_data
.
event_summary
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
cpu_time
,
0
)
self
.
assertEqual
(
event_summary
.
items
[
'conv2d'
].
general_gpu_time
,
0
)
self
.
assertEqual
(
event_summary
.
userdefined_items
[
'Communication Time'
]
.
general_gpu_time
,
0
)
for
sort_key
in
[
profiler
.
SortedKeys
.
CPUTotal
,
profiler
.
SortedKeys
.
CPUMax
,
profiler
.
SortedKeys
.
CPUMin
,
profiler
.
SortedKeys
.
CPUAvg
,
profiler
.
SortedKeys
.
GPUTotal
,
profiler
.
SortedKeys
.
GPUMax
,
profiler
.
SortedKeys
.
GPUMin
,
profiler
.
SortedKeys
.
GPUAvg
]:
print
(
profiler
.
profiler_statistic
.
_build_table
(
statistic_data
,
sorted_by
=
sort_key
,
op_detail
=
True
,
thread_sep
=
False
,
time_unit
=
'ms'
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/profiler/profiler_statistic.py
浏览文件 @
9b54bf93
...
...
@@ -78,15 +78,19 @@ class HostStatisticNode:
self
.
self_gpu_time
=
0
self
.
general_gpu_time
=
0
# besides kernel, include time of gpu events like memcpy and memset
self
.
self_general_gpu_time
=
0
self
.
is_terminal_operator_node
=
True
def
cal_statistic
(
self
):
for
child
in
self
.
children_node
:
child
.
cal_statistic
()
if
child
.
is_terminal_operator_node
==
False
:
self
.
is_terminal_operator_node
=
False
for
rt
in
self
.
runtime_node
:
rt
.
cal_statistic
()
self
.
cpu_time
=
self
.
hostnode
.
end_ns
-
self
.
hostnode
.
start_ns
for
child
in
self
.
children_node
:
if
child
.
type
==
TracerEventType
.
Operator
:
self
.
is_terminal_operator_node
=
False
self
.
gpu_time
+=
child
.
gpu_time
self
.
general_gpu_time
+=
child
.
general_gpu_time
self
.
self_cpu_time
-=
(
child
.
end_ns
-
child
.
start_ns
)
...
...
@@ -419,10 +423,10 @@ class EventSummary:
for
runtimenode
in
node
.
runtime_node
:
for
devicenode
in
runtimenode
.
device_node
:
if
devicenode
.
name
not
in
self
.
devices
:
self
.
devices
[
devicenode
.
name
]
=
EventSummary
.
DeviceItem
(
devicenode
.
name
)
self
.
devices
[
devicenode
.
name
].
add_item
(
devicenode
)
name
=
devicenode
.
name
if
name
not
in
self
.
devices
:
self
.
devices
[
name
]
=
EventSummary
.
DeviceItem
(
name
)
self
.
devices
[
name
].
add_item
(
devicenode
)
class
GeneralItem
:
def
__init__
(
self
,
name
):
...
...
@@ -489,6 +493,7 @@ class EventSummary:
dict
)
# for userdefined summary
self
.
model_perspective_items
=
{}
# for model summary
self
.
memory_manipulation_items
=
{}
# for memory manipulation summary
self
.
kernel_items
=
{}
# for kernel summary
def
parse
(
self
,
nodetrees
):
r
"""
...
...
@@ -508,6 +513,7 @@ class EventSummary:
self
.
add_memory_manipulation_item
(
host_statistic_node
)
else
:
self
.
add_userdefined_item
(
host_statistic_node
)
self
.
add_kernel_item
(
host_statistic_nodes
[
0
])
for
threadid
,
root_statistic_node
in
node_statistic_trees
.
items
():
deque
=
collections
.
deque
()
...
...
@@ -525,11 +531,7 @@ class EventSummary:
deque
.
append
(
child
)
def
add_operator_item
(
self
,
operator_node
):
have_inner
=
False
for
child
in
operator_node
.
children_node
:
if
child
.
type
==
TracerEventType
.
OperatorInner
:
have_inner
=
True
if
have_inner
==
False
:
if
operator_node
.
is_terminal_operator_node
==
False
:
return
if
operator_node
.
name
not
in
self
.
items
:
self
.
items
[
operator_node
.
name
]
=
EventSummary
.
OperatorItem
(
...
...
@@ -585,6 +587,15 @@ class EventSummary:
self
.
model_perspective_items
[
name
]
=
EventSummary
.
GeneralItem
(
name
)
self
.
model_perspective_items
[
name
].
add_item
(
model_perspective_node
)
def
add_kernel_item
(
self
,
root_node
):
device_nodes
=
get_device_nodes
(
root_node
)
for
device_node
in
device_nodes
:
if
device_node
.
type
==
TracerEventType
.
Kernel
:
name
=
device_node
.
name
if
name
not
in
self
.
kernel_items
:
self
.
kernel_items
[
name
]
=
EventSummary
.
DeviceItem
(
name
)
self
.
kernel_items
[
name
].
add_item
(
device_node
)
class
StatisticData
:
r
"""
...
...
@@ -752,6 +763,9 @@ def _build_table(statistic_data,
cpu_call_times
[
event_type
]
=
statistic_data
.
event_summary
.
model_perspective_items
[
event_type_name
].
call
cpu_type_time
[
event_type
]
=
statistic_data
.
event_summary
.
model_perspective_items
[
event_type_name
].
cpu_time
gpu_time_range
=
collections
.
defaultdict
(
list
)
for
device_id
,
device_time_ranges
in
statistic_data
.
time_range_summary
.
GPUTimeRange
.
items
(
...
...
@@ -800,7 +814,6 @@ def _build_table(statistic_data,
append
(
"Note:
\n
In this table, We sum up all collected events in terms of event type.
\n
"
"The time of events collected on host are presented as CPU Time, and as GPU Time if on device.
\n
"
"The time with ratio 100% is the base time for calculating ratio.
\n
"
"Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.
\n
"
"The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.
\n
"
"Example:
\n
"
...
...
@@ -820,13 +833,18 @@ def _build_table(statistic_data,
all_row_values
=
[]
accmulation_time
=
0
gpu_accmulation_time
=
0
gpu_total_time
=
0
gpu_total_time
=
statistic_data
.
event_summary
.
model_perspective_items
[
'ProfileStep'
].
general_gpu_time
for
name
in
[
'ProfileStep'
,
'Dataloader'
,
'Forward'
,
'Backward'
,
'Optimization'
]:
if
name
in
model_perspective_items
:
item
=
model_perspective_items
[
name
]
if
gpu_total_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
item
.
general_gpu_time
)
/
gpu_total_time
name
=
'{}'
.
format
(
name
)
if
'ProfileStep'
in
name
else
' {}'
.
format
(
name
)
row_values
=
[
...
...
@@ -850,17 +868,19 @@ def _build_table(statistic_data,
item
.
max_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
item
.
gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
if
'ProfileStep'
not
in
name
:
accmulation_time
+=
item
.
cpu_time
gpu_accmulation_time
+=
item
.
gpu_time
else
:
gpu_total_time
=
item
.
gpu_time
gpu_accmulation_time
+=
item
.
general_gpu_time
other_time
=
total_time
-
accmulation_time
other_gpu_time
=
gpu_total_time
-
gpu_accmulation_time
if
gpu_total_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
other_gpu_time
)
/
gpu_total_time
row_values
=
[
' Others'
,
'-'
,
'{} / - / - / - / {}'
.
format
(
format_time
(
...
...
@@ -869,7 +889,7 @@ def _build_table(statistic_data,
'{} / - / - / - / {}'
.
format
(
format_time
(
other_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
other_gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
# Calculate the column width
...
...
@@ -913,7 +933,6 @@ def _build_table(statistic_data,
append
(
"Note:
\n
In this table, GPU time is the sum of all device(GPU) events called in the phase.
\n
"
"Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.
\n
"
"The time with ratio 100% is the base time for calculating ratio.
\n
"
)
append
(
'-'
*
line_length
)
append
(
''
)
...
...
@@ -981,7 +1000,6 @@ def _build_table(statistic_data,
"Note:
\n
Communication time: Communication Event time, Communication Op time and its kernel time on gpu.
\n
"
"Computation time: Kernel time, except kernels belong to communication(nccl kernels).
\n
"
"Overlap time: Communication time intersects with computation time.
\n
"
"The time with ratio 100% is the base time for calculating ratio.
\n
"
"Example:
\n
"
"Communication:
\n
"
" CPU: |_________________|
\n
"
...
...
@@ -1040,8 +1058,22 @@ def _build_table(statistic_data,
elif
sorted_by
==
SortedKeys
.
GPUMin
:
sorted_items
=
sorted
(
items
.
items
(),
key
=
lambda
x
:
x
[
1
].
min_general_gpu_time
)
total_op_cpu_time
=
0
total_op_gpu_time
=
0
for
name
,
item
in
sorted_items
:
total_op_cpu_time
+=
item
.
cpu_time
total_op_gpu_time
+=
item
.
general_gpu_time
for
name
,
item
in
sorted_items
:
if
total_op_cpu_time
==
0
:
cpu_ratio
=
0
else
:
cpu_ratio
=
float
(
item
.
cpu_time
)
/
total_op_cpu_time
if
total_op_gpu_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
item
.
general_gpu_time
)
/
total_op_gpu_time
row_values
=
[
name
,
item
.
call
,
'{} / {} / {} / {} / {}'
.
format
(
format_time
(
...
...
@@ -1052,7 +1084,7 @@ def _build_table(statistic_data,
item
.
max_cpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_cpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
item
.
cpu_time
)
/
total_time
)),
format_ratio
(
cpu_ratio
)),
'{} / {} / {} / {} / {}'
.
format
(
format_time
(
item
.
general_gpu_time
,
unit
=
time_unit
),
...
...
@@ -1062,13 +1094,22 @@ def _build_table(statistic_data,
item
.
max_general_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_general_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
item
.
general_gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
if
op_detail
:
for
innerop_name
,
innerop_node
in
item
.
operator_inners
.
items
(
):
if
item
.
cpu_time
==
0
:
cpu_ratio
=
0
else
:
cpu_ratio
=
float
(
innerop_node
.
cpu_time
)
/
item
.
cpu_time
if
item
.
general_gpu_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
innerop_node
.
general_gpu_time
)
/
item
.
general_gpu_time
if
len
(
innerop_name
)
+
2
>
name_column_width
:
innerop_name
=
innerop_name
[:
name_column_width
-
5
]
innerop_name
+=
"..."
...
...
@@ -1083,8 +1124,7 @@ def _build_table(statistic_data,
innerop_node
.
max_cpu_time
,
unit
=
time_unit
),
format_time
(
innerop_node
.
min_cpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
innerop_node
.
cpu_time
)
/
total_time
)),
format_ratio
(
cpu_ratio
)),
'{} / {} / {} / {} / {}'
.
format
(
format_time
(
innerop_node
.
general_gpu_time
,
...
...
@@ -1098,13 +1138,17 @@ def _build_table(statistic_data,
format_time
(
innerop_node
.
min_general_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
innerop_node
.
general_gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
for
device_node_name
,
device_node
in
innerop_node
.
devices
.
items
(
):
if
innerop_node
.
general_gpu_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
device_node
.
gpu_time
)
/
innerop_node
.
general_gpu_time
if
len
(
device_node_name
)
+
4
>
name_column_width
:
device_node_name
=
device_node_name
[:
name_column_width
...
...
@@ -1125,12 +1169,15 @@ def _build_table(statistic_data,
format_time
(
device_node
.
min_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
device_node
.
gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
for
device_node_name
,
device_node
in
item
.
devices
.
items
():
if
item
.
general_gpu_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
device_node
.
gpu_time
)
/
item
.
general_gpu_time
if
len
(
device_node_name
)
+
2
>
name_column_width
:
device_node_name
=
device_node_name
[:
name_column_width
...
...
@@ -1148,8 +1195,7 @@ def _build_table(statistic_data,
device_node
.
max_gpu_time
,
unit
=
time_unit
),
format_time
(
device_node
.
min_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
device_node
.
gpu_time
)
/
total_time
))
format_ratio
(
gpu_ratio
))
]
all_row_values
.
append
(
row_values
)
# Calculate the column width
...
...
@@ -1197,11 +1243,106 @@ def _build_table(statistic_data,
append
(
''
)
append
(
''
)
###### Print Kernel Summary Report ######
if
statistic_data
.
event_summary
.
kernel_items
:
all_row_values
=
[]
kernel_items
=
statistic_data
.
event_summary
.
kernel_items
if
sorted_by
==
SortedKeys
.
GPUAvg
:
sorted_items
=
sorted
(
kernel_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
avg_gpu_time
,
reverse
=
True
)
elif
sorted_by
==
SortedKeys
.
GPUMax
:
sorted_items
=
sorted
(
kernel_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
max_gpu_time
,
reverse
=
True
)
elif
sorted_by
==
SortedKeys
.
GPUMin
:
sorted_items
=
sorted
(
kernel_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
min_gpu_time
)
else
:
sorted_items
=
sorted
(
kernel_items
.
items
(),
key
=
lambda
x
:
x
[
1
].
gpu_time
,
reverse
=
True
)
total_kernel_gpu_time
=
0
for
name
,
item
in
sorted_items
:
total_kernel_gpu_time
+=
item
.
gpu_time
for
name
,
item
in
sorted_items
:
if
total_kernel_gpu_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
item
.
gpu_time
)
/
total_kernel_gpu_time
row_values
=
[
name
,
item
.
call
,
'{} / {} / {} / {} / {}'
.
format
(
format_time
(
item
.
gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
avg_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
max_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_gpu_time
,
unit
=
time_unit
),
format_ratio
(
gpu_ratio
)),
]
all_row_values
.
append
(
row_values
)
headers
=
[
'Name'
,
'Calls'
,
'GPU Total / Avg / Max / Min / Ratio(%)'
]
# Calculate the column width
name_column_width
=
90
calltime_width
=
6
gpu_data_description_width
=
40
for
row_values
in
all_row_values
:
if
isinstance
(
row_values
[
1
],
int
)
and
len
(
str
(
row_values
[
1
]))
>
calltime_width
:
calltime_width
=
len
(
str
(
row_values
[
1
]))
if
len
(
row_values
[
2
])
>
gpu_data_description_width
:
gpu_data_description_width
=
len
(
row_values
[
2
])
row_format_list
=
[
""
]
header_sep_list
=
[
""
]
line_length_list
=
[
-
SPACING_SIZE
]
add_column
(
name_column_width
)
add_column
(
calltime_width
)
add_column
(
gpu_data_description_width
)
row_format
=
row_format_list
[
0
]
header_sep
=
header_sep_list
[
0
]
line_length
=
line_length_list
[
0
]
# construct table string
append
(
add_title
(
line_length
,
"Kernel Summary"
))
append
(
'Time unit: {}'
.
format
(
time_unit
))
append
(
header_sep
)
append
(
row_format
.
format
(
*
headers
))
append
(
header_sep
)
for
row_values
in
all_row_values
:
indx
=
row_values
[
0
].
find
(
'('
)
if
indx
!=
-
1
:
name
=
row_values
[
0
][:
indx
]
else
:
name
=
row_values
[
0
]
if
len
(
name
)
>
name_column_width
:
row_values
[
0
]
=
name
[:
name_column_width
-
3
]
+
'...'
else
:
row_values
[
0
]
=
name
append
(
row_format
.
format
(
*
row_values
))
append
(
header_sep
)
append
(
''
)
append
(
''
)
###### Print Memory Manipulation Summary Report ######
if
statistic_data
.
event_summary
.
memory_manipulation_items
:
all_row_values
=
[]
memory_manipulation_items
=
statistic_data
.
event_summary
.
memory_manipulation_items
gpu_total_time
=
statistic_data
.
event_summary
.
model_perspective_items
[
'ProfileStep'
].
general_gpu_time
for
name
,
item
in
memory_manipulation_items
.
items
():
if
gpu_total_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
item
.
general_gpu_time
)
/
gpu_total_time
row_values
=
[
name
,
item
.
call
,
...
...
@@ -1224,7 +1365,7 @@ def _build_table(statistic_data,
item
.
max_general_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_general_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
item
.
general_gpu_time
)
/
total_time
)),
format_ratio
(
gpu_ratio
)),
]
all_row_values
.
append
(
row_values
)
...
...
@@ -1274,6 +1415,8 @@ def _build_table(statistic_data,
###### Print UserDefined Summary Report ######
if
statistic_data
.
event_summary
.
userdefined_items
:
all_row_values
=
[]
gpu_total_time
=
statistic_data
.
event_summary
.
model_perspective_items
[
'ProfileStep'
].
general_gpu_time
if
thread_sep
==
True
:
userdefined_thread_items
=
statistic_data
.
event_summary
.
userdefined_thread_items
else
:
...
...
@@ -1319,6 +1462,10 @@ def _build_table(statistic_data,
items
.
items
(),
key
=
lambda
x
:
x
[
1
].
min_general_gpu_time
)
for
name
,
item
in
sorted_items
:
if
gpu_total_time
==
0
:
gpu_ratio
=
0
else
:
gpu_ratio
=
float
(
item
.
general_gpu_time
)
/
gpu_total_time
row_values
=
[
name
,
item
.
call
,
...
...
@@ -1341,8 +1488,7 @@ def _build_table(statistic_data,
item
.
max_general_gpu_time
,
unit
=
time_unit
),
format_time
(
item
.
min_general_gpu_time
,
unit
=
time_unit
),
format_ratio
(
float
(
item
.
general_gpu_time
)
/
total_time
)),
format_ratio
(
gpu_ratio
)),
]
all_row_values
.
append
(
row_values
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录