BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle, in sync with the fork source)
Commit 9b54bf93 (unverified)
Authored by chenjian on Apr 19, 2022; committed via GitHub on Apr 19, 2022
Rebase for profiler statistic ratio (#41939)
* fix according to suggestion
* add kernel summary
* improve coverage
Parent: 2b55290e
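The recurring change in the diff below is the base used for each table's Ratio(%) column: instead of dividing every time by the overall step time (total_time), each summary table now divides by its own relevant total (for example the step's general_gpu_time, or the summed kernel time) and guards against a zero denominator. A minimal sketch of that pattern, using a hypothetical helper name that does not appear in the patch:

    def safe_ratio(part, total):
        # Mirrors the repeated `if total == 0: ratio = 0 else: ratio = float(part) / total`
        # guards introduced throughout _build_table in this commit.
        return 0.0 if total == 0 else float(part) / total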
Showing 2 changed files with 273 additions and 45 deletions (+273 -45)

python/paddle/fluid/tests/unittests/test_profiler_statistic.py   +92  -10
python/paddle/profiler/profiler_statistic.py                     +181 -35
python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -51,8 +51,9 @@ class TestProfilerStatistic(unittest.TestCase):
        profilerstep_node = HostPythonNode('ProfileStep#1', profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001)
-       dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+       dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001)
        mobilenet_node = HostPythonNode('MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
        yolonet_node = HostPythonNode(
@@ -155,7 +156,7 @@ class TestProfilerStatistic(unittest.TestCase):
                profiler.TracerEventType.ProfileStep), 400)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
-               profiler.TracerEventType.Forward), 100)
+               profiler.TracerEventType.Forward), 90)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Backward), 80)
@@ -185,12 +186,12 @@ class TestProfilerStatistic(unittest.TestCase):
                profiler.TracerEventType.Communication), 5)
        self.assertEqual(len(event_summary.items), 2)
        self.assertEqual(len(event_summary.userdefined_items), 1)
-       self.assertEqual(len(event_summary.model_perspective_items), 4)
+       self.assertEqual(len(event_summary.model_perspective_items), 5)
        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
        self.assertEqual(
-           event_summary.model_perspective_items['Forward'].cpu_time, 100)
+           event_summary.model_perspective_items['Forward'].cpu_time, 90)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].general_gpu_time, 135)
@@ -217,8 +218,9 @@ class TestProfilerStatistic(unittest.TestCase):
                                           profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001)
-       dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001)
+       dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001)
        mobilenet_node = HostPythonNode('MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
@@ -372,7 +374,7 @@ class TestProfilerStatistic(unittest.TestCase):
                profiler.TracerEventType.ProfileStep), 400)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
-               profiler.TracerEventType.Forward), 100)
+               profiler.TracerEventType.Forward), 90)
        self.assertEqual(
            time_range_summary.get_cpu_range_sum(
                profiler.TracerEventType.Backward), 80)
@@ -417,12 +419,12 @@ class TestProfilerStatistic(unittest.TestCase):
                distributed_summary.overlap_range), 85)
        self.assertEqual(len(event_summary.items), 4)
        self.assertEqual(len(event_summary.userdefined_items), 1)
-       self.assertEqual(len(event_summary.model_perspective_items), 4)
+       self.assertEqual(len(event_summary.model_perspective_items), 5)
        self.assertEqual(len(event_summary.memory_manipulation_items), 1)
        self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
        self.assertEqual(
-           event_summary.model_perspective_items['Forward'].cpu_time, 100)
+           event_summary.model_perspective_items['Forward'].cpu_time, 90)
        self.assertEqual(
            event_summary.model_perspective_items['Forward'].general_gpu_time, 315)
@@ -441,6 +443,86 @@ class TestProfilerStatistic(unittest.TestCase):
                    thread_sep=False,
                    time_unit='ms'))

+    def test_statistic_case3(self):
+        # for coverage, test all time is 0
+        root_node = HostPythonNode('Root Node', profiler.TracerEventType.UserDefined, 0, float('inf'), 1000, 1001)
+        profilerstep_node = HostPythonNode('ProfileStep#1', profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001)
+        dataloader_node = HostPythonNode('Dataloader', profiler.TracerEventType.Dataloader, 5, 15, 1000, 1001)
+        mobilenet_node = HostPythonNode('MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
+        backward_node = HostPythonNode('Gradient Backward', profiler.TracerEventType.Backward, 120, 200, 1000, 1001)
+        optimization_node = HostPythonNode('Optimization', profiler.TracerEventType.Optimization, 220, 300, 1000, 1001)
+        userdefined_node = HostPythonNode('Communication Time', profiler.TracerEventType.UserDefined, 60, 70, 1000, 1001)
+        conv2d_node = HostPythonNode('conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001)
+        conv2d_infer_shape = HostPythonNode('conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, 25, 1000, 1001)
+        conv2d_compute = HostPythonNode('conv2d::compute', profiler.TracerEventType.OperatorInner, 25, 25, 1000, 1001)
+        conv2d_launchkernel = HostPythonNode('cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25, 1000, 1001)
+        conv2d_kernel = DevicePythonNode('conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        another_kernel = DevicePythonNode(
+            'void phi::funcs::VectorizedBroadcastKernel<float, float, phi::funcs::AddFunctor<float>, phi::funcs::AddFunctor<float>>()',
+            profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0)
+        root_node.children_node.append(profilerstep_node)
+        profilerstep_node.children_node.extend([
+            dataloader_node, mobilenet_node, userdefined_node, backward_node,
+            optimization_node
+        ])
+        mobilenet_node.children_node.append(conv2d_node)
+        conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute])
+        conv2d_compute.runtime_node.append(conv2d_launchkernel)
+        conv2d_launchkernel.device_node.append(conv2d_kernel)
+        conv2d_launchkernel.device_node.append(another_kernel)
+        thread_tree = {'thread1001': root_node}
+        extra_info = {
+            'Process Cpu Utilization': '1.02',
+            'System Cpu Utilization': '0.68'
+        }
+        statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, extra_info)
+        time_range_summary = statistic_data.time_range_summary
+        event_summary = statistic_data.event_summary
+        self.assertEqual(event_summary.items['conv2d'].cpu_time, 0)
+        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0)
+        self.assertEqual(
+            event_summary.userdefined_items['Communication Time'].general_gpu_time, 0)
+        for sort_key in [
+                profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax,
+                profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg,
+                profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax,
+                profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg
+        ]:
+            print(
+                profiler.profiler_statistic._build_table(
+                    statistic_data,
+                    sorted_by=sort_key,
+                    op_detail=True,
+                    thread_sep=False,
+                    time_unit='ms'))


if __name__ == '__main__':
    unittest.main()
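The module keeps its unittest.main() entry point, so the new coverage case can be run in isolation. A small sketch of doing that through the standard unittest loader (assuming the unittests directory is on the import path):

    import unittest
    import test_profiler_statistic  # the module shown in the diff above

    # Load and run only the newly added case.
    suite = unittest.TestLoader().loadTestsFromName(
        'TestProfilerStatistic.test_statistic_case3', test_profiler_statistic)
    unittest.TextTestRunner(verbosity=2).run(suite)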
python/paddle/profiler/profiler_statistic.py
@@ -78,15 +78,19 @@ class HostStatisticNode:
        self.self_gpu_time = 0
        self.general_gpu_time = 0  # besides kernel, include time of gpu events like memcpy and memset
        self.self_general_gpu_time = 0
+       self.is_terminal_operator_node = True

    def cal_statistic(self):
        for child in self.children_node:
            child.cal_statistic()
+           if child.is_terminal_operator_node == False:
+               self.is_terminal_operator_node = False
        for rt in self.runtime_node:
            rt.cal_statistic()
        self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns
        for child in self.children_node:
+           if child.type == TracerEventType.Operator:
+               self.is_terminal_operator_node = False
            self.gpu_time += child.gpu_time
            self.general_gpu_time += child.general_gpu_time
            self.self_cpu_time -= (child.end_ns - child.start_ns)
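The is_terminal_operator_node flag added above marks a node as a leaf from the operator point of view: it is cleared when any descendant is an Operator event, and add_operator_item (further down in this file) now skips non-terminal operator nodes, so nested operators are counted once. A self-contained toy illustration of the flag propagation (not Paddle code; the class and the string event type are simplified stand-ins):

    class ToyNode:
        def __init__(self, name, type_, children=None):
            self.name = name
            self.type = type_
            self.children = children or []
            self.is_terminal_operator_node = True

        def cal(self):
            # First pass: inherit non-terminal status from descendants.
            for child in self.children:
                child.cal()
                if not child.is_terminal_operator_node:
                    self.is_terminal_operator_node = False
            # Second pass: a direct Operator child also makes this node non-terminal.
            for child in self.children:
                if child.type == 'Operator':
                    self.is_terminal_operator_node = False

    inner = ToyNode('matmul', 'Operator')
    outer = ToyNode('fused_op', 'Operator', [inner])
    outer.cal()
    print(inner.is_terminal_operator_node, outer.is_terminal_operator_node)  # True False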
@@ -419,10 +423,10 @@ class EventSummary:
            for runtimenode in node.runtime_node:
                for devicenode in runtimenode.device_node:
-                   if devicenode.name not in self.devices:
-                       self.devices[devicenode.name] = EventSummary.DeviceItem(devicenode.name)
-                   self.devices[devicenode.name].add_item(devicenode)
+                   name = devicenode.name
+                   if name not in self.devices:
+                       self.devices[name] = EventSummary.DeviceItem(name)
+                   self.devices[name].add_item(devicenode)

    class GeneralItem:
        def __init__(self, name):
@@ -489,6 +493,7 @@ class EventSummary:
            dict)  # for userdefined summary
        self.model_perspective_items = {}  # for model summary
        self.memory_manipulation_items = {}  # for memory manipulation summary
+       self.kernel_items = {}  # for kernel summary

    def parse(self, nodetrees):
        r"""
@@ -508,6 +513,7 @@ class EventSummary:
                        self.add_memory_manipulation_item(host_statistic_node)
                    else:
                        self.add_userdefined_item(host_statistic_node)
+           self.add_kernel_item(host_statistic_nodes[0])
        for threadid, root_statistic_node in node_statistic_trees.items():
            deque = collections.deque()
@@ -525,11 +531,7 @@ class EventSummary:
                        deque.append(child)

    def add_operator_item(self, operator_node):
-       have_inner = False
-       for child in operator_node.children_node:
-           if child.type == TracerEventType.OperatorInner:
-               have_inner = True
-       if have_inner == False:
+       if operator_node.is_terminal_operator_node == False:
            return
        if operator_node.name not in self.items:
            self.items[operator_node.name] = EventSummary.OperatorItem(
@@ -585,6 +587,15 @@ class EventSummary:
            self.model_perspective_items[name] = EventSummary.GeneralItem(name)
        self.model_perspective_items[name].add_item(model_perspective_node)

+   def add_kernel_item(self, root_node):
+       device_nodes = get_device_nodes(root_node)
+       for device_node in device_nodes:
+           if device_node.type == TracerEventType.Kernel:
+               name = device_node.name
+               if name not in self.kernel_items:
+                   self.kernel_items[name] = EventSummary.DeviceItem(name)
+               self.kernel_items[name].add_item(device_node)


class StatisticData:
    r"""
@@ -752,6 +763,9 @@ def _build_table(statistic_data,
                cpu_call_times[event_type] = statistic_data.event_summary.model_perspective_items[event_type_name].call
                cpu_type_time[event_type] = statistic_data.event_summary.model_perspective_items[event_type_name].cpu_time

    gpu_time_range = collections.defaultdict(list)
    for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
@@ -800,7 +814,6 @@ def _build_table(statistic_data,
        append(
            "Note:\nIn this table, We sum up all collected events in terms of event type.\n"
            "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n"
-           "The time with ratio 100% is the base time for calculating ratio.\n"
            "Events with different types may overlap or inclusion, e.g. Operator includes OperatorInner, so the sum of ratios is not 100%.\n"
            "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n"
            "Example:\n"
@@ -820,13 +833,18 @@ def _build_table(statistic_data,
        all_row_values = []
        accmulation_time = 0
        gpu_accmulation_time = 0
-       gpu_total_time = 0
+       gpu_total_time = statistic_data.event_summary.model_perspective_items['ProfileStep'].general_gpu_time
        for name in ['ProfileStep', 'Dataloader', 'Forward', 'Backward', 'Optimization']:
            if name in model_perspective_items:
                item = model_perspective_items[name]
+               if gpu_total_time == 0:
+                   gpu_ratio = 0
+               else:
+                   gpu_ratio = float(item.general_gpu_time) / gpu_total_time
                name = '{}'.format(name) if 'ProfileStep' in name else ' {}'.format(name)
                row_values = [
@@ -850,17 +868,19 @@ def _build_table(statistic_data,
                        format_time(item.max_gpu_time, unit=time_unit),
                        format_time(item.min_gpu_time, unit=time_unit),
-                       format_ratio(float(item.gpu_time) / total_time))
+                       format_ratio(gpu_ratio))
                ]
                all_row_values.append(row_values)
                if 'ProfileStep' not in name:
                    accmulation_time += item.cpu_time
-                   gpu_accmulation_time += item.gpu_time
-               else:
-                   gpu_total_time = item.gpu_time
+                   gpu_accmulation_time += item.general_gpu_time
        other_time = total_time - accmulation_time
        other_gpu_time = gpu_total_time - gpu_accmulation_time
+       if gpu_total_time == 0:
+           gpu_ratio = 0
+       else:
+           gpu_ratio = float(other_gpu_time) / gpu_total_time
        row_values = [
            ' Others', '-',
            '{} / - / - / - / {}'.format(
                format_time(
@@ -869,7 +889,7 @@ def _build_table(statistic_data,
            '{} / - / - / - / {}'.format(
                format_time(other_gpu_time, unit=time_unit),
-               format_ratio(float(other_gpu_time) / total_time))
+               format_ratio(gpu_ratio))
        ]
        all_row_values.append(row_values)
        # Calculate the column width
@@ -913,7 +933,6 @@ def _build_table(statistic_data,
        append(
            "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n"
            "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n"
-           "The time with ratio 100% is the base time for calculating ratio.\n"
        )
        append('-' * line_length)
        append('')
@@ -981,7 +1000,6 @@ def _build_table(statistic_data,
            "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n"
            "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n"
            "Overlap time: Communication time intersects with computation time.\n"
-           "The time with ratio 100% is the base time for calculating ratio.\n"
            "Example:\n"
            "Communication:\n"
            "  CPU: |_________________|\n"
@@ -1040,8 +1058,22 @@ def _build_table(statistic_data,
        elif sorted_by == SortedKeys.GPUMin:
            sorted_items = sorted(
                items.items(), key=lambda x: x[1].min_general_gpu_time)
+       total_op_cpu_time = 0
+       total_op_gpu_time = 0
+       for name, item in sorted_items:
+           total_op_cpu_time += item.cpu_time
+           total_op_gpu_time += item.general_gpu_time
        for name, item in sorted_items:
+           if total_op_cpu_time == 0:
+               cpu_ratio = 0
+           else:
+               cpu_ratio = float(item.cpu_time) / total_op_cpu_time
+           if total_op_gpu_time == 0:
+               gpu_ratio = 0
+           else:
+               gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time
            row_values = [
                name, item.call, '{} / {} / {} / {} / {}'.format(
                    format_time(
@@ -1052,7 +1084,7 @@ def _build_table(statistic_data,
                    format_time(item.max_cpu_time, unit=time_unit),
                    format_time(item.min_cpu_time, unit=time_unit),
-                   format_ratio(float(item.cpu_time) / total_time)),
+                   format_ratio(cpu_ratio)),
                '{} / {} / {} / {} / {}'.format(
                    format_time(item.general_gpu_time, unit=time_unit),
@@ -1062,13 +1094,22 @@ def _build_table(statistic_data,
                    format_time(item.max_general_gpu_time, unit=time_unit),
                    format_time(item.min_general_gpu_time, unit=time_unit),
-                   format_ratio(float(item.general_gpu_time) / total_time))
+                   format_ratio(gpu_ratio))
            ]
            all_row_values.append(row_values)
            if op_detail:
                for innerop_name, innerop_node in item.operator_inners.items():
+                   if item.cpu_time == 0:
+                       cpu_ratio = 0
+                   else:
+                       cpu_ratio = float(innerop_node.cpu_time) / item.cpu_time
+                   if item.general_gpu_time == 0:
+                       gpu_ratio = 0
+                   else:
+                       gpu_ratio = float(innerop_node.general_gpu_time) / item.general_gpu_time
                    if len(innerop_name) + 2 > name_column_width:
                        innerop_name = innerop_name[:name_column_width - 5]
                        innerop_name += "..."
@@ -1083,8 +1124,7 @@ def _build_table(statistic_data,
                            format_time(innerop_node.max_cpu_time, unit=time_unit),
                            format_time(innerop_node.min_cpu_time, unit=time_unit),
-                           format_ratio(float(innerop_node.cpu_time) / total_time)),
+                           format_ratio(cpu_ratio)),
                        '{} / {} / {} / {} / {}'.format(
                            format_time(innerop_node.general_gpu_time,
@@ -1098,13 +1138,17 @@ def _build_table(statistic_data,
                            format_time(innerop_node.min_general_gpu_time, unit=time_unit),
-                           format_ratio(float(innerop_node.general_gpu_time) / total_time))
+                           format_ratio(gpu_ratio))
                    ]
                    all_row_values.append(row_values)
                    for device_node_name, device_node in innerop_node.devices.items():
+                       if innerop_node.general_gpu_time == 0:
+                           gpu_ratio = 0
+                       else:
+                           gpu_ratio = float(device_node.gpu_time) / innerop_node.general_gpu_time
                        if len(device_node_name) + 4 > name_column_width:
                            device_node_name = device_node_name[:name_column_width
@@ -1125,12 +1169,15 @@ def _build_table(statistic_data,
                            format_time(device_node.min_gpu_time, unit=time_unit),
-                           format_ratio(float(device_node.gpu_time) / total_time))
+                           format_ratio(gpu_ratio))
                    ]
                    all_row_values.append(row_values)
            for device_node_name, device_node in item.devices.items():
+               if item.general_gpu_time == 0:
+                   gpu_ratio = 0
+               else:
+                   gpu_ratio = float(device_node.gpu_time) / item.general_gpu_time
                if len(device_node_name) + 2 > name_column_width:
                    device_node_name = device_node_name[:name_column_width
@@ -1148,8 +1195,7 @@ def _build_table(statistic_data,
                        format_time(device_node.max_gpu_time, unit=time_unit),
                        format_time(device_node.min_gpu_time, unit=time_unit),
-                       format_ratio(float(device_node.gpu_time) / total_time))
+                       format_ratio(gpu_ratio))
                ]
                all_row_values.append(row_values)
        # Calculate the column width
@@ -1197,11 +1243,106 @@ def _build_table(statistic_data,
        append('')
        append('')

+   ###### Print Kernel Summary Report ######
+   if statistic_data.event_summary.kernel_items:
+       all_row_values = []
+       kernel_items = statistic_data.event_summary.kernel_items
+       if sorted_by == SortedKeys.GPUAvg:
+           sorted_items = sorted(
+               kernel_items.items(), key=lambda x: x[1].avg_gpu_time, reverse=True)
+       elif sorted_by == SortedKeys.GPUMax:
+           sorted_items = sorted(
+               kernel_items.items(), key=lambda x: x[1].max_gpu_time, reverse=True)
+       elif sorted_by == SortedKeys.GPUMin:
+           sorted_items = sorted(
+               kernel_items.items(), key=lambda x: x[1].min_gpu_time)
+       else:
+           sorted_items = sorted(
+               kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+       total_kernel_gpu_time = 0
+       for name, item in sorted_items:
+           total_kernel_gpu_time += item.gpu_time
+       for name, item in sorted_items:
+           if total_kernel_gpu_time == 0:
+               gpu_ratio = 0
+           else:
+               gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time
+           row_values = [
+               name,
+               item.call,
+               '{} / {} / {} / {} / {}'.format(
+                   format_time(item.gpu_time, unit=time_unit),
+                   format_time(item.avg_gpu_time, unit=time_unit),
+                   format_time(item.max_gpu_time, unit=time_unit),
+                   format_time(item.min_gpu_time, unit=time_unit),
+                   format_ratio(gpu_ratio)),
+           ]
+           all_row_values.append(row_values)
+       headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)']
+       # Calculate the column width
+       name_column_width = 90
+       calltime_width = 6
+       gpu_data_description_width = 40
+       for row_values in all_row_values:
+           if isinstance(row_values[1], int) and len(str(row_values[1])) > calltime_width:
+               calltime_width = len(str(row_values[1]))
+           if len(row_values[2]) > gpu_data_description_width:
+               gpu_data_description_width = len(row_values[2])
+       row_format_list = [""]
+       header_sep_list = [""]
+       line_length_list = [-SPACING_SIZE]
+       add_column(name_column_width)
+       add_column(calltime_width)
+       add_column(gpu_data_description_width)
+       row_format = row_format_list[0]
+       header_sep = header_sep_list[0]
+       line_length = line_length_list[0]
+       # construct table string
+       append(add_title(line_length, "Kernel Summary"))
+       append('Time unit: {}'.format(time_unit))
+       append(header_sep)
+       append(row_format.format(*headers))
+       append(header_sep)
+       for row_values in all_row_values:
+           indx = row_values[0].find('(')
+           if indx != -1:
+               name = row_values[0][:indx]
+           else:
+               name = row_values[0]
+           if len(name) > name_column_width:
+               row_values[0] = name[:name_column_width - 3] + '...'
+           else:
+               row_values[0] = name
+           append(row_format.format(*row_values))
+       append(header_sep)
+       append('')
+       append('')

    ###### Print Memory Manipulation Summary Report ######
    if statistic_data.event_summary.memory_manipulation_items:
        all_row_values = []
        memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
+       gpu_total_time = statistic_data.event_summary.model_perspective_items['ProfileStep'].general_gpu_time
        for name, item in memory_manipulation_items.items():
+           if gpu_total_time == 0:
+               gpu_ratio = 0
+           else:
+               gpu_ratio = float(item.general_gpu_time) / gpu_total_time
            row_values = [
                name,
                item.call,
@@ -1224,7 +1365,7 @@ def _build_table(statistic_data,
                    format_time(item.max_general_gpu_time, unit=time_unit),
                    format_time(item.min_general_gpu_time, unit=time_unit),
-                   format_ratio(float(item.general_gpu_time) / total_time)),
+                   format_ratio(gpu_ratio)),
            ]
            all_row_values.append(row_values)
@@ -1274,6 +1415,8 @@ def _build_table(statistic_data,
    ###### Print UserDefined Summary Report ######
    if statistic_data.event_summary.userdefined_items:
        all_row_values = []
+       gpu_total_time = statistic_data.event_summary.model_perspective_items['ProfileStep'].general_gpu_time
        if thread_sep == True:
            userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
        else:
@@ -1319,6 +1462,10 @@ def _build_table(statistic_data,
                items.items(), key=lambda x: x[1].min_general_gpu_time)
        for name, item in sorted_items:
+           if gpu_total_time == 0:
+               gpu_ratio = 0
+           else:
+               gpu_ratio = float(item.general_gpu_time) / gpu_total_time
            row_values = [
                name,
                item.call,
@@ -1341,8 +1488,7 @@ def _build_table(statistic_data,
                    format_time(item.max_general_gpu_time, unit=time_unit),
                    format_time(item.min_general_gpu_time, unit=time_unit),
-                   format_ratio(float(item.general_gpu_time) / total_time)),
+                   format_ratio(gpu_ratio)),
            ]
            all_row_values.append(row_values)
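For context on how the new Kernel Summary and the reworked ratio columns reach users: the tables built by _build_table above are rendered through the profiler's summary call. A rough usage sketch, assuming the public paddle.profiler API of this Paddle release (target list, scheduler tuple, and the training loop body are illustrative placeholders):

    import paddle.profiler as profiler

    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
        scheduler=(2, 5))
    prof.start()
    for step in range(10):
        # run one training step here
        prof.step()
    prof.stop()
    # sorted_by / op_detail / thread_sep / time_unit flow into _build_table;
    # with this commit the printed report also contains a Kernel Summary section.
    prof.summary(sorted_by=profiler.SortedKeys.GPUTotal,
                 op_detail=True,
                 thread_sep=False,
                 time_unit='ms')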