Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
771811dc
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
771811dc
编写于
12月 01, 2022
作者:
C
Chitsing KUI
提交者:
GitHub
12月 01, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[FLOPS] add flops for layer (#48447)
* add flops for layer * hide api * add unitest
上级
758fccfe
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
175 addition
and
79 deletion
+175
-79
python/paddle/fluid/tests/unittests/test_newprofiler.py
python/paddle/fluid/tests/unittests/test_newprofiler.py
+1
-0
python/paddle/profiler/profiler.py
python/paddle/profiler/profiler.py
+20
-0
python/paddle/profiler/profiler_statistic.py
python/paddle/profiler/profiler_statistic.py
+151
-77
python/paddle/utils/flops.py
python/paddle/utils/flops.py
+3
-2
未找到文件。
python/paddle/fluid/tests/unittests/test_newprofiler.py
浏览文件 @
771811dc
...
...
@@ -87,6 +87,7 @@ class TestProfiler(unittest.TestCase):
targets
=
[
profiler
.
ProfilerTarget
.
CPU
],
scheduler
=
lambda
x
:
profiler
.
ProfilerState
.
RECORD_AND_RETURN
,
on_trace_ready
=
my_trace_back
,
with_flops
=
True
,
)
as
prof
:
for
i
in
range
(
2
):
y
=
x
/
2.0
...
...
python/paddle/profiler/profiler.py
浏览文件 @
771811dc
...
...
@@ -31,6 +31,14 @@ from paddle.fluid.core import (
enable_memory_recorder
,
enable_op_info_recorder
,
)
from
.utils
import
RecordEvent
,
wrap_optimizers
from
.profiler_statistic
import
(
StatisticData
,
_build_table
,
SortedKeys
,
gen_layer_flops
,
)
from
paddle.profiler
import
utils
from
.profiler_statistic
import
SortedKeys
,
StatisticData
,
_build_table
...
...
@@ -883,6 +891,18 @@ class Profiler:
)
)
if
self
.
with_flops
:
self
.
_print_flops
()
def _print_flops(self, repeat=1):
    r'''
    Print the layer-level FLOPs report between two 100-column banner lines.

    When ``self.with_flops`` is false, a single error line is printed
    instead and nothing else happens.

    Args:
        repeat: forwarded unchanged to ``gen_layer_flops``.
    '''
    if self.with_flops:
        width, fill = 100, "-"
        print(" Flops Profiler Begin ".center(width, fill))
        # The report is generated only after the opening banner is out, so
        # a failure while building it still leaves the banner on screen.
        report = gen_layer_flops(self.profiler_result.get_data(), repeat)
        print(report)
        print("- Flops Profiler End -".center(width, fill))
    else:
        print('ERROR: with_flops disabled.')
def
get_profiler
(
config_path
):
try
:
...
...
python/paddle/profiler/profiler_statistic.py
浏览文件 @
771811dc
...
...
@@ -78,6 +78,16 @@ class SortedKeys(Enum):
GPUMin
=
7
def _nodename2opname(name):
    r'''
    Turn a static host node name into an operator name.

    Every occurrence of the markers ' compute', ' dygraph' and
    ' pybind_imperative_func' (each including its leading space) is removed
    from ``name``; the remaining text is returned.
    '''
    stripped = name
    for marker in (' compute', ' dygraph', ' pybind_imperative_func'):
        stripped = stripped.replace(marker, '')
    return stripped
class
HostStatisticNode
:
r
'''
Wrap original node for calculating statistic metrics.
...
...
@@ -98,10 +108,7 @@ class HostStatisticNode:
def
cal_flops
(
self
):
if
self
.
hostnode
.
type
==
TracerEventType
.
Operator
:
if
hasattr
(
self
.
hostnode
,
'input_shapes'
):
op_name
=
self
.
hostnode
.
name
op_name
=
op_name
.
replace
(
' compute'
,
''
)
op_name
=
op_name
.
replace
(
' dygraph'
,
''
)
op_name
=
op_name
.
replace
(
' pybind_imperative_func'
,
''
)
op_name
=
_nodename2opname
(
self
.
hostnode
.
name
)
self
.
flops
=
flops
(
op_name
,
self
.
hostnode
.
input_shapes
,
...
...
@@ -111,6 +118,7 @@ class HostStatisticNode:
def
cal_statistic
(
self
):
self
.
cpu_time
=
self
.
hostnode
.
end_ns
-
self
.
hostnode
.
start_ns
self
.
self_cpu_time
=
self
.
cpu_time
self
.
cal_flops
()
for
child
in
self
.
children_node
:
child
.
cal_flops
()
child
.
cal_statistic
()
...
...
@@ -177,6 +185,117 @@ def get_device_nodes(hostnode):
return
device_nodes
def _build_layer_from_tree(nodetrees):
    r'''
    Convert each root of ``nodetrees`` into a nested layer structure.

    The result holds one entry per ``nodetrees.items()`` pair, in iteration
    order. Within an entry:
      * an Operator node becomes a ``HostStatisticNode`` leaf;
      * a Forward node becomes ``[HostStatisticNode, children]`` whose
        ``flops`` is overwritten with the sum gathered from its children;
      * any other node is replaced by the list of its non-empty children;
      * nodes whose name contains "GradNode", and Backward / Optimization
        nodes, are dropped together with everything below them.
    '''

    def build_layer(node, depth=0):
        # ``depth`` is only threaded through the recursion; nothing reads it.
        if "GradNode" in node.name or node.type in (
            TracerEventType.Backward,
            TracerEventType.Optimization,
        ):
            return [], 0

        if node.type == TracerEventType.Operator:
            wrapped = HostStatisticNode(node)
            wrapped.cal_statistic()
            return wrapped, wrapped.flops

        sublayers = []
        total_flops = 0
        for child in node.children_node:
            sub, sub_flops = build_layer(child, depth + 1)
            if not sub:
                # Empty sub-results contribute neither structure nor FLOPs.
                continue
            total_flops += sub_flops
            sublayers.append(sub)

        if node.type == TracerEventType.Forward:
            wrapped = HostStatisticNode(node)
            wrapped.cal_statistic()
            # A layer's FLOPs are the aggregate of what was kept below it.
            wrapped.flops = total_flops
            return [wrapped, sublayers], total_flops

        return sublayers, total_flops

    return [build_layer(root)[0] for _, root in nodetrees.items()]
def _format_large_number(n, precision=2):
    r'''
    Format ``n`` with a T / G / M / K suffix (powers of 1000).

    The largest unit whose threshold the magnitude of ``n`` reaches is used;
    values below 1e3 in magnitude are returned without a suffix.

    Args:
        n: number to format.
        precision: number of digits passed to ``round``.

    Returns:
        str such as "1.5 T", "2.5 M", "-5.0 G" or "999".
    '''
    # Compare on the magnitude: with the raw value, ``n // unit > 0`` is
    # never true for negative ``n``, so negatives were printed unscaled.
    magnitude = abs(n)
    for unit, suffix in ((1e12, "T"), (1e9, "G"), (1e6, "M"), (1e3, "K")):
        if magnitude // unit > 0:
            return "{} {}".format(round(n / unit, precision), suffix)
    return "{}".format(round(n, precision))
def _format_time(n, precision=2):
    r'''
    Format a duration given in nanoseconds using s / ms / us / ns.

    The largest unit whose threshold the magnitude of ``n`` reaches is used;
    values below 1e3 in magnitude are reported in ns.

    Args:
        n: duration in nanoseconds.
        precision: number of digits passed to ``round``.

    Returns:
        str such as "1.5 s", "2.5 ms", "1.23 us" or "999 ns".
    '''
    # Compare on the magnitude: with the raw value, ``n // unit > 0`` is
    # never true for negative ``n``, so negatives were always printed in ns.
    magnitude = abs(n)
    for unit, suffix in ((1e9, "s"), (1e6, "ms"), (1e3, "us")):
        if magnitude // unit > 0:
            return "{} {}".format(round(n / unit, precision), suffix)
    return "{} ns".format(round(n, precision))
def _gen_layer_flops(node, repeat=1):
    r'''
    Render a layer tree as indented text, one line per Forward/Operator node.

    Each line has the form
    "<indent><name> latency: <time>, FLOPs: <count>, FLOPS: <rate>".

    Args:
        node: list of layer trees. Element 0 is skipped and every later
            element is rendered.
            NOTE(review): the reason for skipping the first element is not
            visible in this function; confirm against the caller.
        repeat: rendering stops once the name of the first rendered line
            has shown up again at the base depth ``repeat`` times.

    Returns:
        str: the concatenated report lines (empty when nothing was rendered).
    '''
    ret = []  # rendered lines, in output order
    offset = []  # holds the depth of the first rendered node (base indent)
    loop = []  # one entry per detected re-occurrence of the first line

    def print_layer_tree(node, depth=0):
        if isinstance(node, list):
            for n in node:
                print_layer_tree(n, depth + 1)

        elif node.type in [TracerEventType.Forward, TracerEventType.Operator]:
            if len(offset) == 0:
                offset.append(depth)

            name = _nodename2opname(node.name)

            if (
                depth == offset[-1]
                and len(ret) > 0
                and ret[0].startswith(name)
            ):
                # repeat begin: the first reported name is back at base depth
                loop.append(1)

            if len(loop) >= repeat:
                # Enough repetitions seen; nothing further is emitted. The
                # return value was never consumed, so a bare return is used.
                return

            align = " " * (depth - offset[-1])
            tm = _format_time(node.cpu_time)
            flops_n = _format_large_number(node.flops)
            # cpu_time is in ns, hence the 1e9 factor. A zero-duration node
            # would raise ZeroDivisionError and abort the whole report, so
            # its rate is reported as 0.0 instead.
            if node.cpu_time:
                flops_rate = node.flops * 1e9 / node.cpu_time
            else:
                flops_rate = 0.0
            flops_s = _format_large_number(flops_rate)
            ret.append(
                "{}{} latency: {}, FLOPs: {}, FLOPS: {}\n".format(
                    align, name, tm, flops_n, flops_s
                )
            )

    for n in node[1:]:
        print_layer_tree(n)

    return "".join(ret)
def gen_layer_flops(nodetrees, repeat=1):
    r'''
    Produce the textual FLOPs / runtime report per layer and operator.

    The node trees are first converted with ``_build_layer_from_tree`` and
    the result is rendered by ``_gen_layer_flops``.

    Args:
        nodetrees: profiler node trees, passed to ``_build_layer_from_tree``.
        repeat: forwarded unchanged to ``_gen_layer_flops``.

    Returns:
        Whatever ``_gen_layer_flops`` returns for the converted trees.
    '''
    return _gen_layer_flops(_build_layer_from_tree(nodetrees), repeat)
def
wrap_tree
(
nodetrees
):
'''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
...
...
@@ -387,30 +506,7 @@ class EventSummary:
Analyse operator event in profiling data, correlate with its device event.
"""
class
DeviceItem
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
self
.
gpu_time
=
0
self
.
max_gpu_time
=
0
self
.
min_gpu_time
=
float
(
'inf'
)
@
property
def
avg_gpu_time
(
self
):
return
self
.
gpu_time
/
self
.
call
def
add_gpu_time
(
self
,
time
):
if
time
>
self
.
max_gpu_time
:
self
.
max_gpu_time
=
time
if
time
<
self
.
min_gpu_time
:
self
.
min_gpu_time
=
time
self
.
gpu_time
+=
time
def
add_item
(
self
,
node
):
self
.
call
+=
1
self
.
add_gpu_time
(
node
.
end_ns
-
node
.
start_ns
)
class
OperatorItem
:
class
ItemBase
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
...
...
@@ -470,6 +566,15 @@ class EventSummary:
def
add_flops
(
self
,
flops
):
self
.
_flops
+=
flops
def
add_item
(
self
,
node
):
raise
NotImplementedError
class
DeviceItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
call
+=
1
self
.
add_gpu_time
(
node
.
end_ns
-
node
.
start_ns
)
class
OperatorItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
...
...
@@ -491,56 +596,22 @@ class EventSummary:
self
.
devices
[
name
]
=
EventSummary
.
DeviceItem
(
name
)
self
.
devices
[
name
].
add_item
(
devicenode
)
class
GeneralItem
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
self
.
cpu_time
=
0
self
.
max_cpu_time
=
0
self
.
min_cpu_time
=
float
(
'inf'
)
self
.
gpu_time
=
0
self
.
max_gpu_time
=
0
self
.
min_gpu_time
=
float
(
'inf'
)
self
.
general_gpu_time
=
0
self
.
min_general_gpu_time
=
float
(
'inf'
)
self
.
max_general_gpu_time
=
0
@
property
def
avg_cpu_time
(
self
):
return
self
.
cpu_time
/
self
.
call
@
property
def
avg_gpu_time
(
self
):
return
self
.
gpu_time
/
self
.
call
@
property
def
avg_general_gpu_time
(
self
):
return
self
.
general_gpu_time
/
self
.
call
def
add_cpu_time
(
self
,
time
):
if
time
>
self
.
max_cpu_time
:
self
.
max_cpu_time
=
time
if
time
<
self
.
min_cpu_time
:
self
.
min_cpu_time
=
time
self
.
cpu_time
+=
time
def
add_gpu_time
(
self
,
time
):
if
time
>
self
.
max_gpu_time
:
self
.
max_gpu_time
=
time
if
time
<
self
.
min_gpu_time
:
self
.
min_gpu_time
=
time
self
.
gpu_time
+=
time
def
add_general_gpu_time
(
self
,
time
):
if
time
>
self
.
max_general_gpu_time
:
self
.
max_general_gpu_time
=
time
if
time
<
self
.
min_general_gpu_time
:
self
.
min_general_gpu_time
=
time
self
.
general_gpu_time
+=
time
def
add_call
(
self
):
self
.
call
+=
1
class
ForwardItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
self
.
add_gpu_time
(
node
.
gpu_time
)
self
.
add_general_gpu_time
(
node
.
general_gpu_time
)
self
.
add_flops
(
node
.
flops
)
for
child
in
node
.
children_node
:
if
child
.
type
!=
TracerEventType
.
Operator
:
if
child
.
name
not
in
self
.
operator_inners
:
self
.
operator_inners
[
child
.
name
]
=
EventSummary
.
OperatorItem
(
child
.
name
)
self
.
operator_inners
[
child
.
name
].
add_item
(
child
)
class
GeneralItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
...
...
@@ -613,6 +684,9 @@ class EventSummary:
self
.
add_model_perspective_item
(
child
)
deque
.
append
(
child
)
def
add_forward_item
(
self
,
operator_node
):
pass
def
add_operator_item
(
self
,
operator_node
):
if
operator_node
.
name
not
in
self
.
items
:
self
.
items
[
operator_node
.
name
]
=
EventSummary
.
OperatorItem
(
...
...
python/paddle/utils/flops.py
浏览文件 @
771811dc
...
...
@@ -164,8 +164,9 @@ def _matmul_flops(input_shapes, attrs):
shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1)) ... dim_n_1, dim_m]
equation: flops = 2 * numel(output) * dim_n
"""
x_shape
=
input_shapes
.
get
(
"X"
)[
0
]
y_shape
=
input_shapes
.
get
(
"Y"
)[
0
]
x_shape
=
input_shapes
.
get
(
"X"
,
input_shapes
.
get
(
"x"
,
[[
0
]]))[
0
]
y_shape
=
input_shapes
.
get
(
"Y"
,
input_shapes
.
get
(
"y"
,
[[
0
]]))[
0
]
if
attrs
.
get
(
'transpose_X'
)
or
attrs
.
get
(
'transpose_x'
):
x_shape
[
-
1
],
x_shape
[
-
2
]
=
x_shape
[
-
2
],
x_shape
[
-
1
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录