Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
771811dc
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
771811dc
编写于
12月 01, 2022
作者:
C
Chitsing KUI
提交者:
GitHub
12月 01, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[FLOPS] add flops for layer (#48447)
* add flops for layer * hide api * add unitest
上级
758fccfe
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
175 addition
and
79 deletion
+175
-79
python/paddle/fluid/tests/unittests/test_newprofiler.py
python/paddle/fluid/tests/unittests/test_newprofiler.py
+1
-0
python/paddle/profiler/profiler.py
python/paddle/profiler/profiler.py
+20
-0
python/paddle/profiler/profiler_statistic.py
python/paddle/profiler/profiler_statistic.py
+151
-77
python/paddle/utils/flops.py
python/paddle/utils/flops.py
+3
-2
未找到文件。
python/paddle/fluid/tests/unittests/test_newprofiler.py
浏览文件 @
771811dc
...
...
@@ -87,6 +87,7 @@ class TestProfiler(unittest.TestCase):
targets
=
[
profiler
.
ProfilerTarget
.
CPU
],
scheduler
=
lambda
x
:
profiler
.
ProfilerState
.
RECORD_AND_RETURN
,
on_trace_ready
=
my_trace_back
,
with_flops
=
True
,
)
as
prof
:
for
i
in
range
(
2
):
y
=
x
/
2.0
...
...
python/paddle/profiler/profiler.py
浏览文件 @
771811dc
...
...
@@ -31,6 +31,14 @@ from paddle.fluid.core import (
enable_memory_recorder
,
enable_op_info_recorder
,
)
from
.utils
import
RecordEvent
,
wrap_optimizers
from
.profiler_statistic
import
(
StatisticData
,
_build_table
,
SortedKeys
,
gen_layer_flops
,
)
from
paddle.profiler
import
utils
from
.profiler_statistic
import
SortedKeys
,
StatisticData
,
_build_table
...
...
@@ -883,6 +891,18 @@ class Profiler:
)
)
if
self
.
with_flops
:
self
.
_print_flops
()
def
_print_flops
(
self
,
repeat
=
1
):
if
not
self
.
with_flops
:
print
(
'ERROR: with_flops disabled.'
)
return
print
(
" Flops Profiler Begin "
.
center
(
100
,
"-"
))
print
(
gen_layer_flops
(
self
.
profiler_result
.
get_data
(),
repeat
))
print
(
"- Flops Profiler End -"
.
center
(
100
,
"-"
))
def
get_profiler
(
config_path
):
try
:
...
...
python/paddle/profiler/profiler_statistic.py
浏览文件 @
771811dc
...
...
@@ -78,6 +78,16 @@ class SortedKeys(Enum):
GPUMin
=
7
def
_nodename2opname
(
name
):
r
'''
convert static host node name to operator name
'''
op_name
=
name
.
replace
(
' compute'
,
''
)
op_name
=
op_name
.
replace
(
' dygraph'
,
''
)
op_name
=
op_name
.
replace
(
' pybind_imperative_func'
,
''
)
return
op_name
class
HostStatisticNode
:
r
'''
Wrap original node for calculating statistic metrics.
...
...
@@ -98,10 +108,7 @@ class HostStatisticNode:
def
cal_flops
(
self
):
if
self
.
hostnode
.
type
==
TracerEventType
.
Operator
:
if
hasattr
(
self
.
hostnode
,
'input_shapes'
):
op_name
=
self
.
hostnode
.
name
op_name
=
op_name
.
replace
(
' compute'
,
''
)
op_name
=
op_name
.
replace
(
' dygraph'
,
''
)
op_name
=
op_name
.
replace
(
' pybind_imperative_func'
,
''
)
op_name
=
_nodename2opname
(
self
.
hostnode
.
name
)
self
.
flops
=
flops
(
op_name
,
self
.
hostnode
.
input_shapes
,
...
...
@@ -111,6 +118,7 @@ class HostStatisticNode:
def
cal_statistic
(
self
):
self
.
cpu_time
=
self
.
hostnode
.
end_ns
-
self
.
hostnode
.
start_ns
self
.
self_cpu_time
=
self
.
cpu_time
self
.
cal_flops
()
for
child
in
self
.
children_node
:
child
.
cal_flops
()
child
.
cal_statistic
()
...
...
@@ -177,6 +185,117 @@ def get_device_nodes(hostnode):
return
device_nodes
def
_build_layer_from_tree
(
nodetrees
):
def
build_layer
(
node
,
depth
=
0
):
if
"GradNode"
in
node
.
name
:
return
[],
0
if
node
.
type
in
[
TracerEventType
.
Backward
,
TracerEventType
.
Optimization
,
]:
return
[],
0
if
node
.
type
==
TracerEventType
.
Operator
:
stat_node
=
HostStatisticNode
(
node
)
stat_node
.
cal_statistic
()
return
stat_node
,
stat_node
.
flops
layer
=
[]
nflops
=
0
for
c
in
node
.
children_node
:
l
,
f
=
build_layer
(
c
,
depth
+
1
)
if
l
:
nflops
+=
f
layer
.
append
(
l
)
if
node
.
type
==
TracerEventType
.
Forward
:
stat_node
=
HostStatisticNode
(
node
)
stat_node
.
cal_statistic
()
stat_node
.
flops
=
nflops
return
[
stat_node
,
layer
],
nflops
return
layer
,
nflops
ret
=
[]
for
_
,
rootnode
in
nodetrees
.
items
():
layer
,
_
=
build_layer
(
rootnode
)
ret
.
append
(
layer
)
return
ret
def
_format_large_number
(
n
,
precision
=
2
):
if
n
//
1e12
>
0
:
return
"{} T"
.
format
(
round
(
n
/
1e12
,
precision
))
if
n
//
1e9
>
0
:
return
"{} G"
.
format
(
round
(
n
/
1e9
,
precision
))
if
n
//
1e6
>
0
:
return
"{} M"
.
format
(
round
(
n
/
1e6
,
precision
))
if
n
//
1e3
>
0
:
return
"{} K"
.
format
(
round
(
n
/
1e3
,
precision
))
return
"{}"
.
format
(
round
(
n
,
precision
))
def
_format_time
(
n
,
precision
=
2
):
if
n
//
1e9
>
0
:
return
"{} s"
.
format
(
round
(
n
/
1e9
,
precision
))
if
n
//
1e6
>
0
:
return
"{} ms"
.
format
(
round
(
n
/
1e6
,
precision
))
if
n
//
1e3
>
0
:
return
"{} us"
.
format
(
round
(
n
/
1e3
,
precision
))
return
"{} ns"
.
format
(
round
(
n
,
precision
))
def
_gen_layer_flops
(
node
,
repeat
=
1
):
ret
=
[]
offset
=
[]
loop
=
[]
def
print_layer_tree
(
node
,
depth
=
0
):
if
isinstance
(
node
,
list
):
for
n
in
node
:
print_layer_tree
(
n
,
depth
+
1
)
elif
node
.
type
in
[
TracerEventType
.
Forward
,
TracerEventType
.
Operator
]:
if
len
(
offset
)
==
0
:
offset
.
append
(
depth
)
name
=
_nodename2opname
(
node
.
name
)
if
(
depth
==
offset
[
-
1
]
and
len
(
ret
)
>
0
and
ret
[
0
].
startswith
(
name
)
):
# repeat begin
loop
.
append
(
1
)
if
len
(
loop
)
>=
repeat
:
return
""
.
join
(
ret
)
align
=
" "
*
(
depth
-
offset
[
-
1
])
tm
=
_format_time
(
node
.
cpu_time
)
flops_n
=
_format_large_number
(
node
.
flops
)
flops_s
=
_format_large_number
(
node
.
flops
*
1e9
/
node
.
cpu_time
)
ret
.
append
(
"{}{} latency: {}, FLOPs: {}, FLOPS: {}
\n
"
.
format
(
align
,
name
,
tm
,
flops_n
,
flops_s
)
)
for
n
in
node
[
1
:]:
print_layer_tree
(
n
)
return
""
.
join
(
ret
)
def
gen_layer_flops
(
nodetrees
,
repeat
=
1
):
r
'''
gen_layer_flops generate flops/runtime information depend on layer/operator.
'''
layer_tree
=
_build_layer_from_tree
(
nodetrees
)
return
_gen_layer_flops
(
layer_tree
,
repeat
)
def
wrap_tree
(
nodetrees
):
'''
Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics.
...
...
@@ -387,30 +506,7 @@ class EventSummary:
Analyse operator event in profiling data, correlate with its device event.
"""
class
DeviceItem
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
self
.
gpu_time
=
0
self
.
max_gpu_time
=
0
self
.
min_gpu_time
=
float
(
'inf'
)
@
property
def
avg_gpu_time
(
self
):
return
self
.
gpu_time
/
self
.
call
def
add_gpu_time
(
self
,
time
):
if
time
>
self
.
max_gpu_time
:
self
.
max_gpu_time
=
time
if
time
<
self
.
min_gpu_time
:
self
.
min_gpu_time
=
time
self
.
gpu_time
+=
time
def
add_item
(
self
,
node
):
self
.
call
+=
1
self
.
add_gpu_time
(
node
.
end_ns
-
node
.
start_ns
)
class
OperatorItem
:
class
ItemBase
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
...
...
@@ -470,6 +566,15 @@ class EventSummary:
def
add_flops
(
self
,
flops
):
self
.
_flops
+=
flops
def
add_item
(
self
,
node
):
raise
NotImplementedError
class
DeviceItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
call
+=
1
self
.
add_gpu_time
(
node
.
end_ns
-
node
.
start_ns
)
class
OperatorItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
...
...
@@ -491,56 +596,22 @@ class EventSummary:
self
.
devices
[
name
]
=
EventSummary
.
DeviceItem
(
name
)
self
.
devices
[
name
].
add_item
(
devicenode
)
class
GeneralItem
:
def
__init__
(
self
,
name
):
self
.
name
=
name
self
.
call
=
0
self
.
cpu_time
=
0
self
.
max_cpu_time
=
0
self
.
min_cpu_time
=
float
(
'inf'
)
self
.
gpu_time
=
0
self
.
max_gpu_time
=
0
self
.
min_gpu_time
=
float
(
'inf'
)
self
.
general_gpu_time
=
0
self
.
min_general_gpu_time
=
float
(
'inf'
)
self
.
max_general_gpu_time
=
0
@
property
def
avg_cpu_time
(
self
):
return
self
.
cpu_time
/
self
.
call
@
property
def
avg_gpu_time
(
self
):
return
self
.
gpu_time
/
self
.
call
@
property
def
avg_general_gpu_time
(
self
):
return
self
.
general_gpu_time
/
self
.
call
def
add_cpu_time
(
self
,
time
):
if
time
>
self
.
max_cpu_time
:
self
.
max_cpu_time
=
time
if
time
<
self
.
min_cpu_time
:
self
.
min_cpu_time
=
time
self
.
cpu_time
+=
time
def
add_gpu_time
(
self
,
time
):
if
time
>
self
.
max_gpu_time
:
self
.
max_gpu_time
=
time
if
time
<
self
.
min_gpu_time
:
self
.
min_gpu_time
=
time
self
.
gpu_time
+=
time
def
add_general_gpu_time
(
self
,
time
):
if
time
>
self
.
max_general_gpu_time
:
self
.
max_general_gpu_time
=
time
if
time
<
self
.
min_general_gpu_time
:
self
.
min_general_gpu_time
=
time
self
.
general_gpu_time
+=
time
def
add_call
(
self
):
self
.
call
+=
1
class
ForwardItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
self
.
add_gpu_time
(
node
.
gpu_time
)
self
.
add_general_gpu_time
(
node
.
general_gpu_time
)
self
.
add_flops
(
node
.
flops
)
for
child
in
node
.
children_node
:
if
child
.
type
!=
TracerEventType
.
Operator
:
if
child
.
name
not
in
self
.
operator_inners
:
self
.
operator_inners
[
child
.
name
]
=
EventSummary
.
OperatorItem
(
child
.
name
)
self
.
operator_inners
[
child
.
name
].
add_item
(
child
)
class
GeneralItem
(
ItemBase
):
def
add_item
(
self
,
node
):
self
.
add_call
()
self
.
add_cpu_time
(
node
.
cpu_time
)
...
...
@@ -613,6 +684,9 @@ class EventSummary:
self
.
add_model_perspective_item
(
child
)
deque
.
append
(
child
)
def
add_forward_item
(
self
,
operator_node
):
pass
def
add_operator_item
(
self
,
operator_node
):
if
operator_node
.
name
not
in
self
.
items
:
self
.
items
[
operator_node
.
name
]
=
EventSummary
.
OperatorItem
(
...
...
python/paddle/utils/flops.py
浏览文件 @
771811dc
...
...
@@ -164,8 +164,9 @@ def _matmul_flops(input_shapes, attrs):
shape_of_output = [dim1, dim2 ... max(dim(n-m), odim(n-m)), max(dim(n-m+1), odim(n-m+1)) ... dim_n_1, dim_m]
equation: flops = 2 * numel(output) * dim_n
"""
x_shape
=
input_shapes
.
get
(
"X"
)[
0
]
y_shape
=
input_shapes
.
get
(
"Y"
)[
0
]
x_shape
=
input_shapes
.
get
(
"X"
,
input_shapes
.
get
(
"x"
,
[[
0
]]))[
0
]
y_shape
=
input_shapes
.
get
(
"Y"
,
input_shapes
.
get
(
"y"
,
[[
0
]]))[
0
]
if
attrs
.
get
(
'transpose_X'
)
or
attrs
.
get
(
'transpose_x'
):
x_shape
[
-
1
],
x_shape
[
-
2
]
=
x_shape
[
-
2
],
x_shape
[
-
1
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录