BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle, in sync with the upstream project)
Commit f43af275 (unverified)

Refine statistic table (#41524)

Authored by chenjian on Apr 08, 2022; committed via GitHub on Apr 08, 2022.
Parent: 14dba636
Showing 2 changed files with 205 additions and 114 deletions (+205 −114):

python/paddle/fluid/tests/unittests/test_profiler_statistic.py   +46 −42
python/paddle/profiler/profiler_statistic.py                     +159 −72
python/paddle/fluid/tests/unittests/test_profiler_statistic.py @ f43af275
...
@@ -185,20 +185,22 @@ class TestProfilerStatistic(unittest.TestCase):
                 profiler.TracerEventType.Communication), 5)
         self.assertEqual(len(event_summary.items), 2)
         self.assertEqual(len(event_summary.userdefined_items), 1)
-        self.assertEqual(len(event_summary.model_perspective_items), 3)
+        self.assertEqual(len(event_summary.model_perspective_items), 4)
         self.assertEqual(len(event_summary.memory_manipulation_items), 1)
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
-        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
+        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
         self.assertEqual(
             event_summary.model_perspective_items['Forward'].cpu_time, 100)
         self.assertEqual(
-            event_summary.model_perspective_items['Forward'].gpu_time, 135)
+            event_summary.model_perspective_items['Forward'].general_gpu_time,
+            135)
         self.assertEqual(
-            event_summary.model_perspective_items['Backward'].gpu_time, 0)
+            event_summary.model_perspective_items['Backward'].general_gpu_time,
+            0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
         self.assertEqual(
-            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
...
@@ -226,31 +228,31 @@ class TestProfilerStatistic(unittest.TestCase):
         userdefined_node = HostPythonNode(
             'Communication Time', profiler.TracerEventType.UserDefined, 100,
             110, 1000, 1001)
-        reduce_all_launchkernel0 = HostPythonNode(
+        allreduce_launchkernel0 = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 102, 104,
             1000, 1001)
-        nccl_reduce_all_kernel0 = DevicePythonNode(
-            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 105, 120,
+        nccl_allreduce_kernel0 = DevicePythonNode(
+            'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 105, 120,
             0, 0, 2)
         communication_node = HostPythonNode(
             'Communication', profiler.TracerEventType.Communication, 105, 110,
             1000, 1001)
-        reduce_all_op1 = HostPythonNode('reduce_all_op1',
+        allreduce_op1 = HostPythonNode('allreduce_op1',
             profiler.TracerEventType.Operator, 105, 108, 1000, 1001)
-        reduce_all_op1_infershape = HostPythonNode(
-            'reduce_all_op1::infershape',
-            profiler.TracerEventType.OperatorInner, 105, 106, 1000, 1001)
+        allreduce_op1_infershape = HostPythonNode(
+            'allreduce_op1::infershape',
+            profiler.TracerEventType.OperatorInner, 105, 106, 1000, 1001)
-        reduce_all_launchkernel1 = HostPythonNode(
+        allreduce_launchkernel1 = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 106, 107,
             1000, 1001)
-        nccl_reduce_all_kernel1 = DevicePythonNode(
-            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 130, 150,
+        nccl_allreduce_kernel1 = DevicePythonNode(
+            'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 130, 150,
             0, 0, 2)
         backward_node = HostPythonNode('Gradient Backward',
...
@@ -305,19 +307,19 @@ class TestProfilerStatistic(unittest.TestCase):
             'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200,
             0, 0, 1)
-        reduce_all_node2 = HostPythonNode(
-            'reduce_all', profiler.TracerEventType.Operator, 230, 250, 1000, 1001)
+        allreduce_node2 = HostPythonNode(
+            'allreduce', profiler.TracerEventType.Operator, 230, 250, 1000, 1001)
-        reduce_all_node2_infershape = HostPythonNode(
-            'reduce_all_node2::infershape',
+        allreduce_node2_infershape = HostPythonNode(
+            'allreduce_node2::infershape',
             profiler.TracerEventType.OperatorInner, 231, 232, 1000, 1001)
-        reduce_all_launchkernel2 = HostPythonNode(
+        allreduce_launchkernel2 = HostPythonNode(
             'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 235, 240,
             1000, 1001)
-        nccl_reduce_all_kernel2 = DevicePythonNode(
-            'nccl_reduce_all_kernel', profiler.TracerEventType.Kernel, 250, 280,
+        nccl_allreduce_kernel2 = DevicePythonNode(
+            'nccl_allreduce_kernel', profiler.TracerEventType.Kernel, 250, 280,
             0, 0, 2)
         root_node.children_node.append(profilerstep_node)
...
@@ -329,12 +331,12 @@ class TestProfilerStatistic(unittest.TestCase):
         yolonet_node.children_node.extend(
             [sync_batch_norm_node, userdefined_node])
         userdefined_node.children_node.append(communication_node)
-        userdefined_node.runtime_node.append(reduce_all_launchkernel0)
-        reduce_all_launchkernel0.device_node.append(nccl_reduce_all_kernel0)
-        communication_node.children_node.append(reduce_all_op1)
-        reduce_all_op1.children_node.append(reduce_all_op1_infershape)
-        reduce_all_op1.runtime_node.append(reduce_all_launchkernel1)
-        reduce_all_launchkernel1.device_node.append(nccl_reduce_all_kernel1)
+        userdefined_node.runtime_node.append(allreduce_launchkernel0)
+        allreduce_launchkernel0.device_node.append(nccl_allreduce_kernel0)
+        communication_node.children_node.append(allreduce_op1)
+        allreduce_op1.children_node.append(allreduce_op1_infershape)
+        allreduce_op1.runtime_node.append(allreduce_launchkernel1)
+        allreduce_launchkernel1.device_node.append(nccl_allreduce_kernel1)
         conv2d_node.children_node.extend(
             [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
         conv2d_compute.runtime_node.append(conv2d_launchkernel)
...
@@ -350,10 +352,10 @@ class TestProfilerStatistic(unittest.TestCase):
         sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy)
         sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel)
         sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy)
-        optimization_node.children_node.append(reduce_all_node2)
-        reduce_all_node2.children_node.append(reduce_all_node2_infershape)
-        reduce_all_node2.runtime_node.append(reduce_all_launchkernel2)
-        reduce_all_launchkernel2.device_node.append(nccl_reduce_all_kernel2)
+        optimization_node.children_node.append(allreduce_node2)
+        allreduce_node2.children_node.append(allreduce_node2_infershape)
+        allreduce_node2.runtime_node.append(allreduce_launchkernel2)
+        allreduce_launchkernel2.device_node.append(nccl_allreduce_kernel2)
         thread_tree = {'thread1001': root_node}
         extra_info = {
             'Process Cpu Utilization': '1.02',
...
@@ -415,20 +417,22 @@ class TestProfilerStatistic(unittest.TestCase):
                 distributed_summary.overlap_range), 85)
         self.assertEqual(len(event_summary.items), 4)
         self.assertEqual(len(event_summary.userdefined_items), 1)
-        self.assertEqual(len(event_summary.model_perspective_items), 3)
+        self.assertEqual(len(event_summary.model_perspective_items), 4)
         self.assertEqual(len(event_summary.memory_manipulation_items), 1)
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
-        self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
+        self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25)
         self.assertEqual(
             event_summary.model_perspective_items['Forward'].cpu_time, 100)
         self.assertEqual(
-            event_summary.model_perspective_items['Forward'].gpu_time, 315)
+            event_summary.model_perspective_items['Forward'].general_gpu_time,
+            315)
         self.assertEqual(
-            event_summary.model_perspective_items['Backward'].gpu_time, 0)
+            event_summary.model_perspective_items['Backward'].general_gpu_time,
+            0)
         self.assertEqual(
             event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15)
         self.assertEqual(
-            event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60)
+            event_summary.memory_manipulation_items['AsyncMemcpy'].
+            general_gpu_time, 60)
         print(
             profiler.profiler_statistic._build_table(
                 statistic_data,
...
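The assertions above read per-item cpu_time and the general_gpu_time field that these hunks switch several checks to. Outside the unit test, the same kind of statistic table surfaces through the public profiler API. The sketch below is illustrative only and not part of this commit; Profiler, ProfilerTarget, SortedKeys and summary() are the public paddle.profiler API of this era of Paddle, and the keyword arguments shown are standard summary() options rather than values copied from the truncated _build_table call above.

    import paddle
    import paddle.profiler as profiler

    # Illustrative only: profile a tiny convolution for a few steps and print
    # the statistic table (the report that profiler_statistic builds internally).
    x = paddle.randn([4, 3, 32, 32])
    conv = paddle.nn.Conv2D(3, 8, 3)

    with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU],
                           scheduler=(1, 3)) as prof:
        for _ in range(4):
            y = conv(x)
            prof.step()

    # Standard summary() options: sort key, per-operator detail, per-thread
    # separation, and the time unit used in the printed table.
    prof.summary(sorted_by=profiler.SortedKeys.CPUTotal,
                 op_detail=True,
                 thread_sep=False,
                 time_unit='ms')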
python/paddle/profiler/profiler_statistic.py @ f43af275
This diff is collapsed.