Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5e90f5e1
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5e90f5e1
编写于
1月 08, 2018
作者:
Y
Yibing Liu
提交者:
GitHub
1月 08, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #7043 from kuke/profiler_tool
Add the parsing part for the profiling tool
上级
c62383e9
6416eee7
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
238 addition
and
21 deletion
+238
-21
paddle/platform/profiler.cc
paddle/platform/profiler.cc
+170
-14
paddle/platform/profiler.h
paddle/platform/profiler.h
+30
-2
paddle/platform/profiler_test.cc
paddle/platform/profiler_test.cc
+38
-5
未找到文件。
paddle/platform/profiler.cc
浏览文件 @
5e90f5e1
...
...
@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/platform/profiler.h"
#include <iomanip>
#include <map>
#include "glog/logging.h"
namespace
paddle
{
namespace
platform
{
// The profiler state, the initial value is ProfilerState::kDisabled
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
// To record which timer the profiler used, CUDA or CPU.
static
std
::
string
g_profiler_place
=
""
;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static
thread_local
int32_t
g_thread_id
;
...
...
@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
Event
::
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
DeviceContext
*
dev_ctx
)
:
kind_
(
kind
),
name_
(
std
::
move
(
name
)),
thread_id_
(
thread_id
),
has_cuda_
(
false
)
{
:
kind_
(
kind
),
name_
(
name
),
thread_id_
(
thread_id
),
has_cuda_
(
false
)
{
#ifdef PADDLE_WITH_CUDA
auto
*
cuda_dev_ctx
=
static_cast
<
const
CUDADeviceContext
*>
(
dev_ctx
);
if
(
cuda_dev_ctx
)
{
...
...
@@ -72,11 +74,11 @@ std::string Event::kind() const {
PADDLE_THROW
(
"Unknown EventKind."
);
}
double
Event
::
CpuElapsed
U
s
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000.0
);
double
Event
::
CpuElapsed
M
s
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000
000
.0
);
}
double
Event
::
CudaElapsed
U
s
(
const
Event
&
e
)
const
{
double
Event
::
CudaElapsed
M
s
(
const
Event
&
e
)
const
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
e
.
has_cuda
()
&&
has_cuda
());
PADDLE_ENFORCE
(
e
.
device
()
==
device
());
...
...
@@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const {
PADDLE_ENFORCE
(
cudaEventSynchronize
(
e
.
event
()));
float
ms
;
PADDLE_ENFORCE
(
cudaEventElapsedTime
(
&
ms
,
event_
,
e
.
event
()));
return
ms
*
1000.0
;
return
ms
;
#else
PADDLE_THROW
(
"CUDA is not enabled"
);
#endif
...
...
@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
}
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kMark
,
std
::
move
(
name
),
g_thread_id
,
dev_ctx
);
GetEventList
().
Record
(
EventKind
::
kMark
,
name
,
g_thread_id
,
dev_ctx
);
}
void
PushEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kPushRange
,
name
,
g_thread_id
,
dev_ctx
);
}
void
PopEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kPopRange
,
name
,
g_thread_id
,
dev_ctx
);
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
GetEventList
().
Record
(
EventKind
::
kPushRange
,
std
::
move
(
name
),
g_thread_id
,
dev_ctx_
);
name_
=
name
;
PushEvent
(
name_
,
dev_ctx_
);
}
RecordEvent
::~
RecordEvent
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
GetEventList
().
Record
(
EventKind
::
kPopRange
,
std
::
string
(),
g_thread_id
,
dev_ctx_
);
PopEvent
(
name_
,
dev_ctx_
);
}
void
EnableProfiler
(
ProfilerState
state
)
{
...
...
@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling "
,
"EnableProfiler."
);
g_state
=
state
;
g_profiler_place
=
(
g_state
==
ProfilerState
::
kCUDA
)
?
"CUDA"
:
"CPU"
;
#ifdef PADDLE_WITH_CUDA
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
// Generate some dummy evenets first to reduce the startup overhead.
...
...
@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
return
result
;
}
void
ParseEvents
(
std
::
vector
<
std
::
vector
<
Event
>>&
events
,
EventSortingKey
sorted_by
)
{
if
(
g_profiler_place
==
""
)
return
;
std
::
string
sorted_domain
;
std
::
function
<
bool
(
EventItem
&
,
EventItem
&
)
>
sorted_func
;
switch
(
sorted_by
)
{
case
EventSortingKey
::
kCalls
:
sorted_domain
=
"number of calls"
;
sorted_func
=
[](
EventItem
&
a
,
EventItem
&
b
)
{
return
a
.
calls
>
b
.
calls
;
};
break
;
case
EventSortingKey
::
kTotal
:
sorted_domain
=
"total time"
;
sorted_func
=
[](
EventItem
&
a
,
EventItem
&
b
)
{
return
a
.
total_time
>
b
.
total_time
;
};
break
;
case
EventSortingKey
::
kMin
:
sorted_domain
=
"minimum time"
;
sorted_func
=
[](
EventItem
&
a
,
EventItem
&
b
)
{
return
a
.
min_time
>
b
.
min_time
;
};
break
;
case
EventSortingKey
::
kMax
:
sorted_domain
=
"maximum time"
;
sorted_func
=
[](
EventItem
&
a
,
EventItem
&
b
)
{
return
a
.
max_time
>
b
.
max_time
;
};
break
;
case
EventSortingKey
::
kAve
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
EventItem
&
a
,
EventItem
&
b
)
{
return
a
.
ave_time
>
b
.
ave_time
;
};
break
;
default:
sorted_domain
=
"event end time"
;
}
std
::
vector
<
std
::
vector
<
EventItem
>>
events_table
;
size_t
max_name_width
=
0
;
for
(
size_t
i
=
0
;
i
<
events
.
size
();
i
++
)
{
std
::
list
<
Event
>
pushed_events
;
std
::
vector
<
EventItem
>
event_items
;
std
::
unordered_map
<
std
::
string
,
int
>
event_idx
;
for
(
size_t
j
=
0
;
j
<
events
[
i
].
size
();
j
++
)
{
if
(
events
[
i
][
j
].
kind
()
==
"push"
)
{
pushed_events
.
push_back
(
events
[
i
][
j
]);
}
else
if
(
events
[
i
][
j
].
kind
()
==
"pop"
)
{
std
::
list
<
Event
>::
reverse_iterator
rit
=
pushed_events
.
rbegin
();
while
(
rit
!=
pushed_events
.
rend
()
&&
rit
->
name
()
!=
events
[
i
][
j
].
name
())
{
++
rit
;
}
if
(
rit
!=
pushed_events
.
rend
())
{
double
event_time
=
(
g_profiler_place
==
"CUDA"
)
?
rit
->
CudaElapsedMs
(
events
[
i
][
j
])
:
rit
->
CpuElapsedMs
(
events
[
i
][
j
]);
std
::
string
event_name
=
"thread"
+
std
::
to_string
(
rit
->
thread_id
())
+
"::"
+
rit
->
name
();
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
if
(
event_idx
.
find
(
event_name
)
==
event_idx
.
end
())
{
event_idx
[
event_name
]
=
event_items
.
size
();
EventItem
event_item
=
{
event_name
,
1
,
event_time
,
event_time
,
event_time
,
event_time
};
event_items
.
push_back
(
event_item
);
}
else
{
int
index
=
event_idx
[
event_name
];
event_items
[
index
].
calls
+=
1
;
// total time
event_items
[
index
].
total_time
+=
event_time
;
// min time
event_items
[
index
].
min_time
=
std
::
min
(
event_time
,
event_items
[
index
].
min_time
);
// max time
event_items
[
index
].
max_time
=
std
::
max
(
event_time
,
event_items
[
index
].
max_time
);
}
// remove the push marker from the list
pushed_events
.
erase
((
++
rit
).
base
());
}
else
{
LOG
(
WARNING
)
<<
"Cannot find the push marker of event
\'
"
<<
events
[
i
][
j
].
name
()
<<
"
\'
, which will be ignored in profiling report."
;
}
}
}
// average time
for
(
auto
&
item
:
event_items
)
{
item
.
ave_time
=
item
.
total_time
/
item
.
calls
;
}
// sort
if
(
sorted_by
!=
EventSortingKey
::
kDefault
)
{
std
::
sort
(
event_items
.
begin
(),
event_items
.
end
(),
sorted_func
);
}
events_table
.
push_back
(
event_items
);
// log warning if there are events with `push` but without `pop`
std
::
list
<
Event
>::
reverse_iterator
rit
=
pushed_events
.
rbegin
();
while
(
rit
!=
pushed_events
.
rend
())
{
LOG
(
WARNING
)
<<
"Cannot find the pop marker of event
\'
"
<<
rit
->
name
()
<<
"
\'
, which will be ignored in profiling report."
;
++
rit
;
}
}
// Print report
PrintProfilingReport
(
events_table
,
sorted_domain
,
max_name_width
+
4
,
12
);
}
void
PrintProfilingReport
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
<<
" Profiling Report "
<<
"<-------------------------
\n\n
"
;
std
::
cout
<<
"Place: "
<<
g_profiler_place
<<
std
::
endl
;
std
::
cout
<<
"Time unit: ms"
<<
std
::
endl
;
std
::
cout
<<
"Sorted by "
<<
sorted_domain
<<
" in descending order in the same thread
\n\n
"
;
// Output events table
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
"Event"
<<
std
::
setw
(
data_width
)
<<
"Calls"
<<
std
::
setw
(
data_width
)
<<
"Total"
<<
std
::
setw
(
data_width
)
<<
"Min."
<<
std
::
setw
(
data_width
)
<<
"Max."
<<
std
::
setw
(
data_width
)
<<
"Ave."
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
events_table
.
size
();
++
i
)
{
for
(
size_t
j
=
0
;
j
<
events_table
[
i
].
size
();
++
j
)
{
EventItem
&
event_item
=
events_table
[
i
][
j
];
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_item
.
name
<<
std
::
setw
(
data_width
)
<<
event_item
.
calls
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
min_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
max_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
ave_time
<<
std
::
endl
;
}
}
std
::
cout
<<
std
::
endl
;
}
}
// namespace platform
}
// namespace paddle
paddle/platform/profiler.h
浏览文件 @
5e90f5e1
...
...
@@ -33,6 +33,7 @@ class Event {
std
::
string
kind
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
uint32_t
thread_id
()
const
{
return
thread_id_
;
}
bool
has_cuda
()
const
{
return
has_cuda_
;
}
#ifdef PADDLE_WITH_CUDA
...
...
@@ -40,8 +41,8 @@ class Event {
int
device
()
const
{
return
device_
;
}
#endif
double
CpuElapsed
U
s
(
const
Event
&
e
)
const
;
double
CudaElapsed
U
s
(
const
Event
&
e
)
const
;
double
CpuElapsed
M
s
(
const
Event
&
e
)
const
;
double
CudaElapsed
M
s
(
const
Event
&
e
)
const
;
private:
EventKind
kind_
;
...
...
@@ -94,6 +95,10 @@ enum ProfilerState {
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
void
PushEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
void
PopEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
struct
RecordEvent
{
explicit
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
...
...
@@ -101,6 +106,8 @@ struct RecordEvent {
// The device context is used by Event to get the current cuda stream.
DeviceContext
*
dev_ctx_
;
// Event name
std
::
string
name_
;
};
// Enable the profiling function.
...
...
@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
();
// The information of each event given in the profiling report
struct
EventItem
{
std
::
string
name
;
int
calls
;
double
total_time
;
double
min_time
;
double
max_time
;
double
ave_time
;
};
// Candidate keys to sort the profiling report
enum
EventSortingKey
{
kDefault
,
kCalls
,
kTotal
,
kMin
,
kMax
,
kAve
};
// Parse the event list and output the profiling report
void
ParseEvents
(
std
::
vector
<
std
::
vector
<
Event
>>&
,
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
);
// Print results
void
PrintProfilingReport
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
);
}
// namespace platform
}
// namespace paddle
paddle/platform/profiler_test.cc
浏览文件 @
5e90f5e1
...
...
@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
counter
++
;
}
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
,
nullptr
);
EXPECT_GT
(
start_event
.
CpuElapsed
U
s
(
stop_event
),
0
);
EXPECT_GT
(
start_event
.
CpuElapsed
M
s
(
stop_event
),
0
);
}
#ifdef PADDLE_WITH_CUDA
...
...
@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
counter
++
;
}
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
,
dev_ctx
);
EXPECT_GT
(
start_event
.
CudaElapsed
U
s
(
stop_event
),
0
);
EXPECT_GT
(
start_event
.
CudaElapsed
M
s
(
stop_event
),
0
);
}
#endif
...
...
@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
using
paddle
::
platform
::
EventKind
;
using
paddle
::
platform
::
RecordEvent
;
using
paddle
::
platform
::
ProfilerState
;
using
paddle
::
platform
::
EventSortingKey
;
ProfilerState
state
=
ProfilerState
::
kCPU
;
DeviceContext
*
dev_ctx
=
nullptr
;
...
...
@@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) {
#endif
EnableProfiler
(
state
);
/* Usage 1:
* PushEvent(evt_name, dev_ctx);
* ...
* code to be analyzed
* ...
* PopEvent(evt_name, dev_ctx);
*/
for
(
int
loop
=
0
;
loop
<
3
;
++
loop
)
{
for
(
int
i
=
1
;
i
<
5
;
++
i
)
{
std
::
string
name
=
"op_"
+
std
::
to_string
(
i
);
PushEvent
(
name
,
dev_ctx
);
int
counter
=
1
;
while
(
counter
!=
i
*
1000
)
counter
++
;
PopEvent
(
name
,
dev_ctx
);
}
}
/* Usage 2:
* {
* RecordEvent record_event(name, dev_ctx);
* ...
* code to be analyzed
* ...
* }
*/
for
(
int
i
=
1
;
i
<
5
;
++
i
)
{
std
::
string
name
=
"op_"
+
std
::
to_string
(
i
);
std
::
string
name
=
"
evs_
op_"
+
std
::
to_string
(
i
);
RecordEvent
record_event
(
name
,
dev_ctx
);
int
counter
=
1
;
while
(
counter
!=
i
*
1000
)
counter
++
;
}
// Bad Usage:
PushEvent
(
"event_without_pop"
,
dev_ctx
);
PopEvent
(
"event_without_push"
,
dev_ctx
);
std
::
vector
<
std
::
vector
<
Event
>>
events
=
paddle
::
platform
::
DisableProfiler
();
// Will remove parsing-related code from test later
ParseEvents
(
events
,
EventSortingKey
::
kTotal
);
int
cuda_startup_count
=
0
;
int
start_profiler_count
=
0
;
int
stop_profiler_count
=
0
;
...
...
@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
if
(
events
[
i
][
j
].
name
()
==
"push"
)
{
EXPECT_EQ
(
events
[
i
][
j
+
1
].
name
(),
"pop"
);
#ifdef PADDLE_WITH_CUDA
EXPECT_GT
(
events
[
i
][
j
].
CudaElapsed
U
s
(
events
[
i
][
j
+
1
]),
0
);
EXPECT_GT
(
events
[
i
][
j
].
CudaElapsed
M
s
(
events
[
i
][
j
+
1
]),
0
);
#else
EXPECT_GT
(
events
[
i
][
j
].
CpuElapsed
U
s
(
events
[
i
][
j
+
1
]),
0
);
EXPECT_GT
(
events
[
i
][
j
].
CpuElapsed
M
s
(
events
[
i
][
j
+
1
]),
0
);
#endif
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录