Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
0a5fbb06
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0a5fbb06
编写于
12月 29, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refine code struct.
上级
f03e73c8
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
154 addition
and
150 deletion
+154
-150
paddle/platform/device_context.h
paddle/platform/device_context.h
+0
-12
paddle/platform/profiler.cc
paddle/platform/profiler.cc
+124
-25
paddle/platform/profiler.h
paddle/platform/profiler.h
+24
-107
paddle/platform/profiler_test.cc
paddle/platform/profiler_test.cc
+6
-6
未找到文件。
paddle/platform/device_context.h
浏览文件 @
0a5fbb06
...
...
@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
cudnnHandle_t
cudnn_handle_
;
};
// Scoped device switch: selects `device` on construction and re-selects the
// device that was current at that moment when the guard is destroyed.
class DeviceGuard {
 public:
  // Remembers the currently selected device id, then switches to `device`.
  explicit DeviceGuard(int device)
      : original_device_(platform::GetCurrentDeviceId()) {
    platform::SetDeviceId(device);
  }

  // Switches back to the device id captured by the constructor.
  ~DeviceGuard() { platform::SetDeviceId(original_device_); }

  // Non-copyable: a copy would run the restoring destructor a second time,
  // possibly after the caller has already selected a different device.
  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;

 private:
  int original_device_;  // device id that was current before the switch
};
#endif
/*! \brief device context pool singleton */
...
...
paddle/platform/profiler.cc
浏览文件 @
0a5fbb06
...
...
@@ -17,34 +17,133 @@ limitations under the License. */
namespace
paddle
{
namespace
platform
{
ProfilerState
kState
=
ProfilerState
::
kDisabled
;
uint32_t
kNextThreadId
=
0
;
std
::
mutex
kAllEventListsMutex
;
std
::
list
<
std
::
shared_ptr
<
EventList
>>
kAllEventLists
;
thread_local
std
::
shared_ptr
<
EventList
>
kEventList
;
thread_local
int32_t
kThreadId
;
// The profiler state, the initial value is ProfilerState::kDisabled
static
ProfilerState
g_state
=
ProfilerState
::
kDisabled
;
// The index of the current thread; GetEventList() assigns it from
// g_next_thread_id when the thread creates its event list.
static
thread_local
int32_t
g_thread_id
;
// g_next_thread_id is a global counter of threads: it is incremented each time
// a thread creates its EventList, so it tells how many threads have done so.
static
uint32_t
g_next_thread_id
=
0
;
// The global mutex
static
std
::
mutex
g_all_event_lists_mutex
;
// The total event lists of all threads
static
std
::
list
<
std
::
shared_ptr
<
EventList
>>
g_all_event_lists
;
// The thread local event list only can be accessed by the specific thread
static
thread_local
std
::
shared_ptr
<
EventList
>
g_event_list
;
// Returns the current reading of a steady clock, expressed as nanoseconds
// since that clock's epoch. std::chrono::high_resolution_clock is used when it
// is steady; otherwise std::chrono::steady_clock is used.
inline uint64_t GetTimeInNsec() {
  namespace chrono = std::chrono;
  typedef std::conditional<chrono::high_resolution_clock::is_steady,
                           chrono::high_resolution_clock,
                           chrono::steady_clock>::type SteadyClock;
  const auto since_epoch = SteadyClock::now().time_since_epoch();
  const auto nanos = chrono::duration_cast<chrono::nanoseconds>(since_epoch);
  return nanos.count();
}
// Builds an event of the given kind and stamps it with the current CPU time.
// When compiled with PADDLE_WITH_CUDA and dev_ctx is non-null, the current
// CUDA device id is stored and a CUDA event is created and recorded on the
// context's stream; has_cuda_ is then set to true.
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
    : kind_(kind),
      name_(std::move(name)),
      thread_id_(thread_id),
      has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
  const CUDADeviceContext* gpu_ctx =
      static_cast<const CUDADeviceContext*>(dev_ctx);
  if (gpu_ctx != nullptr) {
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = gpu_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
    has_cuda_ = true;
  }
#endif
  // The CPU timestamp is taken last, after any CUDA event has been recorded.
  cpu_ns_ = GetTimeInNsec();
}
// Returns the textual name of this event's kind: "mark", "push" or "pop".
// Any other value is reported through PADDLE_THROW.
std::string Event::kind() const {
  if (kind_ == EventKind::kMark) {
    return "mark";
  }
  if (kind_ == EventKind::kPushRange) {
    return "push";
  }
  if (kind_ == EventKind::kPopRange) {
    return "pop";
  }
  PADDLE_THROW("Unknown EventKind.");
}
// CPU time in microseconds from this event to `e`: e's nanosecond timestamp
// minus this event's, divided by 1000.
double Event::CpuElapsedUs(const Event& e) const {
  const auto delta_ns = e.cpu_ns_ - cpu_ns_;
  return delta_ns / 1000.0;
}
// GPU time in microseconds from this event to `e`, measured with
// cudaEventElapsedTime after synchronizing on both CUDA events.
// Both events must carry a CUDA event and report the same device id.
// Without PADDLE_WITH_CUDA the call always ends in PADDLE_THROW.
double Event::CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  // Wait for both events to complete before asking for the interval.
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float elapsed_ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&elapsed_ms, event_, e.event()));
  // cudaEventElapsedTime reports milliseconds; convert to microseconds.
  return static_cast<double>(elapsed_ms) * 1000.0;
#else
  PADDLE_THROW("CUDA is not enabled");
#endif
}
#ifdef PADDLE_WITH_CUDA
// Calls func(i) for every device index i in [0, GetCUDADeviceCount()),
// selecting device i with SetDeviceId before each call. The device that was
// selected on entry is selected again before the function is left, both on
// normal return and when func (or a device switch) throws.
static void ForEachDevice(std::function<void(int)> func) {
  const auto original_device = GetCurrentDeviceId();
  try {
    const int count = GetCUDADeviceCount();
    for (int i = 0; i < count; ++i) {
      SetDeviceId(i);
      func(i);
    }
  } catch (...) {
    // Do not leave the caller on whichever device the failing iteration
    // selected; restore first, then let the original error propagate.
    SetDeviceId(original_device);
    throw;
  }
  SetDeviceId(original_device);
}
#endif
// Returns the calling thread's EventList (g_event_list is thread_local),
// creating it on first use. Creation runs under g_all_event_lists_mutex: the
// thread takes the next value of g_next_thread_id as its g_thread_id and the
// new list is added to the front of g_all_event_lists.
inline EventList& GetEventList() {
  if (g_event_list) {
    return *g_event_list;
  }
  std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
  g_event_list = std::make_shared<EventList>();
  g_thread_id = g_next_thread_id++;
  g_all_event_lists.emplace_front(g_event_list);
  return *g_event_list;
}
// Records a kMark event named `name` in the calling thread's event list,
// tagged with the thread's g_thread_id. `dev_ctx` is forwarded to the event
// unchanged and may be nullptr.
void Mark(const std::string& name, DeviceContext* dev_ctx) {
  // `name` is a const reference, so it is passed as-is: std::move on it could
  // never move from the string and only disguised the copy that is made.
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
// Opens a profiling range: when profiling is enabled, records a kPushRange
// event named `name` in the calling thread's event list. Nothing is recorded
// while the profiler state is ProfilerState::kDisabled.
//
// dev_ctx_ is always initialized, even when nothing is recorded, because the
// destructor reads it whenever profiling is enabled at destruction time and
// must not see an indeterminate pointer.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
    : dev_ctx_(dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  // `name` is a const reference; std::move on it would still copy, so it is
  // passed directly.
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
}
// Closes the profiling range: unless the profiler state is
// ProfilerState::kDisabled, records a kPopRange event with an empty name in
// the calling thread's event list, using the stored device context.
RecordEvent::~RecordEvent() {
  if (g_state != ProfilerState::kDisabled) {
    GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
                          dev_ctx_);
  }
}
void
EnableProfiler
(
ProfilerState
state
)
{
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"ProfilerState::kDisabled"
);
PADDLE_ENFORCE
(
kS
tate
==
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
g_s
tate
==
ProfilerState
::
kDisabled
,
"The profiling state should be disabled when calling "
,
"EnableProfiler."
);
kS
tate
=
state
;
g_s
tate
=
state
;
#ifdef PADDLE_WITH_CUDA
auto
ForEachDevice
=
[](
std
::
function
<
void
(
int
)
>
op
)
{
int
count
=
GetCUDADeviceCount
();
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
DeviceGuard
dev_guard
(
i
);
op
(
i
);
}
};
if
(
kState
==
ProfilerState
::
kCUDA
)
{
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
// Generate some dummy events first to reduce the startup overhead.
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
ForEachDevice
([](
int
d
)
{
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
GPU
Place
(
d
));
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDA
Place
(
d
));
Mark
(
"_cuda_startup_"
,
dev_ctx
);
dev_ctx
->
Wait
();
});
...
...
@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
}
#endif
// Mark the profiling start.
Mark
(
"_start_profiler_"
);
Mark
(
"_start_profiler_"
,
nullptr
);
}
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
()
{
PADDLE_ENFORCE
(
kS
tate
!=
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
g_s
tate
!=
ProfilerState
::
kDisabled
,
"Can't disable profiling, since it's not starting."
);
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
);
kS
tate
=
ProfilerState
::
kDisabled
;
Mark
(
"_stop_profiler_"
,
nullptr
);
g_s
tate
=
ProfilerState
::
kDisabled
;
std
::
vector
<
std
::
vector
<
Event
>>
result
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
kAllEventListsM
utex
);
for
(
auto
it
=
kAllEventLists
.
begin
();
it
!=
kAllEventLists
.
end
();
++
it
)
{
auto
&
list
=
*
it
;
result
.
emplace_back
(
list
->
Reduce
());
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_m
utex
);
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
result
.
emplace_back
(
(
*
it
)
->
Reduce
());
}
return
result
;
}
...
...
paddle/platform/profiler.h
浏览文件 @
0a5fbb06
...
...
@@ -24,76 +24,24 @@ namespace platform {
// The kind of a profiling event.
enum EventKind {
  kMark,       // a single marker, recorded by Mark()
  kPushRange,  // start of a range, recorded when a RecordEvent is constructed
  kPopRange    // end of a range, recorded when a RecordEvent is destroyed
};
// Returns the current time of a steady clock in nanoseconds since the clock's
// epoch. Prefers std::chrono::high_resolution_clock when it is steady and
// falls back to std::chrono::steady_clock otherwise.
inline uint64_t GetTimeInNsec() {
  using std::chrono::high_resolution_clock;
  using std::chrono::steady_clock;
  using MonotonicClock =
      std::conditional<high_resolution_clock::is_steady, high_resolution_clock,
                       steady_clock>::type;
  const auto now = MonotonicClock::now();
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             now.time_since_epoch())
      .count();
}
class
Event
{
public:
// the DeviceContext is used to get the cuda stream.
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
const
platform
::
DeviceContext
*
dev_ctx
=
nullptr
)
:
kind_
(
kind
),
name_
(
std
::
move
(
name
)),
thread_id_
(
thread_id
)
{
has_cuda_
=
false
;
#ifdef PADDLE_WITH_CUDA
auto
*
cuda_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
dev_ctx
);
if
(
cuda_dev_ctx
)
{
PADDLE_ENFORCE
(
cudaGetDevice
(
&
device_
));
PADDLE_ENFORCE
(
cudaEventCreate
(
&
event_
));
auto
stream
=
cuda_dev_ctx
->
stream
();
PADDLE_ENFORCE
(
cudaEventRecord
(
event_
,
stream
));
has_cuda_
=
true
;
}
#endif
cpu_ns_
=
GetTimeInNsec
();
}
std
::
string
kind
()
const
{
switch
(
kind_
)
{
case
EventKind
::
kMark
:
return
"mark"
;
case
EventKind
::
kPushRange
:
return
"push"
;
case
EventKind
::
kPopRange
:
return
"pop"
;
}
PADDLE_THROW
(
"Unknown EventKind."
);
}
DeviceContext
*
dev_ctx
);
std
::
string
kind
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
bool
has_cuda
()
const
{
return
has_cuda_
;
}
#ifdef PADDLE_WITH_CUDA
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
double
CpuElapsedUs
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000.0
);
}
double
CudaElapsedUs
(
const
Event
&
e
)
const
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
e
.
has_cuda
()
&&
has_cuda
());
PADDLE_ENFORCE
(
e
.
device
()
==
device
());
PADDLE_ENFORCE
(
cudaEventSynchronize
(
event_
));
PADDLE_ENFORCE
(
cudaEventSynchronize
(
e
.
event
()));
float
ms
;
PADDLE_ENFORCE
(
cudaEventElapsedTime
(
&
ms
,
event_
,
e
.
event
()));
return
ms
*
1000.0
;
#else
PADDLE_THROW
(
"CUDA is not enabled"
);
#endif
}
double
CpuElapsedUs
(
const
Event
&
e
)
const
;
double
CudaElapsedUs
(
const
Event
&
e
)
const
;
private:
EventKind
kind_
;
...
...
@@ -108,11 +56,11 @@ class Event {
};
struct
EventList
{
constexpr
static
s
td
::
s
ize_t
kMB
=
1024
*
1024
;
constexpr
static
s
td
::
s
ize_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
s
td
::
s
ize_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
s
td
::
s
ize_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
s
td
::
s
ize_t
kNumBlock
=
constexpr
static
size_t
kMB
=
1024
*
1024
;
constexpr
static
size_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
size_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
size_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
size_t
kNumBlock
=
kEventBlockSize
/
((
kEventSize
+
kEventAlign
-
1
)
/
kEventAlign
*
kEventAlign
);
...
...
@@ -139,58 +87,27 @@ struct EventList {
};
enum
ProfilerState
{
kDisabled
,
kCPU
,
kCUDA
,
kDisabled
,
// disabled state
kCPU
,
// CPU profiling state
kCUDA
,
// GPU profiling state
};
// The profiler state, the initial value is ProfilerState::kDisabled
extern
ProfilerState
kState
;
// The global mutex
extern
std
::
mutex
kAllEventListsMutex
;
// The total event lists of all threads
extern
std
::
list
<
std
::
shared_ptr
<
EventList
>>
kAllEventLists
;
// The thread local event list only can be accessed by the specific thread
extern
thread_local
std
::
shared_ptr
<
EventList
>
kEventList
;
// The thread index of each thread
extern
thread_local
int32_t
kThreadId
;
// The kNextThreadId is a global counter for threads, by the kThreadId and
// kNextThreadId, we can know how many threads have created EventList.
extern
uint32_t
kNextThreadId
;
// Returns the calling thread's EventList, creating it on first use. Creation
// runs under kAllEventListsMutex: the thread takes the next value of
// kNextThreadId as its kThreadId and the new list is added to the front of
// kAllEventLists.
inline EventList& GetEventList() {
  const bool already_created = static_cast<bool>(kEventList);
  if (!already_created) {
    std::lock_guard<std::mutex> lock(kAllEventListsMutex);
    kEventList = std::make_shared<EventList>();
    kThreadId = kNextThreadId++;
    kAllEventLists.emplace_front(kEventList);
  }
  return *kEventList;
}
// Records a kMark event named `name` in the calling thread's event list,
// tagged with the thread's kThreadId. `dev_ctx` is forwarded to the event
// unchanged and defaults to nullptr.
//
// `name` is taken by non-const value so that std::move below really hands the
// string over; with a const parameter the move silently degraded to a copy.
inline void Mark(std::string name,
                 const platform::DeviceContext* dev_ctx = nullptr) {
  GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
}
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
struct
RecordEvent
{
explicit
RecordEvent
(
const
std
::
string
name
,
platform
::
DeviceContext
*
dev_ctx
=
nullptr
)
{
if
(
kState
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
GetEventList
().
Record
(
EventKind
::
kPushRange
,
std
::
move
(
name
),
kThreadId
,
dev_ctx_
);
}
explicit
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
~
RecordEvent
()
{
if
(
kState
==
ProfilerState
::
kDisabled
)
return
;
GetEventList
().
Record
(
EventKind
::
kPopRange
,
std
::
string
(),
kThreadId
,
dev_ctx_
);
}
platform
::
DeviceContext
*
dev_ctx_
;
~
RecordEvent
();
// The device context is used by Event to get the current cuda stream.
DeviceContext
*
dev_ctx_
;
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
// Return the event lists of all threads. Assuming the returned value is named
// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
();
}
// namespace platform
...
...
paddle/platform/profiler_test.cc
浏览文件 @
0a5fbb06
...
...
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventKind
;
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
);
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
,
nullptr
);
EXPECT_TRUE
(
start_event
.
has_cuda
()
==
false
);
int
counter
=
0
;
while
(
counter
!=
1000
)
{
counter
++
;
}
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
);
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
,
nullptr
);
EXPECT_GT
(
start_event
.
CpuElapsedUs
(
stop_event
),
0
);
}
...
...
@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
TEST
(
Event
,
CudaElapsedTime
)
{
using
paddle
::
platform
::
DeviceContext
;
using
paddle
::
platform
::
CUDADeviceContext
;
using
paddle
::
platform
::
GPU
Place
;
using
paddle
::
platform
::
CUDA
Place
;
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventKind
;
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
GPU
Place
(
0
));
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDA
Place
(
0
));
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
,
dev_ctx
);
EXPECT_TRUE
(
start_event
.
has_cuda
()
==
true
);
int
counter
=
0
;
...
...
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
DeviceContext
*
dev_ctx
=
nullptr
;
#ifdef PADDLE_WITH_CUDA
using
paddle
::
platform
::
CUDADeviceContext
;
using
paddle
::
platform
::
GPU
Place
;
using
paddle
::
platform
::
CUDA
Place
;
state
=
ProfilerState
::
kCUDA
;
dev_ctx
=
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
GPU
Place
(
0
));
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
CUDA
Place
(
0
));
#endif
EnableProfiler
(
state
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录