Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
0a5fbb06
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0a5fbb06
编写于
12月 29, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refine code struct.
上级
f03e73c8
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
154 addition
and
150 deletion
+154
-150
paddle/platform/device_context.h
paddle/platform/device_context.h
+0
-12
paddle/platform/profiler.cc
paddle/platform/profiler.cc
+124
-25
paddle/platform/profiler.h
paddle/platform/profiler.h
+24
-107
paddle/platform/profiler_test.cc
paddle/platform/profiler_test.cc
+6
-6
未找到文件。
paddle/platform/device_context.h
浏览文件 @
0a5fbb06
...
...
@@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
cudnnHandle_t
cudnn_handle_
;
};
// Scoped device switch: on construction remembers the device reported by
// platform::GetCurrentDeviceId() and switches to `device`; on destruction
// switches back to the remembered device.
//
// The guard is non-copyable: a copy would run the restoring destructor a
// second time, possibly after another guard had already changed the device.
class DeviceGuard {
 public:
  // Saves the current device id, then makes `device` the current device.
  explicit DeviceGuard(int device)
      : original_device_(platform::GetCurrentDeviceId()) {
    platform::SetDeviceId(device);
  }

  // Restores the device that was current when the guard was created.
  ~DeviceGuard() { platform::SetDeviceId(original_device_); }

  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;

 private:
  // Device id that was current before this guard switched devices.
  int original_device_;
};
#endif
/*! \brief device context pool singleton */
...
...
paddle/platform/profiler.cc
浏览文件 @
0a5fbb06
...
...
@@ -17,34 +17,133 @@ limitations under the License. */
namespace
paddle
{
namespace
platform
{
ProfilerState
kState
=
ProfilerState
::
kDisabled
;
uint32_t
kNextThreadId
=
0
;
std
::
mutex
kAllEventListsMutex
;
std
::
list
<
std
::
shared_ptr
<
EventList
>>
kAllEventLists
;
thread_local
std
::
shared_ptr
<
EventList
>
kEventList
;
thread_local
int32_t
kThreadId
;
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The index of the calling thread. GetEventList() assigns it from
// g_next_thread_id when the thread creates its EventList. It is unsigned to
// match g_next_thread_id and the uint32_t thread id stored in Event.
static thread_local uint32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;
// Returns the current reading of a steady clock, expressed in nanoseconds
// since that clock's epoch. high_resolution_clock is used when it is steady;
// otherwise steady_clock is used instead.
inline uint64_t GetTimeInNsec() {
  typedef std::chrono::high_resolution_clock HighResClock;
  typedef std::conditional<HighResClock::is_steady, HighResClock,
                           std::chrono::steady_clock>::type SteadyClock;

  const auto since_epoch = SteadyClock::now().time_since_epoch();
  const std::chrono::nanoseconds as_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_ns.count();
}
// Builds an event of the given kind and stamps it with the current CPU time.
//
// kind      - mark / push-range / pop-range.
// name      - label stored with the event (moved into the object).
// thread_id - index of the recording thread.
// dev_ctx   - when built with PADDLE_WITH_CUDA and non-null, a CUDA event is
//             created and recorded on this context's stream; pass nullptr for
//             CPU-only events.
// NOTE(review): a non-null dev_ctx is cast to CUDADeviceContext without any
// runtime check — callers presumably only pass CUDA contexts; confirm.
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             DeviceContext* dev_ctx)
    : kind_(kind),
      name_(std::move(name)),
      thread_id_(thread_id),
      has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
  if (dev_ctx != nullptr) {
    const CUDADeviceContext* cuda_ctx =
        static_cast<const CUDADeviceContext*>(dev_ctx);
    // Remember which device the event belongs to, then record it on the
    // context's stream.
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto cuda_stream = cuda_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, cuda_stream));
    has_cuda_ = true;
  }
#endif
  // The CPU timestamp is taken last, after any CUDA event has been recorded.
  cpu_ns_ = GetTimeInNsec();
}
// Returns the textual name of this event's kind: "mark", "push" or "pop".
// Any other value is reported through PADDLE_THROW.
std::string Event::kind() const {
  if (kind_ == EventKind::kMark) {
    return "mark";
  }
  if (kind_ == EventKind::kPushRange) {
    return "push";
  }
  if (kind_ == EventKind::kPopRange) {
    return "pop";
  }
  PADDLE_THROW("Unknown EventKind.");
}
// Returns the CPU time from this event to `e`, in microseconds.
// `this` is treated as the earlier event and `e` as the later one.
double Event::CpuElapsedUs(const Event& e) const {
  const auto elapsed_ns = e.cpu_ns_ - cpu_ns_;
  return elapsed_ns / (1000.0);
}
// Returns the CUDA time from this event to `e`, in microseconds.
// Both events must carry a CUDA event and must have been recorded on the same
// device. Without PADDLE_WITH_CUDA the call always fails via PADDLE_THROW.
double Event::CudaElapsedUs(const Event& e) const {
#ifndef PADDLE_WITH_CUDA
  PADDLE_THROW("CUDA is not enabled");
#else
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  // Wait for both events to complete before asking for the elapsed time.
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float elapsed_ms = 0.0f;
  PADDLE_ENFORCE(cudaEventElapsedTime(&elapsed_ms, event_, e.event()));
  // cudaEventElapsedTime reports milliseconds; convert to microseconds.
  return elapsed_ms * 1000.0;
#endif
}
#ifdef PADDLE_WITH_CUDA
// Calls func(i) once for every CUDA device index i in
// [0, GetCUDADeviceCount()), making device i current before each call, and
// finally switches back to the device that was current on entry.
static void ForEachDevice(std::function<void(int)> func) {
  const auto device_on_entry = GetCurrentDeviceId();
  const int device_count = GetCUDADeviceCount();
  int device = 0;
  while (device < device_count) {
    SetDeviceId(device);
    func(device);
    ++device;
  }
  SetDeviceId(device_on_entry);
}
#endif
// Returns the calling thread's EventList, creating it on first use.
// On creation (under g_all_event_lists_mutex) the thread is given the next
// thread index and its list is added to the front of g_all_event_lists.
inline EventList& GetEventList() {
  if (g_event_list) {
    return *g_event_list;
  }

  std::lock_guard<std::mutex> lock(g_all_event_lists_mutex);
  g_event_list = std::make_shared<EventList>();
  g_thread_id = g_next_thread_id++;
  g_all_event_lists.emplace_front(g_event_list);
  return *g_event_list;
}
// Records a kMark event named `name` in the calling thread's event list.
//
// name    - label of the mark.
// dev_ctx - device context handed to the recorded event; may be nullptr.
//
// `name` is a const reference, so it is passed on as-is: wrapping it in
// std::move would only produce a const rvalue, which cannot be moved from.
void Mark(const std::string& name, DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
}
// Opens a profiling range: records a kPushRange event named `name` in the
// calling thread's event list, unless profiling is disabled.
//
// name    - label of the range.
// dev_ctx - device context handed to the push event and kept for the matching
//           pop event recorded by the destructor; may be nullptr.
//
// dev_ctx_ is always initialised, even when profiling is disabled. The
// destructor reads it whenever profiling is enabled at destruction time, so
// it must never be left indeterminate.
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx)
    : dev_ctx_(dev_ctx) {
  if (g_state == ProfilerState::kDisabled) return;
  // `name` is const, so it is copied into the event; std::move would be a
  // no-op here.
  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx_);
}
// Closes the profiling range: records a kPopRange event with an empty name in
// the calling thread's event list. Nothing is recorded while profiling is
// disabled.
RecordEvent::~RecordEvent() {
  if (g_state != ProfilerState::kDisabled) {
    GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id,
                          dev_ctx_);
  }
}
void
EnableProfiler
(
ProfilerState
state
)
{
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"ProfilerState::kDisabled"
);
PADDLE_ENFORCE
(
kS
tate
==
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
g_s
tate
==
ProfilerState
::
kDisabled
,
"The profiling state should be disabled when calling "
,
"EnableProfiler."
);
kS
tate
=
state
;
g_s
tate
=
state
;
#ifdef PADDLE_WITH_CUDA
auto
ForEachDevice
=
[](
std
::
function
<
void
(
int
)
>
op
)
{
int
count
=
GetCUDADeviceCount
();
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
DeviceGuard
dev_guard
(
i
);
op
(
i
);
}
};
if
(
kState
==
ProfilerState
::
kCUDA
)
{
if
(
g_state
==
ProfilerState
::
kCUDA
)
{
// Generate some dummy events first to reduce the startup overhead.
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
ForEachDevice
([](
int
d
)
{
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
GPU
Place
(
d
));
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDA
Place
(
d
));
Mark
(
"_cuda_startup_"
,
dev_ctx
);
dev_ctx
->
Wait
();
});
...
...
@@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) {
}
#endif
// Mark the profiling start.
Mark
(
"_start_profiler_"
);
Mark
(
"_start_profiler_"
,
nullptr
);
}
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
()
{
PADDLE_ENFORCE
(
kS
tate
!=
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
g_s
tate
!=
ProfilerState
::
kDisabled
,
"Can't disable profiling, since it's not starting."
);
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
);
kS
tate
=
ProfilerState
::
kDisabled
;
Mark
(
"_stop_profiler_"
,
nullptr
);
g_s
tate
=
ProfilerState
::
kDisabled
;
std
::
vector
<
std
::
vector
<
Event
>>
result
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
kAllEventListsM
utex
);
for
(
auto
it
=
kAllEventLists
.
begin
();
it
!=
kAllEventLists
.
end
();
++
it
)
{
auto
&
list
=
*
it
;
result
.
emplace_back
(
list
->
Reduce
());
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_m
utex
);
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
result
.
emplace_back
(
(
*
it
)
->
Reduce
());
}
return
result
;
}
...
...
paddle/platform/profiler.h
浏览文件 @
0a5fbb06
...
...
@@ -24,76 +24,24 @@ namespace platform {
enum
EventKind
{
kMark
,
kPushRange
,
kPopRange
};
inline
uint64_t
GetTimeInNsec
()
{
// using std::chrono;
using
clock
=
std
::
conditional
<
std
::
chrono
::
high_resolution_clock
::
is_steady
,
std
::
chrono
::
high_resolution_clock
,
std
::
chrono
::
steady_clock
>::
type
;
return
std
::
chrono
::
duration_cast
<
std
::
chrono
::
nanoseconds
>
(
clock
::
now
().
time_since_epoch
())
.
count
();
}
class
Event
{
public:
// the DeviceContext is used to get the cuda stream.
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
const
platform
::
DeviceContext
*
dev_ctx
=
nullptr
)
:
kind_
(
kind
),
name_
(
std
::
move
(
name
)),
thread_id_
(
thread_id
)
{
has_cuda_
=
false
;
#ifdef PADDLE_WITH_CUDA
auto
*
cuda_dev_ctx
=
static_cast
<
const
platform
::
CUDADeviceContext
*>
(
dev_ctx
);
if
(
cuda_dev_ctx
)
{
PADDLE_ENFORCE
(
cudaGetDevice
(
&
device_
));
PADDLE_ENFORCE
(
cudaEventCreate
(
&
event_
));
auto
stream
=
cuda_dev_ctx
->
stream
();
PADDLE_ENFORCE
(
cudaEventRecord
(
event_
,
stream
));
has_cuda_
=
true
;
}
#endif
cpu_ns_
=
GetTimeInNsec
();
}
std
::
string
kind
()
const
{
switch
(
kind_
)
{
case
EventKind
::
kMark
:
return
"mark"
;
case
EventKind
::
kPushRange
:
return
"push"
;
case
EventKind
::
kPopRange
:
return
"pop"
;
}
PADDLE_THROW
(
"Unknown EventKind."
);
}
DeviceContext
*
dev_ctx
);
std
::
string
kind
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
bool
has_cuda
()
const
{
return
has_cuda_
;
}
#ifdef PADDLE_WITH_CUDA
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
double
CpuElapsedUs
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000.0
);
}
double
CudaElapsedUs
(
const
Event
&
e
)
const
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
e
.
has_cuda
()
&&
has_cuda
());
PADDLE_ENFORCE
(
e
.
device
()
==
device
());
PADDLE_ENFORCE
(
cudaEventSynchronize
(
event_
));
PADDLE_ENFORCE
(
cudaEventSynchronize
(
e
.
event
()));
float
ms
;
PADDLE_ENFORCE
(
cudaEventElapsedTime
(
&
ms
,
event_
,
e
.
event
()));
return
ms
*
1000.0
;
#else
PADDLE_THROW
(
"CUDA is not enabled"
);
#endif
}
double
CpuElapsedUs
(
const
Event
&
e
)
const
;
double
CudaElapsedUs
(
const
Event
&
e
)
const
;
private:
EventKind
kind_
;
...
...
@@ -108,11 +56,11 @@ class Event {
};
struct
EventList
{
constexpr
static
s
td
::
s
ize_t
kMB
=
1024
*
1024
;
constexpr
static
s
td
::
s
ize_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
s
td
::
s
ize_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
s
td
::
s
ize_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
s
td
::
s
ize_t
kNumBlock
=
constexpr
static
size_t
kMB
=
1024
*
1024
;
constexpr
static
size_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
size_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
size_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
size_t
kNumBlock
=
kEventBlockSize
/
((
kEventSize
+
kEventAlign
-
1
)
/
kEventAlign
*
kEventAlign
);
...
...
@@ -139,58 +87,27 @@ struct EventList {
};
enum
ProfilerState
{
kDisabled
,
kCPU
,
kCUDA
,
kDisabled
,
// disabled state
kCPU
,
// CPU profiling state
kCUDA
,
// GPU profiling state
};
// The profiler state, the initial value is ProfilerState::kDisabled
extern
ProfilerState
kState
;
// The global mutex
extern
std
::
mutex
kAllEventListsMutex
;
// The total event lists of all threads
extern
std
::
list
<
std
::
shared_ptr
<
EventList
>>
kAllEventLists
;
// The thread local event list only can be accessed by the specific thread
extern
thread_local
std
::
shared_ptr
<
EventList
>
kEventList
;
// The thread index of each thread
extern
thread_local
int32_t
kThreadId
;
// The kNextThreadId is a global counter for threads, by the kThreadId and
// kNextThreadId, we can know how many threads have created EventList.
extern
uint32_t
kNextThreadId
;
inline
EventList
&
GetEventList
()
{
if
(
!
kEventList
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
kAllEventListsMutex
);
kEventList
=
std
::
make_shared
<
EventList
>
();
kThreadId
=
kNextThreadId
++
;
kAllEventLists
.
emplace_front
(
kEventList
);
}
return
*
kEventList
;
}
inline
void
Mark
(
const
std
::
string
name
,
const
platform
::
DeviceContext
*
dev_ctx
=
nullptr
)
{
GetEventList
().
Record
(
EventKind
::
kMark
,
std
::
move
(
name
),
kThreadId
,
dev_ctx
);
}
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
struct
RecordEvent
{
explicit
RecordEvent
(
const
std
::
string
name
,
platform
::
DeviceContext
*
dev_ctx
=
nullptr
)
{
if
(
kState
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
GetEventList
().
Record
(
EventKind
::
kPushRange
,
std
::
move
(
name
),
kThreadId
,
dev_ctx_
);
}
explicit
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
~
RecordEvent
()
{
if
(
kState
==
ProfilerState
::
kDisabled
)
return
;
GetEventList
().
Record
(
EventKind
::
kPopRange
,
std
::
string
(),
kThreadId
,
dev_ctx_
);
}
platform
::
DeviceContext
*
dev_ctx_
;
~
RecordEvent
();
// The device context is used by Event to get the current cuda stream.
DeviceContext
*
dev_ctx_
;
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
// Return the event lists of all threads. Assuming the returned value is called
// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
();
}
// namespace platform
...
...
paddle/platform/profiler_test.cc
浏览文件 @
0a5fbb06
...
...
@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventKind
;
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
);
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
,
nullptr
);
EXPECT_TRUE
(
start_event
.
has_cuda
()
==
false
);
int
counter
=
0
;
while
(
counter
!=
1000
)
{
counter
++
;
}
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
);
Event
stop_event
(
EventKind
::
kPopRange
,
"test"
,
0
,
nullptr
);
EXPECT_GT
(
start_event
.
CpuElapsedUs
(
stop_event
),
0
);
}
...
...
@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
TEST
(
Event
,
CudaElapsedTime
)
{
using
paddle
::
platform
::
DeviceContext
;
using
paddle
::
platform
::
CUDADeviceContext
;
using
paddle
::
platform
::
GPU
Place
;
using
paddle
::
platform
::
CUDA
Place
;
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventKind
;
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
GPU
Place
(
0
));
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDA
Place
(
0
));
Event
start_event
(
EventKind
::
kPushRange
,
"test"
,
0
,
dev_ctx
);
EXPECT_TRUE
(
start_event
.
has_cuda
()
==
true
);
int
counter
=
0
;
...
...
@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
DeviceContext
*
dev_ctx
=
nullptr
;
#ifdef PADDLE_WITH_CUDA
using
paddle
::
platform
::
CUDADeviceContext
;
using
paddle
::
platform
::
GPU
Place
;
using
paddle
::
platform
::
CUDA
Place
;
state
=
ProfilerState
::
kCUDA
;
dev_ctx
=
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
GPU
Place
(
0
));
new
paddle
::
platform
::
CUDADeviceContext
(
paddle
::
platform
::
CUDA
Place
(
0
));
#endif
EnableProfiler
(
state
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录