Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
4840c49b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4840c49b
编写于
3月 14, 2018
作者:
X
Xin Pan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Better timeline
上级
cbfd15f9
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
190 addition
and
65 deletion
+190
-65
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+8
-0
paddle/fluid/operators/parallel_do_op.cc
paddle/fluid/operators/parallel_do_op.cc
+17
-10
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+51
-23
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+23
-14
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+31
-2
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+18
-0
paddle/fluid/platform/profiler.proto
paddle/fluid/platform/profiler.proto
+6
-1
python/paddle/fluid/tests/unittests/test_profiler.py
python/paddle/fluid/tests/unittests/test_profiler.py
+16
-2
tools/timeline.py
tools/timeline.py
+20
-13
未找到文件。
paddle/fluid/framework/executor.cc
浏览文件 @
4840c49b
...
@@ -25,6 +25,7 @@ limitations under the License. */
...
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
benchmark
);
DECLARE_bool
(
benchmark
);
DEFINE_bool
(
check_nan_inf
,
false
,
DEFINE_bool
(
check_nan_inf
,
false
,
...
@@ -33,6 +34,11 @@ DEFINE_bool(check_nan_inf, false,
...
@@ -33,6 +34,11 @@ DEFINE_bool(check_nan_inf, false,
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
{
// block id starts from 0. This id is used to represent the codeblock
// wrapping the first block 0.
int
kProgramId
=
-
1
;
}
// namespace
struct
ExecutorPrepareContext
{
struct
ExecutorPrepareContext
{
ExecutorPrepareContext
(
const
framework
::
ProgramDesc
&
prog
,
size_t
block_id
)
ExecutorPrepareContext
(
const
framework
::
ProgramDesc
&
prog
,
size_t
block_id
)
...
@@ -94,6 +100,7 @@ static void CheckTensorNANOrInf(const std::string& name,
...
@@ -94,6 +100,7 @@ static void CheckTensorNANOrInf(const std::string& name,
void
Executor
::
Run
(
const
ProgramDesc
&
pdesc
,
Scope
*
scope
,
int
block_id
,
void
Executor
::
Run
(
const
ProgramDesc
&
pdesc
,
Scope
*
scope
,
int
block_id
,
bool
create_local_scope
,
bool
create_vars
)
{
bool
create_local_scope
,
bool
create_vars
)
{
platform
::
RecordBlock
b
(
block_id
);
auto
*
ctx
=
Prepare
(
pdesc
,
block_id
);
auto
*
ctx
=
Prepare
(
pdesc
,
block_id
);
RunPreparedContext
(
ctx
,
scope
,
create_local_scope
,
create_vars
);
RunPreparedContext
(
ctx
,
scope
,
create_local_scope
,
create_vars
);
delete
ctx
;
delete
ctx
;
...
@@ -184,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
...
@@ -184,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std
::
map
<
std
::
string
,
LoDTensor
*>&
fetch_targets
,
std
::
map
<
std
::
string
,
LoDTensor
*>&
fetch_targets
,
const
std
::
string
&
feed_holder_name
,
const
std
::
string
&
feed_holder_name
,
const
std
::
string
&
fetch_holder_name
)
{
const
std
::
string
&
fetch_holder_name
)
{
platform
::
RecordBlock
b
(
kProgramId
);
auto
*
copy_program
=
new
ProgramDesc
(
program
);
auto
*
copy_program
=
new
ProgramDesc
(
program
);
auto
*
global_block
=
copy_program
->
MutableBlock
(
0
);
auto
*
global_block
=
copy_program
->
MutableBlock
(
0
);
...
...
paddle/fluid/operators/parallel_do_op.cc
浏览文件 @
4840c49b
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase {
...
@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase {
auto
&
place
=
places
[
place_idx
];
auto
&
place
=
places
[
place_idx
];
auto
*
cur_scope
=
sub_scopes
[
place_idx
];
auto
*
cur_scope
=
sub_scopes
[
place_idx
];
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
workers
.
emplace_back
(
framework
::
Executor
executor
(
place
);
framework
::
Async
([
program
,
cur_scope
,
place
,
block
,
place_idx
]
{
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
// Give the thread an id to distinguish parallel block with same id.
false
/*create_local_scope*/
);
platform
::
RecordThread
rt
(
static_cast
<
int
>
(
place_idx
)
+
1
);
}));
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
}));
}
}
for
(
auto
&
worker
:
workers
)
{
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
worker
.
wait
();
...
@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
...
@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto
*
cur_scope
=
sub_scopes
[
i
];
auto
*
cur_scope
=
sub_scopes
[
i
];
// execute
// execute
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
workers
.
emplace_back
(
framework
::
Executor
executor
(
place
);
framework
::
Async
([
program
,
cur_scope
,
place
,
block
,
i
]
{
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
// Give the thread an id to distinguish parallel block with same id.
false
/*create_local_scope*/
);
platform
::
RecordThread
rt
(
static_cast
<
int
>
(
i
)
+
1
);
}));
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
}));
}
}
for
(
auto
&
worker
:
workers
)
{
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
worker
.
wait
();
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
4840c49b
...
@@ -26,8 +26,14 @@ limitations under the License. */
...
@@ -26,8 +26,14 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
namespace
{
namespace
{
// Current thread's id. Note, we don't distinguish nested threads
// for now.
thread_local
int
cur_thread_id
=
0
;
// Tracking the nested block stacks of each thread.
thread_local
std
::
deque
<
int
>
block_id_stack
;
// Tracking the nested event stacks.
thread_local
std
::
deque
<
std
::
string
>
annotation_stack
;
thread_local
const
char
*
cur_annotation
=
nullptr
;
std
::
once_flag
tracer_once_flag
;
std
::
once_flag
tracer_once_flag
;
DeviceTracer
*
tracer
=
nullptr
;
DeviceTracer
*
tracer
=
nullptr
;
}
// namespace
}
// namespace
...
@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
[
id
]
=
anno
;
correlations_
[
id
]
=
anno
;
}
}
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
{
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
if
(
!
anno
)
{
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
{
// TODO(panyx0718): Currently, it doesn't support nested situation
if
(
anno
.
empty
())
{
// Up-level can be cleared by low-level and therefore get nullptr
VLOG
(
1
)
<<
"Empty timeline annotation."
;
// here.
return
;
return
;
}
}
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
cpu_records_
.
push_back
(
CPURecord
{
anno
,
start_ns
,
end_ns
,
0
});
cpu_records_
.
push_back
(
CPURecord
{
anno
,
start_ns
,
end_ns
,
device_id
,
thread_id
});
}
}
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32_t
device_id
,
uint32
_t
stream_id
,
uint64_t
end_ns
,
int64_t
device_id
,
int64
_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
{
uint32_t
correlation_id
,
uint64_t
bytes
)
{
// 0 means timestamp information could not be collected for the kernel.
// 0 means timestamp information could not be collected for the kernel.
if
(
start_ns
==
0
||
end_ns
==
0
)
{
if
(
start_ns
==
0
||
end_ns
==
0
)
{
...
@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id
,
correlation_id
,
bytes
});
stream_id
,
correlation_id
,
bytes
});
}
}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32
_t
device_id
,
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
int64
_t
device_id
,
uint32
_t
stream_id
,
uint32_t
correlation_id
)
{
int64
_t
stream_id
,
uint32_t
correlation_id
)
{
// 0 means timestamp information could not be collected for the kernel.
// 0 means timestamp information could not be collected for the kernel.
if
(
start
==
0
||
end
==
0
)
{
if
(
start
==
0
||
end
==
0
)
{
VLOG
(
3
)
<<
correlation_id
<<
" cannot be traced"
;
VLOG
(
3
)
<<
correlation_id
<<
" cannot be traced"
;
...
@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
continue
;
continue
;
}
}
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
GPUKernel
);
event
->
set_name
(
correlations_
.
at
(
r
.
correlation_id
));
event
->
set_name
(
correlations_
.
at
(
r
.
correlation_id
));
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_s
tream
_id
(
r
.
stream_id
);
event
->
set_s
ub_device
_id
(
r
.
stream_id
);
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
for
(
const
CPURecord
&
r
:
cpu_records_
)
{
for
(
const
CPURecord
&
r
:
cpu_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
CPU
);
event
->
set_name
(
r
.
name
);
event
->
set_name
(
r
.
name
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_s
tream
_id
(
r
.
thread_id
);
event
->
set_s
ub_device
_id
(
r
.
thread_id
);
event
->
set_device_id
(
-
1
);
event
->
set_device_id
(
r
.
device_id
);
}
}
for
(
const
MemRecord
&
r
:
mem_records_
)
{
for
(
const
MemRecord
&
r
:
mem_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
GPUKernel
);
event
->
set_name
(
r
.
name
);
event
->
set_name
(
r
.
name
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_s
tream
_id
(
r
.
stream_id
);
event
->
set_s
ub_device
_id
(
r
.
stream_id
);
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
}
}
...
@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
if
((
domain
==
CUPTI_CB_DOMAIN_DRIVER_API
)
&&
if
((
domain
==
CUPTI_CB_DOMAIN_DRIVER_API
)
&&
(
cbid
==
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel
))
{
(
cbid
==
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel
))
{
if
(
cbInfo
->
callbackSite
==
CUPTI_API_ENTER
)
{
if
(
cbInfo
->
callbackSite
==
CUPTI_API_ENTER
)
{
const
std
::
string
anno
=
const
std
::
string
anno
=
!
annotation_stack
.
empty
()
cur_annotation
?
cur_annotation
:
cbInfo
->
symbolName
;
?
annotation_stack
.
back
()
:
cbInfo
->
symbolName
;
tracer
->
AddAnnotation
(
cbInfo
->
correlationId
,
anno
);
tracer
->
AddAnnotation
(
cbInfo
->
correlationId
,
anno
);
}
}
}
else
{
}
else
{
...
@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {
...
@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
{}
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
{}
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
{}
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
{}
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32_t
device_id
,
uint32
_t
stream_id
,
uint64_t
end_ns
,
int64_t
device_id
,
int64
_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
{}
uint32_t
correlation_id
,
uint64_t
bytes
)
{}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32
_t
device_id
,
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
int64
_t
device_id
,
uint32
_t
stream_id
,
uint32_t
correlation_id
)
{}
int64
_t
stream_id
,
uint32_t
correlation_id
)
{}
bool
IsEnabled
()
{
return
false
;
}
bool
IsEnabled
()
{
return
false
;
}
...
@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
...
@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
return
tracer
;
return
tracer
;
}
}
void
SetCurAnnotation
(
const
char
*
anno
)
{
cur_annotation
=
anno
;
}
void
SetCurAnnotation
(
const
std
::
string
&
anno
)
{
annotation_stack
.
push_back
(
anno
);
}
void
ClearCurAnnotation
()
{
annotation_stack
.
pop_back
();
}
std
::
string
CurAnnotation
()
{
if
(
annotation_stack
.
empty
())
return
""
;
return
annotation_stack
.
back
();
}
void
SetCurBlock
(
int
block_id
)
{
block_id_stack
.
push_back
(
block_id
);
}
void
ClearCurBlock
()
{
block_id_stack
.
pop_back
();
}
int
BlockDepth
()
{
return
block_id_stack
.
size
();
}
void
SetCurThread
(
int
thread_id
)
{
cur_thread_id
=
thread_id
;
}
void
ClearCur
Annotation
()
{
cur_annotation
=
nullptr
;
}
void
ClearCur
Thread
()
{
cur_thread_id
=
0
;
}
const
char
*
CurAnnotation
()
{
return
cur_annotation
;
}
int
CurThread
()
{
return
cur_thread_id
;
}
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
paddle/fluid/platform/device_tracer.h
浏览文件 @
4840c49b
...
@@ -32,22 +32,23 @@ class DeviceTracer {
...
@@ -32,22 +32,23 @@ class DeviceTracer {
struct
KernelRecord
{
struct
KernelRecord
{
uint64_t
start_ns
;
uint64_t
start_ns
;
uint64_t
end_ns
;
uint64_t
end_ns
;
uint32
_t
device_id
;
int64
_t
device_id
;
uint32
_t
stream_id
;
int64
_t
stream_id
;
uint32_t
correlation_id
;
uint32_t
correlation_id
;
};
};
struct
CPURecord
{
struct
CPURecord
{
std
::
string
name
;
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
start_ns
;
uint64_t
end_ns
;
uint64_t
end_ns
;
uint64_t
thread_id
;
int64_t
device_id
;
int64_t
thread_id
;
};
};
struct
MemRecord
{
struct
MemRecord
{
std
::
string
name
;
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
start_ns
;
uint64_t
end_ns
;
uint64_t
end_ns
;
uint32
_t
device_id
;
int64
_t
device_id
;
uint32
_t
stream_id
;
int64
_t
stream_id
;
uint32_t
correlation_id
;
uint32_t
correlation_id
;
uint64_t
bytes
;
uint64_t
bytes
;
};
};
...
@@ -64,18 +65,18 @@ class DeviceTracer {
...
@@ -64,18 +65,18 @@ class DeviceTracer {
virtual
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
=
0
;
virtual
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
=
0
;
virtual
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
virtual
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32
_t
device_id
,
uint64_t
end_ns
,
int64
_t
device_id
,
uint32
_t
stream_id
,
uint32_t
correlation_id
,
int64
_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
=
0
;
uint64_t
bytes
)
=
0
;
virtual
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
virtual
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
=
0
;
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
// added before for human readability.
virtual
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
virtual
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
int64_t
device_id
,
uint32_t
device_id
,
uint32_t
stream_id
,
int64_t
stream_id
,
uint32_t
correlation_id
)
=
0
;
uint32_t
correlation_id
)
=
0
;
// Generate a proto after done (Disabled).
// Generate a proto after done (Disabled).
virtual
proto
::
Profile
GenProfile
(
const
std
::
string
&
profile_path
)
=
0
;
virtual
proto
::
Profile
GenProfile
(
const
std
::
string
&
profile_path
)
=
0
;
...
@@ -87,10 +88,18 @@ class DeviceTracer {
...
@@ -87,10 +88,18 @@ class DeviceTracer {
DeviceTracer
*
GetDeviceTracer
();
DeviceTracer
*
GetDeviceTracer
();
// Set a name for the cuda kernel operation being launched by the thread.
// Set a name for the cuda kernel operation being launched by the thread.
void
SetCurAnnotation
(
const
char
*
anno
);
void
SetCurAnnotation
(
const
std
::
string
&
anno
);
// Clear the name after the operation is done.
// Clear the name after the operation is done.
void
ClearCurAnnotation
();
void
ClearCurAnnotation
();
// Current name of the operation being run in the thread.
// Current name of the operation being run in the thread.
const
char
*
CurAnnotation
();
std
::
string
CurAnnotation
();
void
SetCurBlock
(
int
block_id
);
void
ClearCurBlock
();
int
BlockDepth
();
void
SetCurThread
(
int
thread_id
);
void
ClearCurThread
();
int
CurThread
();
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
paddle/fluid/platform/profiler.cc
浏览文件 @
4840c49b
...
@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
...
@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
name_
=
name
;
name_
=
name
;
PushEvent
(
name_
,
dev_ctx_
);
PushEvent
(
name_
,
dev_ctx_
);
// Maybe need the same push/pop behavior.
// Maybe need the same push/pop behavior.
SetCurAnnotation
(
name_
.
c_str
()
);
SetCurAnnotation
(
name_
);
}
}
RecordEvent
::~
RecordEvent
()
{
RecordEvent
::~
RecordEvent
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
if
(
tracer
)
{
tracer
->
AddCPURecords
(
CurAnnotation
(),
start_ns_
,
PosixInNsec
());
tracer
->
AddCPURecords
(
CurAnnotation
(),
start_ns_
,
PosixInNsec
(),
BlockDepth
(),
CurThread
());
}
}
ClearCurAnnotation
();
ClearCurAnnotation
();
PopEvent
(
name_
,
dev_ctx_
);
PopEvent
(
name_
,
dev_ctx_
);
}
}
RecordBlock
::
RecordBlock
(
int
block_id
)
:
start_ns_
(
PosixInNsec
())
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
SetCurBlock
(
block_id
);
name_
=
string
::
Sprintf
(
"block_%d"
,
block_id
);
}
RecordBlock
::~
RecordBlock
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
// We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id.
tracer
->
AddCPURecords
(
name_
,
start_ns_
,
PosixInNsec
(),
BlockDepth
(),
CurThread
());
}
ClearCurBlock
();
}
RecordThread
::
RecordThread
(
int
thread_id
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
SetCurThread
(
thread_id
);
}
RecordThread
::~
RecordThread
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
ClearCurThread
();
}
void
EnableProfiler
(
ProfilerState
state
)
{
void
EnableProfiler
(
ProfilerState
state
)
{
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
PADDLE_ENFORCE
(
state
!=
ProfilerState
::
kDisabled
,
"Can't enbale profling, since the input state is "
,
"Can't enbale profling, since the input state is "
,
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
4840c49b
...
@@ -118,6 +118,24 @@ struct RecordEvent {
...
@@ -118,6 +118,24 @@ struct RecordEvent {
std
::
string
full_name_
;
std
::
string
full_name_
;
};
};
struct
RecordBlock
{
explicit
RecordBlock
(
int
block_id
);
~
RecordBlock
();
private:
std
::
string
name_
;
uint64_t
start_ns_
;
int
block_id_
;
};
struct
RecordThread
{
explicit
RecordThread
(
int
thread_id
);
~
RecordThread
();
private:
uint64_t
start_ns_
;
};
// Return the event list of all threads. Assumed the returned value calls
// Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
...
...
paddle/fluid/platform/profiler.proto
浏览文件 @
4840c49b
...
@@ -18,12 +18,17 @@ package paddle.platform.proto;
...
@@ -18,12 +18,17 @@ package paddle.platform.proto;
message
MemCopy
{
optional
uint64
bytes
=
1
;
}
message
MemCopy
{
optional
uint64
bytes
=
1
;
}
message
Event
{
message
Event
{
enum
EventType
{
CPU
=
0
;
GPUKernel
=
1
;
}
optional
EventType
type
=
8
;
optional
string
name
=
1
;
optional
string
name
=
1
;
optional
uint64
start_ns
=
2
;
optional
uint64
start_ns
=
2
;
optional
uint64
end_ns
=
3
;
optional
uint64
end_ns
=
3
;
// When positive, it represents gpu id. When -1, it represents CPU.
// When positive, it represents gpu id. When -1, it represents CPU.
optional
int64
device_id
=
5
;
optional
int64
device_id
=
5
;
optional
uint32
stream
_id
=
6
;
optional
int64
sub_device
_id
=
6
;
optional
MemCopy
memcopy
=
7
;
optional
MemCopy
memcopy
=
7
;
}
}
...
...
python/paddle/fluid/tests/unittests/test_profiler.py
浏览文件 @
4840c49b
...
@@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase):
...
@@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase):
with
fluid
.
program_guard
(
main_program
,
startup_program
):
with
fluid
.
program_guard
(
main_program
,
startup_program
):
image
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
784
],
dtype
=
'float32'
)
image
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
784
],
dtype
=
'float32'
)
hidden1
=
fluid
.
layers
.
fc
(
input
=
image
,
size
=
128
,
act
=
'relu'
)
hidden1
=
fluid
.
layers
.
fc
(
input
=
image
,
size
=
64
,
act
=
'relu'
)
hidden2
=
fluid
.
layers
.
fc
(
input
=
hidden1
,
size
=
64
,
act
=
'relu'
)
i
=
layers
.
zeros
(
shape
=
[
1
],
dtype
=
'int64'
)
counter
=
fluid
.
layers
.
zeros
(
shape
=
[
1
],
dtype
=
'int64'
,
force_cpu
=
True
)
until
=
layers
.
fill_constant
([
1
],
dtype
=
'int64'
,
value
=
10
)
data_arr
=
layers
.
array_write
(
hidden1
,
i
)
cond
=
fluid
.
layers
.
less_than
(
x
=
counter
,
y
=
until
)
while_op
=
fluid
.
layers
.
While
(
cond
=
cond
)
with
while_op
.
block
():
hidden_n
=
fluid
.
layers
.
fc
(
input
=
hidden1
,
size
=
64
,
act
=
'relu'
)
layers
.
array_write
(
hidden_n
,
i
,
data_arr
)
fluid
.
layers
.
increment
(
x
=
counter
,
value
=
1
,
in_place
=
True
)
layers
.
less_than
(
x
=
counter
,
y
=
until
,
cond
=
cond
)
hidden_n
=
layers
.
array_read
(
data_arr
,
i
)
hidden2
=
fluid
.
layers
.
fc
(
input
=
hidden_n
,
size
=
64
,
act
=
'relu'
)
predict
=
fluid
.
layers
.
fc
(
input
=
hidden2
,
size
=
10
,
act
=
'softmax'
)
predict
=
fluid
.
layers
.
fc
(
input
=
hidden2
,
size
=
10
,
act
=
'softmax'
)
label
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'int64'
)
label
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'int64'
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict
,
label
=
label
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict
,
label
=
label
)
...
...
tools/timeline.py
浏览文件 @
4840c49b
...
@@ -121,27 +121,34 @@ class Timeline(object):
...
@@ -121,27 +121,34 @@ class Timeline(object):
def
_allocate_pids
(
self
):
def
_allocate_pids
(
self
):
for
event
in
self
.
_profile_pb
.
events
:
for
event
in
self
.
_profile_pb
.
events
:
if
event
.
device_id
not
in
self
.
_devices
:
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
pid
=
self
.
_allocate_pid
()
if
(
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
self
.
_devices
[
event
.
device_id
]
=
pid
pid
=
self
.
_allocate_pid
()
if
event
.
device_id
>=
0
:
self
.
_devices
[(
event
.
device_id
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"gpu:%s:stream:%d"
%
self
.
_chrome_trace
.
emit_pid
(
"cpu:block:%d"
%
(
pid
,
event
.
stream_id
),
pid
)
(
event
.
device_id
),
pid
)
elif
event
.
device_id
==
-
1
:
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
self
.
_chrome_trace
.
emit_pid
(
"cpu:thread_hash:%d"
%
if
(
event
.
device_id
,
"GPUKernel"
)
not
in
self
.
_devices
:
event
.
stream_id
,
pid
)
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"gpu:%d"
%
(
event
.
device_id
),
pid
)
def
_allocate_events
(
self
):
def
_allocate_events
(
self
):
for
event
in
self
.
_profile_pb
.
events
:
for
event
in
self
.
_profile_pb
.
events
:
pid
=
self
.
_devices
[
event
.
device_id
]
if
event
.
type
==
profiler_pb2
.
Event
.
CPU
:
type
=
"CPU"
elif
event
.
type
==
profiler_pb2
.
Event
.
GPUKernel
:
type
=
"GPUKernel"
pid
=
self
.
_devices
[(
event
.
device_id
,
type
)]
args
=
{
'name'
:
event
.
name
}
args
=
{
'name'
:
event
.
name
}
if
event
.
memcopy
.
bytes
>
0
:
if
event
.
memcopy
.
bytes
>
0
:
args
=
{
'mem_bytes'
:
event
.
memcopy
.
bytes
}
args
=
{
'mem_bytes'
:
event
.
memcopy
.
bytes
}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
# ops takes micro-seconds. Hence, we keep the ns here.
self
.
_chrome_trace
.
emit_region
(
event
.
start_ns
,
self
.
_chrome_trace
.
emit_region
(
(
event
.
end_ns
-
event
.
start_ns
)
/
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
1.0
,
pid
,
0
,
'Op'
,
event
.
name
,
args
)
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
def
generate_chrome_trace
(
self
):
def
generate_chrome_trace
(
self
):
self
.
_allocate_pids
()
self
.
_allocate_pids
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录