Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
92974d41
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
92974d41
编写于
3月 02, 2018
作者:
X
Xin Pan
提交者:
GitHub
3月 02, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #8679 from panyx0718/test2
Add CPU time and MemCopy to the timeline.
上级
0240bb77
f3cbfc02
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
157 addition
and
16 deletion
+157
-16
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+93
-3
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+25
-1
paddle/fluid/platform/dynload/cupti.h
paddle/fluid/platform/dynload/cupti.h
+2
-1
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+17
-6
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+1
-0
paddle/fluid/platform/profiler.proto
paddle/fluid/platform/profiler.proto
+6
-1
tools/timeline.py
tools/timeline.py
+13
-4
未找到文件。
paddle/fluid/platform/device_tracer.cc
浏览文件 @
92974d41
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <mutex>
#include <numeric>
#include <thread>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
...
...
@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
} \
} while (0)
std
::
string
MemcpyKind
(
CUpti_ActivityMemcpyKind
kind
)
{
switch
(
kind
)
{
case
CUPTI_ACTIVITY_MEMCPY_KIND_HTOD
:
return
"MEMCPY_HtoD"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_DTOH
:
return
"MEMCPY_DtoH"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_HTOA
:
return
"MEMCPY_HtoA"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_ATOH
:
return
"MEMCPY_AtoH"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_ATOA
:
return
"MEMCPY_AtoA"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_ATOD
:
return
"MEMCPY_AtoD"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_DTOA
:
return
"MEMCPY_DtoA"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_DTOD
:
return
"MEMCPY_DtoD"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_HTOH
:
return
"MEMCPY_HtoH"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_PTOP
:
return
"MEMCPY_PtoP"
;
case
CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT
:
return
"MEMCPY_FORCE_INT"
;
default:
break
;
}
return
"MEMCPY"
;
}
void
EnableActivity
()
{
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
...
...
@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel
->
correlationId
);
break
;
}
case
CUPTI_ACTIVITY_KIND_MEMCPY
:
{
auto
*
memcpy
=
reinterpret_cast
<
const
CUpti_ActivityMemcpy
*>
(
record
);
tracer
->
AddMemRecords
(
MemcpyKind
(
static_cast
<
CUpti_ActivityMemcpyKind
>
(
memcpy
->
copyKind
)),
memcpy
->
start
,
memcpy
->
end
,
memcpy
->
deviceId
,
memcpy
->
streamId
,
memcpy
->
correlationId
,
memcpy
->
bytes
);
break
;
}
case
CUPTI_ACTIVITY_KIND_MEMCPY2
:
{
auto
*
memcpy
=
reinterpret_cast
<
const
CUpti_ActivityMemcpy2
*>
(
record
);
tracer
->
AddMemRecords
(
MemcpyKind
(
static_cast
<
CUpti_ActivityMemcpyKind
>
(
memcpy
->
copyKind
)),
memcpy
->
start
,
memcpy
->
end
,
memcpy
->
deviceId
,
memcpy
->
streamId
,
memcpy
->
correlationId
,
memcpy
->
bytes
);
break
;
}
default:
{
break
;
}
}
}
else
if
(
status
==
CUPTI_ERROR_MAX_LIMIT_REACHED
)
{
...
...
@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
[
id
]
=
anno
;
}
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
cpu_records_
.
push_back
(
CPURecord
{
anno
,
start_ns
,
end_ns
,
std
::
hash
<
std
::
thread
::
id
>
{}(
std
::
this_thread
::
get_id
())});
}
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
{
mem_records_
.
push_back
(
MemRecord
{
name
,
start_ns
,
end_ns
,
device_id
,
stream_id
,
correlation_id
,
bytes
});
}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
...
...
@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL
(
dynload
::
cuptiEnableCallback
(
1
,
subscriber_
,
CUPTI_CB_DOMAIN_DRIVER_API
,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel
));
CUPTI_CALL
(
dynload
::
cuptiGetTimestamp
(
&
start_ns_
));
enabled_
=
true
;
}
...
...
@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
proto
::
Profile
profile_pb
;
profile_pb
.
set_start_ns
(
start_ns_
);
profile_pb
.
set_end_ns
(
end_ns_
);
std
::
map
<
std
::
string
,
std
::
vector
<
uint64_t
>>
event_times
;
for
(
const
KernelRecord
&
r
:
kernel_records_
)
{
if
(
correlations_
.
find
(
r
.
correlation_id
)
==
correlations_
.
end
())
{
fprintf
(
stderr
,
"cannot relate a kernel activity
\n
"
);
...
...
@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_stream_id
(
r
.
stream_id
);
event
->
set_device_id
(
r
.
device_id
);
event_times
[
event
->
name
()].
push_back
(
r
.
end_ns
-
r
.
start_ns
);
}
for
(
const
CPURecord
&
r
:
cpu_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_name
(
r
.
name
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_stream_id
(
r
.
thread_id
);
event
->
set_device_id
(
-
1
);
}
for
(
const
MemRecord
&
r
:
mem_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_name
(
r
.
name
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_stream_id
(
r
.
stream_id
);
event
->
set_device_id
(
r
.
device_id
);
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
}
std
::
string
profile_str
;
google
::
protobuf
::
TextFormat
::
PrintToString
(
profile_pb
,
&
profile_str
);
...
...
@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t
start_ns_
;
uint64_t
end_ns_
;
std
::
vector
<
KernelRecord
>
kernel_records_
;
std
::
vector
<
MemRecord
>
mem_records_
;
std
::
vector
<
CPURecord
>
cpu_records_
;
std
::
unordered_map
<
uint32_t
,
std
::
string
>
correlations_
;
CUpti_SubscriberHandle
subscriber_
;
};
...
...
@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
{}
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
{}
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
{}
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
)
{}
...
...
@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void
ClearCurAnnotation
()
{
cur_annotation
=
nullptr
;
}
const
char
*
CurAnnotation
()
{
return
cur_annotation
;
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/device_tracer.h
浏览文件 @
92974d41
...
...
@@ -36,6 +36,21 @@ class DeviceTracer {
uint32_t
stream_id
;
uint32_t
correlation_id
;
};
struct
CPURecord
{
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
end_ns
;
uint64_t
thread_id
;
};
struct
MemRecord
{
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
end_ns
;
uint32_t
device_id
;
uint32_t
stream_id
;
uint32_t
correlation_id
;
uint64_t
bytes
;
};
virtual
~
DeviceTracer
()
{}
// Needs to be called once before use.
...
...
@@ -48,6 +63,14 @@ class DeviceTracer {
// human-readable annotations.
virtual
void
AddAnnotation
(
uint64_t
id
,
const
std
::
string
&
anno
)
=
0
;
virtual
void
AddMemRecords
(
const
std
::
string
&
name
,
uint64_t
start_ns
,
uint64_t
end_ns
,
uint32_t
device_id
,
uint32_t
stream_id
,
uint32_t
correlation_id
,
uint64_t
bytes
)
=
0
;
virtual
void
AddCPURecords
(
const
char
*
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual
void
AddKernelRecords
(
uint64_t
start
,
uint64_t
end
,
...
...
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
void
SetCurAnnotation
(
const
char
*
anno
);
// Clear the name after the operation is done.
void
ClearCurAnnotation
();
// Current name of the operation being run in the thread.
const
char
*
CurAnnotation
();
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/cupti.h
浏览文件 @
92974d41
...
...
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
);
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
92974d41
...
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
...
...
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.
count
();
}
inline
uint64_t
PosixInNsec
()
{
struct
timeval
tv
;
gettimeofday
(
&
tv
,
nullptr
);
return
1000
*
(
static_cast
<
uint64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
);
}
Event
::
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
const
DeviceContext
*
dev_ctx
)
:
kind_
(
kind
),
name_
(
name
),
thread_id_
(
thread_id
),
has_cuda_
(
false
)
{
...
...
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList
().
Record
(
EventKind
::
kPopRange
,
name
,
g_thread_id
,
dev_ctx
);
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
:
start_ns_
(
PosixInNsec
()
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
name_
=
name
;
...
...
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
RecordEvent
::~
RecordEvent
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
tracer
->
AddCPURecords
(
CurAnnotation
(),
start_ns_
,
PosixInNsec
());
}
ClearCurAnnotation
();
PopEvent
(
name_
,
dev_ctx_
);
}
...
...
@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark
(
"_stop_profiler_"
,
nullptr
);
g_state
=
ProfilerState
::
kDisabled
;
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
sorted_key
);
ResetProfiler
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
g_profiler_place
==
"All"
&&
tracer
&&
tracer
->
IsEnabled
())
{
tracer
->
Disable
();
tracer
->
GenProfile
(
profile_path
);
}
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
sorted_key
);
ResetProfiler
();
}
void
ParseEvents
(
std
::
vector
<
std
::
vector
<
Event
>>&
events
,
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
92974d41
...
...
@@ -108,6 +108,7 @@ struct RecordEvent {
~
RecordEvent
();
uint64_t
start_ns_
;
// The device context is used by Event to get the current cuda stream.
const
DeviceContext
*
dev_ctx_
;
// Event name
...
...
paddle/fluid/platform/profiler.proto
浏览文件 @
92974d41
...
...
@@ -15,12 +15,17 @@ limitations under the License. */
syntax
=
"proto2"
;
package
paddle
.
platform.proto
;
message
MemCopy
{
optional
uint64
bytes
=
3
;
}
message
Event
{
optional
string
name
=
1
;
optional
uint64
start_ns
=
2
;
optional
uint64
end_ns
=
3
;
optional
uint32
device_id
=
5
;
// When positive, it represents gpu id. When -1, it represents CPU.
optional
int64
device_id
=
5
;
optional
uint32
stream_id
=
6
;
optional
MemCopy
memcopy
=
7
;
}
message
Profile
{
...
...
tools/timeline.py
浏览文件 @
92974d41
...
...
@@ -124,15 +124,24 @@ class Timeline(object):
if
event
.
device_id
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_devices
[
event
.
device_id
]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"device:%s"
%
pid
,
pid
)
if
event
.
device_id
>=
0
:
self
.
_chrome_trace
.
emit_pid
(
"gpu:%s:stream:%d"
%
(
pid
,
event
.
stream_id
),
pid
)
elif
event
.
device_id
==
-
1
:
self
.
_chrome_trace
.
emit_pid
(
"cpu:thread_hash:%d"
%
event
.
stream_id
,
pid
)
def
_allocate_events
(
self
):
for
event
in
self
.
_profile_pb
.
events
:
pid
=
self
.
_devices
[
event
.
device_id
]
args
=
{
'name'
:
event
.
name
}
self
.
_chrome_trace
.
emit_region
(
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1000000.0
,
pid
,
0
,
'Op'
,
event
.
name
,
args
)
if
event
.
memcopy
.
bytes
>
0
:
args
=
{
'mem_bytes'
:
event
.
memcopy
.
bytes
}
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self
.
_chrome_trace
.
emit_region
(
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
0
,
'Op'
,
event
.
name
,
args
)
def
generate_chrome_trace
(
self
):
self
.
_allocate_pids
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录