Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
09799566
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
09799566
编写于
3月 12, 2019
作者:
C
chengduo
提交者:
GitHub
3月 12, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add memory profiler (#16137)
test=develop
上级
05993c3f
变更
9
显示空白变更内容
内联
并排
Showing
9 changed file
with
505 addition
and
77 deletion
+505
-77
paddle/fluid/memory/allocation/CMakeLists.txt
paddle/fluid/memory/allocation/CMakeLists.txt
+1
-1
paddle/fluid/memory/allocation/legacy_allocator.cc
paddle/fluid/memory/allocation/legacy_allocator.cc
+8
-4
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+55
-4
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+21
-0
paddle/fluid/platform/event.h
paddle/fluid/platform/event.h
+33
-0
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+190
-67
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+76
-1
paddle/fluid/platform/profiler.proto
paddle/fluid/platform/profiler.proto
+17
-0
tools/timeline.py
tools/timeline.py
+104
-0
未找到文件。
paddle/fluid/memory/allocation/CMakeLists.txt
浏览文件 @
09799566
...
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
...
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library
(
best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator
)
cc_library
(
best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator
)
cc_library
(
locked_allocator SRCS locked_allocator.cc DEPS allocator
)
cc_library
(
locked_allocator SRCS locked_allocator.cc DEPS allocator
)
cc_library
(
buffered_allocator SRCS buffered_allocator.cc DEPS allocator
)
cc_library
(
buffered_allocator SRCS buffered_allocator.cc DEPS allocator
)
cc_library
(
legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator
)
cc_library
(
legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator
profiler
)
cc_test
(
buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator
)
cc_test
(
buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
...
...
paddle/fluid/memory/allocation/legacy_allocator.cc
浏览文件 @
09799566
...
@@ -12,8 +12,6 @@
...
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <utility>
#include <utility>
...
@@ -24,9 +22,11 @@
...
@@ -24,9 +22,11 @@
#endif
#endif
#include "glog/logging.h"
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/split.h"
...
@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
...
@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
}
// namespace legacy
}
// namespace legacy
namespace
allocation
{
namespace
allocation
{
LegacyMemMonitor
GPUMemMonitor
;
LegacyMemMonitor
GPUMemMonitor
;
Allocation
*
LegacyAllocator
::
AllocateImpl
(
size_t
size
,
Allocator
::
Attr
attr
)
{
Allocation
*
LegacyAllocator
::
AllocateImpl
(
size_t
size
,
Allocator
::
Attr
attr
)
{
void
*
ptr
=
boost
::
apply_visitor
(
legacy
::
AllocVisitor
(
size
),
place_
);
void
*
ptr
=
boost
::
apply_visitor
(
legacy
::
AllocVisitor
(
size
),
place_
);
return
new
Allocation
(
ptr
,
size
,
place_
);
auto
*
tmp_alloc
=
new
Allocation
(
ptr
,
size
,
place_
);
platform
::
MemEvenRecorder
::
Instance
().
PushMemRecord
(
static_cast
<
void
*>
(
tmp_alloc
),
place_
,
size
);
return
tmp_alloc
;
}
}
void
LegacyAllocator
::
Free
(
Allocation
*
allocation
)
{
void
LegacyAllocator
::
Free
(
Allocation
*
allocation
)
{
boost
::
apply_visitor
(
boost
::
apply_visitor
(
legacy
::
FreeVisitor
(
allocation
->
ptr
(),
allocation
->
size
()),
legacy
::
FreeVisitor
(
allocation
->
ptr
(),
allocation
->
size
()),
allocation
->
place
());
allocation
->
place
());
platform
::
MemEvenRecorder
::
Instance
().
PopMemRecord
(
static_cast
<
void
*>
(
allocation
),
place_
);
delete
allocation
;
delete
allocation
;
}
}
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
09799566
...
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque>
#include <deque>
#include <forward_list>
#include <forward_list>
...
@@ -30,6 +29,8 @@ limitations under the License. */
...
@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id
,
correlation_id
,
bytes
});
stream_id
,
correlation_id
,
bytes
});
}
}
void
AddMemInfoRecord
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
alloc_in
,
const
std
::
string
&
free_in
,
int64_t
thread_id
)
{
if
(
0
==
start_ns
||
0
==
end_ns
)
{
VLOG
(
3
)
<<
alloc_in
<<
", "
<<
free_in
<<
" Cannot be traced."
;
return
;
}
thread_local
std
::
forward_list
<
MemInfoRecord
>
*
local_mem_info_record
=
nullptr
;
if
(
local_mem_info_record
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
mem_info_record_
.
emplace_front
();
local_mem_info_record
=
&
mem_info_record_
.
front
();
}
local_mem_info_record
->
emplace_front
(
MemInfoRecord
{
start_ns
,
end_ns
,
bytes
,
place
,
thread_id
,
alloc_in
,
free_in
});
}
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
{
int64_t
thread_id
,
uint32_t
correlation_id
)
{
...
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
.
clear
();
correlations_
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
mem_info_record_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
active_kind_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
active_kind_records_
)
tmp
.
clear
();
}
}
...
@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto
::
Profile
profile_pb
;
proto
::
Profile
profile_pb
;
profile_pb
.
set_start_ns
(
start_ns_
);
profile_pb
.
set_start_ns
(
start_ns_
);
profile_pb
.
set_end_ns
(
end_ns_
);
profile_pb
.
set_end_ns
(
end_ns_
);
if
(
correlations_
.
empty
())
if
(
correlations_
.
empty
())
{
for
(
auto
&
tmp
:
correlations_pairs
)
for
(
auto
&
tmp
:
correlations_pairs
)
{
for
(
auto
&
pair
:
tmp
)
correlations_
[
pair
.
first
]
=
pair
.
second
;
for
(
auto
&
pair
:
tmp
)
correlations_
[
pair
.
first
]
=
pair
.
second
;
}
}
for
(
const
KernelRecord
&
r
:
kernel_records_
)
{
for
(
const
KernelRecord
&
r
:
kernel_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
GPUKernel
);
event
->
set_type
(
proto
::
Event
::
GPUKernel
);
...
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
cpu_records_
)
{
for
(
auto
&
tmp
:
cpu_records_
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
...
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
}
}
for
(
auto
&
tmp
:
active_kind_records_
)
{
for
(
auto
&
tmp
:
active_kind_records_
)
{
for
(
const
ActiveKindRecord
&
r
:
tmp
)
{
for
(
const
ActiveKindRecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
...
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
}
}
VLOG
(
1
)
<<
"MemRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
VLOG
(
1
)
<<
"MemRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
mem_info_record_
)
{
for
(
const
auto
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_mem_events
();
event
->
set_device_id
(
0
);
if
(
platform
::
is_cpu_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CPUPlace
);
}
else
if
(
platform
::
is_gpu_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CUDAPlace
);
event
->
set_device_id
(
boost
::
get
<
platform
::
CUDAPlace
>
(
r
.
place
).
GetDeviceId
());
}
else
if
(
platform
::
is_cuda_pinned_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CUDAPinnedPlace
);
}
else
{
PADDLE_THROW
(
"The current place is not supported."
);
}
event
->
set_alloc_in
(
r
.
alloc_in
);
event
->
set_free_in
(
r
.
free_in
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_bytes
(
r
.
bytes
);
event
->
set_thread_id
(
r
.
thread_id
);
}
}
std
::
ofstream
profile_f
;
std
::
ofstream
profile_f
;
profile_f
.
open
(
profile_path
,
profile_f
.
open
(
profile_path
,
std
::
ios
::
out
|
std
::
ios
::
trunc
|
std
::
ios
::
binary
);
std
::
ios
::
out
|
std
::
ios
::
trunc
|
std
::
ios
::
binary
);
...
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
MemInfoRecord
>>
mem_info_record_
;
std
::
forward_list
<
std
::
forward_list
<
ActiveKindRecord
>>
active_kind_records_
;
std
::
forward_list
<
std
::
forward_list
<
ActiveKindRecord
>>
active_kind_records_
;
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
correlations_pairs
;
correlations_pairs
;
...
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
...
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
return
annotation_stack
.
back
();
return
annotation_stack
.
back
();
}
}
std
::
string
CurAnnotationName
()
{
std
::
string
CurAnnotationName
()
{
if
(
annotation_stack
.
empty
())
return
""
;
if
(
annotation_stack
.
empty
())
return
"
Unknown
"
;
return
annotation_stack
.
back
()
->
name
();
return
annotation_stack
.
back
()
->
name
();
}
}
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
09799566
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler.pb.h"
...
@@ -47,6 +48,7 @@ class DeviceTracer {
...
@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t
stream_id
;
int64_t
stream_id
;
uint32_t
correlation_id
;
uint32_t
correlation_id
;
};
};
struct
CPURecord
{
struct
CPURecord
{
std
::
string
name
;
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
start_ns
;
...
@@ -54,6 +56,7 @@ class DeviceTracer {
...
@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t
device_id
;
int64_t
device_id
;
int64_t
thread_id
;
int64_t
thread_id
;
};
};
struct
MemRecord
{
struct
MemRecord
{
std
::
string
name
;
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
start_ns
;
...
@@ -63,6 +66,17 @@ class DeviceTracer {
...
@@ -63,6 +66,17 @@ class DeviceTracer {
uint32_t
correlation_id
;
uint32_t
correlation_id
;
uint64_t
bytes
;
uint64_t
bytes
;
};
};
struct
MemInfoRecord
{
uint64_t
start_ns
;
uint64_t
end_ns
;
size_t
bytes
;
Place
place
;
int64_t
thread_id
;
std
::
string
alloc_in
;
std
::
string
free_in
;
};
struct
ActiveKindRecord
{
struct
ActiveKindRecord
{
std
::
string
name
;
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
start_ns
;
...
@@ -71,6 +85,7 @@ class DeviceTracer {
...
@@ -71,6 +85,7 @@ class DeviceTracer {
int64_t
thread_id
;
int64_t
thread_id
;
uint32_t
correlation_id
;
uint32_t
correlation_id
;
};
};
virtual
~
DeviceTracer
()
{}
virtual
~
DeviceTracer
()
{}
// Needs to be called once before use.
// Needs to be called once before use.
virtual
void
Enable
()
=
0
;
virtual
void
Enable
()
=
0
;
...
@@ -97,6 +112,12 @@ class DeviceTracer {
...
@@ -97,6 +112,12 @@ class DeviceTracer {
int64_t
thread_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
=
0
;
uint32_t
correlation_id
)
=
0
;
virtual
void
AddMemInfoRecord
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
alloc_in
,
const
std
::
string
&
free_in
,
int64_t
thread_id
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
// added before for human readability.
virtual
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
virtual
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
...
...
paddle/fluid/platform/event.h
浏览文件 @
09799566
...
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
...
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <string>
#include <string>
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#endif
#endif
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -64,5 +66,36 @@ class Event {
...
@@ -64,5 +66,36 @@ class Event {
#endif
#endif
#endif
#endif
};
};
class
MemEvent
{
public:
MemEvent
(
EventType
type
,
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
Place
place
,
int64_t
thread_id
,
const
std
::
string
&
annotation
)
:
type_
(
type
),
start_ns_
(
start_ns
),
end_ns_
(
end_ns
),
bytes_
(
bytes
),
place_
(
place
),
thread_id_
(
thread_id
),
annotation_
(
annotation
)
{}
const
EventType
&
type
()
const
{
return
type_
;
}
uint64_t
start_ns
()
const
{
return
start_ns_
;
}
uint64_t
end_ns
()
const
{
return
end_ns_
;
}
size_t
bytes
()
const
{
return
bytes_
;
}
Place
place
()
const
{
return
place_
;
}
int64_t
thread_id
()
const
{
return
thread_id_
;
}
const
std
::
string
&
annotation
()
const
{
return
annotation_
;
}
private:
EventType
type_
;
uint64_t
start_ns_
=
0
;
uint64_t
end_ns_
=
0
;
size_t
bytes_
;
Place
place_
;
int64_t
thread_id_
;
std
::
string
annotation_
;
};
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
paddle/fluid/platform/profiler.cc
浏览文件 @
09799566
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
#include <algorithm>
#include <algorithm>
#include <iomanip>
#include <iomanip>
#include <limits>
#include <limits>
...
@@ -21,6 +20,8 @@ limitations under the License. */
...
@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <mutex> // NOLINT
#include <random>
#include <random>
#include <string>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#endif // PADDLE_WITH_CUDA
...
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
...
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
struct
EventList
;
static
int64_t
profiler_lister_id
=
0
;
static
int64_t
profiler_lister_id
=
0
;
static
bool
should_send_profile_state
=
false
;
static
bool
should_send_profile_state
=
false
;
std
::
mutex
profiler_mu
;
std
::
mutex
profiler_mu
;
...
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
...
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex
// The global mutex
static
std
::
mutex
g_all_event_lists_mutex
;
static
std
::
mutex
g_all_event_lists_mutex
;
// The total event lists of all threads
// The total event lists of all threads
static
std
::
list
<
std
::
shared_ptr
<
EventList
>>
g_all_event_lists
;
static
std
::
list
<
std
::
shared_ptr
<
EventList
<
Event
>
>>
g_all_event_lists
;
// The thread local event list only can be accessed by the specific thread
// The thread local event list only can be accessed by the specific thread
static
thread_local
std
::
shared_ptr
<
EventList
>
g_event_list
;
static
thread_local
std
::
shared_ptr
<
EventList
<
Event
>>
g_event_list
;
struct
EventList
{
constexpr
static
size_t
kMB
=
1024
*
1024
;
constexpr
static
size_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
size_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
size_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
size_t
kNumBlock
=
kEventBlockSize
/
((
kEventSize
+
kEventAlign
-
1
)
/
kEventAlign
*
kEventAlign
);
template
<
typename
...
Args
>
Event
*
Record
(
Args
&&
...
args
)
{
if
(
event_blocks
.
empty
()
||
event_blocks
.
front
().
size
()
==
kNumBlock
)
{
event_blocks
.
emplace_front
();
event_blocks
.
front
().
reserve
(
kNumBlock
);
}
event_blocks
.
front
().
emplace_back
(
std
::
forward
<
Args
>
(
args
)...);
return
&
event_blocks
.
front
().
back
();
}
std
::
vector
<
Event
>
Reduce
()
{
std
::
vector
<
Event
>
result
;
for
(
auto
&
block
:
event_blocks
)
{
result
.
insert
(
result
.
begin
(),
std
::
make_move_iterator
(
block
.
begin
()),
std
::
make_move_iterator
(
block
.
end
()));
}
event_blocks
.
clear
();
return
result
;
}
void
Clear
()
{
event_blocks
.
clear
();
}
std
::
forward_list
<
std
::
vector
<
Event
>>
event_blocks
;
static
std
::
list
<
std
::
shared_ptr
<
EventList
<
MemEvent
>>>
g_all_mem_event_lists
;
};
static
thread_local
std
::
shared_ptr
<
EventList
<
MemEvent
>>
g_mem_event_list
;
static
std
::
mutex
g_all_mem_event_lists_mutex
;
static
thread_local
int32_t
g_mem_thread_id
;
static
uint32_t
g_mem_next_thread_id
=
0
;
inline
uint64_t
GetTimeInNsec
()
{
inline
uint64_t
GetTimeInNsec
()
{
using
clock
=
std
::
conditional
<
std
::
chrono
::
high_resolution_clock
::
is_steady
,
using
clock
=
std
::
conditional
<
std
::
chrono
::
high_resolution_clock
::
is_steady
,
...
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
...
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_
=
GetTimeInNsec
();
cpu_ns_
=
GetTimeInNsec
();
}
}
const
EventType
&
Event
::
type
()
const
{
return
type_
;
}
const
EventType
&
Event
::
type
()
const
{
return
type_
;
}
double
Event
::
CpuElapsedMs
(
const
Event
&
e
)
const
{
double
Event
::
CpuElapsedMs
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000000.0
);
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000000.0
);
}
}
double
Event
::
CudaElapsedMs
(
const
Event
&
e
)
const
{
double
Event
::
CudaElapsedMs
(
const
Event
&
e
)
const
{
#ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_CUPTI
return
gpu_ns_
/
1000000.0
;
return
gpu_ns_
/
1000000.0
;
#else
#else
...
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
...
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif
#endif
}
}
inline
EventList
&
GetEventList
()
{
inline
EventList
<
MemEvent
>
&
GetMemEventList
()
{
if
(
!
g_mem_event_list
)
{
g_mem_event_list
=
std
::
make_shared
<
EventList
<
MemEvent
>>
();
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_mem_event_lists_mutex
);
g_mem_thread_id
=
g_mem_next_thread_id
++
;
g_all_mem_event_lists
.
emplace_front
(
g_mem_event_list
);
}
return
*
g_mem_event_list
;
}
void
PushMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
annotation
)
{
GetMemEventList
().
Record
(
EventType
::
kPushRange
,
start_ns
,
end_ns
,
bytes
,
place
,
g_mem_thread_id
,
annotation
);
}
void
PopMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
annotation
)
{
GetMemEventList
().
Record
(
EventType
::
kPopRange
,
start_ns
,
end_ns
,
bytes
,
place
,
g_mem_thread_id
,
annotation
);
}
inline
EventList
<
Event
>
&
GetEventList
()
{
if
(
!
g_event_list
)
{
if
(
!
g_event_list
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
g_event_list
=
std
::
make_shared
<
EventList
>
();
g_event_list
=
std
::
make_shared
<
EventList
<
Event
>
>
();
g_thread_id
=
g_next_thread_id
++
;
g_thread_id
=
g_next_thread_id
++
;
g_all_event_lists
.
emplace_front
(
g_event_list
);
g_all_event_lists
.
emplace_front
(
g_event_list
);
RecoreCurThreadId
(
g_thread_id
);
RecoreCurThreadId
(
g_thread_id
);
...
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
...
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return
*
g_event_list
;
return
*
g_event_list
;
}
}
void
Mark
(
const
std
::
string
&
name
)
{
void
Mark
(
const
std
::
string
&
name
)
{
GetEventList
().
Record
(
EventType
::
kMark
,
name
,
g_thread_id
);
GetEventList
().
Record
(
EventType
::
kMark
,
name
,
g_thread_id
);
}
}
Event
*
PushEvent
(
const
std
::
string
&
name
)
{
Event
*
PushEvent
(
const
std
::
string
&
name
)
{
return
GetEventList
().
Record
(
EventType
::
kPushRange
,
name
,
g_thread_id
);
return
GetEventList
().
Record
(
EventType
::
kPushRange
,
name
,
g_thread_id
);
}
}
void
PopEvent
(
const
std
::
string
&
name
)
{
void
PopEvent
(
const
std
::
string
&
name
)
{
GetEventList
().
Record
(
EventType
::
kPopRange
,
name
,
g_thread_id
);
GetEventList
().
Record
(
EventType
::
kPopRange
,
name
,
g_thread_id
);
}
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
)
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
)
:
is_enabled_
(
false
),
start_ns_
(
PosixInNsec
())
{
:
is_enabled_
(
false
),
start_ns_
(
PosixInNsec
())
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// lock is not needed, the code below is thread-safe
// lock is not needed, the code below is thread-safe
is_enabled_
=
true
;
is_enabled_
=
true
;
name_
=
name
;
name_
=
name
;
Event
*
e
=
PushEvent
(
name_
);
Event
*
e
=
PushEvent
(
name_
);
// Maybe need the same push/pop behavior.
// Maybe need the same push/pop behavior.
SetCurAnnotation
(
e
);
SetCurAnnotation
(
e
);
}
}
...
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
...
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent
::~
RecordEvent
()
{
RecordEvent
::~
RecordEvent
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
// lock is not needed, the code below is thread-safe
// lock is not needed, the code below is thread-safe
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
if
(
tracer
)
{
tracer
->
AddCPURecords
(
CurAnnotationName
(),
start_ns_
,
PosixInNsec
(),
tracer
->
AddCPURecords
(
CurAnnotationName
(),
start_ns_
,
PosixInNsec
(),
BlockDepth
(),
g_thread_id
);
BlockDepth
(),
g_thread_id
);
...
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
...
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent
(
name_
);
PopEvent
(
name_
);
}
}
RecordRPCEvent
::
RecordRPCEvent
(
const
std
::
string
&
name
)
{
MemEvenRecorder
MemEvenRecorder
::
recorder
;
void
MemEvenRecorder
::
PushMemRecord
(
const
void
*
ptr
,
const
Place
&
place
,
size_t
size
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
auto
&
events
=
address_memevent_
[
place
];
PADDLE_ENFORCE
(
events
.
count
(
ptr
)
==
0
,
""
);
events
.
emplace
(
ptr
,
std
::
unique_ptr
<
RecordMemEvent
>
(
new
MemEvenRecorder
::
RecordMemEvent
(
place
,
size
)));
}
void
MemEvenRecorder
::
PopMemRecord
(
const
void
*
ptr
,
const
Place
&
place
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
auto
&
events
=
address_memevent_
[
place
];
auto
iter
=
events
.
find
(
ptr
);
// The ptr maybe not in address_memevent
if
(
iter
!=
events
.
end
())
{
events
.
erase
(
iter
);
}
}
void
MemEvenRecorder
::
Flush
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
address_memevent_
.
clear
();
}
MemEvenRecorder
::
RecordMemEvent
::
RecordMemEvent
(
const
Place
&
place
,
size_t
bytes
)
:
place_
(
place
),
bytes_
(
bytes
),
start_ns_
(
PosixInNsec
()),
alloc_in_
(
CurAnnotationName
())
{
PushMemEvent
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
alloc_in_
);
}
MemEvenRecorder
::
RecordMemEvent
::~
RecordMemEvent
()
{
DeviceTracer
*
tracer
=
GetDeviceTracer
();
end_ns_
=
PosixInNsec
();
auto
annotation_free
=
CurAnnotationName
();
if
(
tracer
)
{
tracer
->
AddMemInfoRecord
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
alloc_in_
,
annotation_free
,
g_mem_thread_id
);
}
PopMemEvent
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
annotation_free
);
}
RecordRPCEvent
::
RecordRPCEvent
(
const
std
::
string
&
name
)
{
if
(
FLAGS_enable_rpc_profiler
)
{
if
(
FLAGS_enable_rpc_profiler
)
{
event_
.
reset
(
new
platform
::
RecordEvent
(
name
));
event_
.
reset
(
new
platform
::
RecordEvent
(
name
));
}
}
...
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
...
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock
::~
RecordBlock
()
{
RecordBlock
::~
RecordBlock
()
{
// lock is not needed, the code below is thread-safe
// lock is not needed, the code below is thread-safe
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
if
(
tracer
)
{
// We try to put all blocks at the same nested depth in the
// We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id.
// same timeline lane. and distinguish the using thread_id.
...
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
...
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void
ResetProfiler
()
{
void
ResetProfiler
()
{
SynchronizeAllDevice
();
SynchronizeAllDevice
();
GetDeviceTracer
()
->
Reset
();
GetDeviceTracer
()
->
Reset
();
MemEvenRecorder
::
Instance
().
Flush
();
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
++
it
)
{
(
*
it
)
->
Clear
();
(
*
it
)
->
Clear
();
}
}
for
(
auto
it
=
g_all_mem_event_lists
.
begin
();
it
!=
g_all_mem_event_lists
.
end
();
++
it
)
{
(
*
it
)
->
Clear
();
}
}
}
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
()
{
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
()
{
...
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
...
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return
result
;
return
result
;
}
}
std
::
vector
<
std
::
vector
<
MemEvent
>>
GetMemEvents
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_mem_event_lists_mutex
);
std
::
vector
<
std
::
vector
<
MemEvent
>>
result
;
for
(
auto
&
it
:
g_all_mem_event_lists
)
{
result
.
emplace_back
((
*
it
).
Reduce
());
}
return
result
;
}
// The information of each event given in the profiling report
// The information of each event given in the profiling report
struct
EventItem
{
struct
EventItem
{
std
::
string
name
;
std
::
string
name
;
...
@@ -263,8 +319,8 @@ struct EventItem {
...
@@ -263,8 +319,8 @@ struct EventItem {
};
};
// Print results
// Print results
void
PrintProfiler
(
const
std
::
vector
<
std
::
vector
<
EventItem
>>
&
events_table
,
void
PrintProfiler
(
const
std
::
vector
<
std
::
vector
<
EventItem
>>
&
events_table
,
const
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
,
bool
merge_thread
)
{
const
size_t
data_width
,
bool
merge_thread
)
{
// Output header information
// Output header information
std
::
cout
<<
"
\n
------------------------->"
std
::
cout
<<
"
\n
------------------------->"
...
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
...
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<<
std
::
setw
(
data_width
)
<<
"Ratio."
<<
std
::
endl
;
<<
std
::
setw
(
data_width
)
<<
"Ratio."
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
events_table
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
events_table
.
size
();
++
i
)
{
for
(
size_t
j
=
0
;
j
<
events_table
[
i
].
size
();
++
j
)
{
for
(
size_t
j
=
0
;
j
<
events_table
[
i
].
size
();
++
j
)
{
const
EventItem
&
event_item
=
events_table
[
i
][
j
];
const
EventItem
&
event_item
=
events_table
[
i
][
j
];
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_item
.
name
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_item
.
name
<<
std
::
setw
(
data_width
)
<<
event_item
.
calls
<<
std
::
setw
(
data_width
)
<<
event_item
.
calls
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
;
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
;
...
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
...
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
}
}
// Parse the event list and output the profiling report
// Parse the event list and output the profiling report
void
ParseEvents
(
const
std
::
vector
<
std
::
vector
<
Event
>>
&
events
,
void
ParseEvents
(
const
std
::
vector
<
std
::
vector
<
Event
>>
&
events
,
bool
merge_thread
,
bool
merge_thread
,
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
)
{
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
merge_thread
&&
events
.
size
()
<
2
)
return
;
if
(
merge_thread
&&
events
.
size
()
<
2
)
return
;
std
::
string
sorted_domain
;
std
::
string
sorted_domain
;
std
::
function
<
bool
(
const
EventItem
&
,
const
EventItem
&
)
>
sorted_func
;
std
::
function
<
bool
(
const
EventItem
&
,
const
EventItem
&
)
>
sorted_func
;
switch
(
sorted_by
)
{
switch
(
sorted_by
)
{
case
EventSortingKey
::
kCalls
:
case
EventSortingKey
::
kCalls
:
sorted_domain
=
"number of calls"
;
sorted_domain
=
"number of calls"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
calls
>
b
.
calls
;
return
a
.
calls
>
b
.
calls
;
};
};
break
;
break
;
case
EventSortingKey
::
kTotal
:
case
EventSortingKey
::
kTotal
:
sorted_domain
=
"total time"
;
sorted_domain
=
"total time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
total_time
>
b
.
total_time
;
return
a
.
total_time
>
b
.
total_time
;
};
};
break
;
break
;
case
EventSortingKey
::
kMin
:
case
EventSortingKey
::
kMin
:
sorted_domain
=
"minimum time"
;
sorted_domain
=
"minimum time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
min_time
>
b
.
min_time
;
return
a
.
min_time
>
b
.
min_time
;
};
};
break
;
break
;
case
EventSortingKey
::
kMax
:
case
EventSortingKey
::
kMax
:
sorted_domain
=
"maximum time"
;
sorted_domain
=
"maximum time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
max_time
>
b
.
max_time
;
return
a
.
max_time
>
b
.
max_time
;
};
};
break
;
break
;
case
EventSortingKey
::
kAve
:
case
EventSortingKey
::
kAve
:
sorted_domain
=
"average time"
;
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
ave_time
>
b
.
ave_time
;
return
a
.
ave_time
>
b
.
ave_time
;
};
};
break
;
break
;
case
EventSortingKey
::
kGPUTime
:
case
EventSortingKey
::
kGPUTime
:
sorted_domain
=
"average time"
;
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
gpu_time
>
b
.
gpu_time
;
return
a
.
gpu_time
>
b
.
gpu_time
;
};
};
break
;
break
;
case
EventSortingKey
::
kCPUTime
:
case
EventSortingKey
::
kCPUTime
:
sorted_domain
=
"average time"
;
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
cpu_time
>
b
.
cpu_time
;
return
a
.
cpu_time
>
b
.
cpu_time
;
};
};
break
;
break
;
...
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
...
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain
=
"event first end time"
;
sorted_domain
=
"event first end time"
;
}
}
const
std
::
vector
<
std
::
vector
<
Event
>>
*
analyze_events
;
const
std
::
vector
<
std
::
vector
<
Event
>>
*
analyze_events
;
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
if
(
merge_thread
)
{
if
(
merge_thread
)
{
std
::
vector
<
Event
>
merged_events
;
std
::
vector
<
Event
>
merged_events
;
...
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
...
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
}
}
}
// average time
// average time
for
(
auto
&
item
:
event_items
)
{
for
(
auto
&
item
:
event_items
)
{
item
.
ave_time
=
item
.
total_time
/
item
.
calls
;
item
.
ave_time
=
item
.
total_time
/
item
.
calls
;
item
.
ratio
=
item
.
total_time
/
total
;
item
.
ratio
=
item
.
total_time
/
total
;
}
}
...
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
...
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread
);
merge_thread
);
}
}
// Running totals for one (place, annotation) entry of the memory report.
// Every counter starts at zero.
struct MemoryProfierReport {
  size_t alloc_times = 0;  // number of allocation events counted
  size_t alloc_size = 0;   // sum of the sizes of those allocations
  size_t free_times = 0;   // number of free events counted
  size_t free_size = 0;    // sum of the sizes of those frees
};
// Print results
void
PrintMemProfiler
(
const
std
::
map
<
Place
,
std
::
unordered_map
<
std
::
string
,
MemoryProfierReport
>>
&
annotation_report
,
const
size_t
name_width
,
const
size_t
data_width
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
<<
" Memory Profiling Report "
<<
"<-------------------------
\n\n
"
;
// Output events table
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
"Event"
<<
std
::
setw
(
data_width
)
<<
"Alloc Calls"
<<
std
::
setw
(
data_width
)
<<
"Size(MB)"
<<
std
::
setw
(
data_width
)
<<
"Free Calls"
<<
std
::
setw
(
data_width
)
<<
"Size(MB)"
<<
std
::
endl
;
for
(
auto
&
tmp
:
annotation_report
)
{
for
(
auto
&
e
:
tmp
.
second
)
{
auto
event_name
=
string
::
Sprintf
(
"%s:%s"
,
tmp
.
first
,
e
.
first
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_name
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
alloc_times
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
alloc_size
/
(
1024.0
*
1024.0
);
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
free_times
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
free_size
/
(
1024.0
*
1024.0
)
<<
std
::
endl
;
}
}
std
::
cout
<<
std
::
endl
;
}
// parse memory events
void
ParseMemEvents
(
const
std
::
vector
<
std
::
vector
<
MemEvent
>>
&
events
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// place, annotation, alloc times, alloc size
std
::
map
<
Place
,
std
::
unordered_map
<
std
::
string
,
MemoryProfierReport
>>
annotation_report
;
for
(
auto
&
tmp
:
events
)
{
for
(
auto
&
e
:
tmp
)
{
if
(
e
.
type
()
==
EventType
::
kPushRange
)
{
annotation_report
[
e
.
place
()][
e
.
annotation
()].
alloc_times
+=
1
;
annotation_report
[
e
.
place
()][
e
.
annotation
()].
alloc_size
+=
e
.
bytes
();
}
else
if
(
e
.
type
()
==
EventType
::
kPopRange
)
{
annotation_report
[
e
.
place
()][
e
.
annotation
()].
free_times
+=
1
;
annotation_report
[
e
.
place
()][
e
.
annotation
()].
free_size
+=
e
.
bytes
();
}
}
}
PrintMemProfiler
(
annotation_report
,
55
,
18
);
}
void
DisableProfiler
(
EventSortingKey
sorted_key
,
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
)
{
const
std
::
string
&
profile_path
)
{
SynchronizeAllDevice
();
SynchronizeAllDevice
();
MemEvenRecorder
::
Instance
().
Flush
();
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// Mark the profiling stop.
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
);
Mark
(
"_stop_profiler_"
);
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
->
IsEnabled
())
{
if
(
tracer
->
IsEnabled
())
{
tracer
->
Disable
();
tracer
->
Disable
();
tracer
->
GenProfile
(
profile_path
);
tracer
->
GenProfile
(
profile_path
);
...
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
...
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
true
,
sorted_key
);
ParseEvents
(
all_events
,
true
,
sorted_key
);
ParseEvents
(
all_events
,
false
,
sorted_key
);
ParseEvents
(
all_events
,
false
,
sorted_key
);
if
(
VLOG_IS_ON
(
5
))
{
std
::
vector
<
std
::
vector
<
MemEvent
>>
all_mem_events
=
GetMemEvents
();
ParseMemEvents
(
all_mem_events
);
}
ResetProfiler
();
ResetProfiler
();
g_state
=
ProfilerState
::
kDisabled
;
g_state
=
ProfilerState
::
kDisabled
;
should_send_profile_state
=
true
;
should_send_profile_state
=
true
;
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
09799566
...
@@ -15,10 +15,17 @@ limitations under the License. */
...
@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once
#pragma once
#include <forward_list>
#include <forward_list>
#include <list>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#endif
...
@@ -34,8 +41,41 @@ enum ProfilerState {
...
@@ -34,8 +41,41 @@ enum ProfilerState {
void
Mark
(
const
std
::
string
&
name
);
void
Mark
(
const
std
::
string
&
name
);
Event
*
PushEvent
(
const
std
::
string
&
name
);
void
PushMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
);
void
PopMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
);
// Recorder for memory events, reached through the single static instance
// returned by Instance(). Only declarations are visible here; the member
// functions are defined elsewhere.
// NOTE(review): the name looks like a typo of "MemEventRecorder"; it is
// kept as-is because callers refer to it by this name.
struct MemEvenRecorder {
public:
  // Presumably registers an allocation of `size` bytes at address `ptr` on
  // `place` -- confirm against the definition.
  void PushMemRecord(const void *ptr, const Place &place, size_t size);
  // Presumably marks the allocation at `ptr` on `place` as freed -- confirm
  // against the definition.
  void PopMemRecord(const void *ptr, const Place &place);
  // Called by DisableProfiler() before the report is generated.
  // NOTE(review): presumably finalizes records that are still outstanding
  // -- confirm against the definition.
  void Flush();
  // Returns the one static recorder instance.
  static MemEvenRecorder &Instance() { return recorder; }

private:
  // Bookkeeping for a single tracked allocation.
  struct RecordMemEvent {
    RecordMemEvent(const Place &place, size_t bytes);
    ~RecordMemEvent();

    Place place_;
    size_t bytes_;
    uint64_t start_ns_;
    uint64_t end_ns_;
    // NOTE(review): presumably the names of the profiler events active at
    // allocation time and at free time -- confirm against the definition.
    std::string alloc_in_;
    std::string free_in_;
  };

  static MemEvenRecorder recorder;
  // Tracked allocations, keyed first by place and then by address.
  std::map<Place,
           std::unordered_map<const void *, std::unique_ptr<RecordMemEvent>>>
      address_memevent_;
  // NOTE(review): presumably guards address_memevent_ -- confirm.
  std::mutex mtx_;
  // Private constructor: instances are only obtained through Instance().
  MemEvenRecorder() {}
  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
};
Event
*
PushEvent
(
const
std
::
string
&
name
);
void
PopEvent
(
const
std
::
string
&
name
);
void
PopEvent
(
const
std
::
string
&
name
);
struct
RecordEvent
{
struct
RecordEvent
{
...
@@ -87,6 +127,41 @@ enum EventSortingKey {
...
@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime
kGPUTime
};
};
// Append-only store for profiler events of type T.
//
// Events live in fixed-capacity blocks (std::vector<T> reserved to kNumBlock
// elements) linked in a std::forward_list with the newest block at the
// front. A block is never grown past its reserved capacity, so a pointer
// returned by Record() stays valid until Reduce() or Clear() is called.
template <typename T>
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  // Byte budget of one block.
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(T);
  constexpr static size_t kEventAlign = alignof(T);
  // Events per block: the byte budget divided by sizeof(T) rounded up to a
  // multiple of alignof(T).
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  // With a zero capacity the "block is full" test in Record() would only be
  // true for a brand-new block, so one vector would grow without bound and
  // reallocation would invalidate previously returned pointers.
  static_assert(kNumBlock > 0,
                "EventList<T>: T is too large to fit in one event block");

  // Constructs a new event in place from `args` and returns a pointer to it.
  // Starts a new block when there is none yet or the newest one is full.
  template <typename... Args>
  T* Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
    return &event_blocks.front().back();
  }

  // Moves all recorded events into a single vector, oldest first, and
  // leaves the list empty. Pointers obtained from Record() are invalid
  // afterwards.
  std::vector<T> Reduce() {
    size_t total_events = 0;
    for (const auto& block : event_blocks) {
      total_events += block.size();
    }

    std::vector<T> result;
    result.reserve(total_events);

    // Blocks are stored newest-first. Reversing the list once lets each
    // block be appended at the end, instead of inserting at the front of
    // `result` and re-shifting everything already collected for every block.
    event_blocks.reverse();
    for (auto& block : event_blocks) {
      result.insert(result.end(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  // Discards all recorded events.
  void Clear() { event_blocks.clear(); }

  // Newest block first; every block holds at most kNumBlock events.
  std::forward_list<std::vector<T>> event_blocks;
};
// Enable the profiling function.
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
void
EnableProfiler
(
ProfilerState
state
);
...
...
paddle/fluid/platform/profiler.proto
浏览文件 @
09799566
...
@@ -34,8 +34,25 @@ message Event {
...
@@ -34,8 +34,25 @@ message Event {
optional
string
detail_info
=
9
;
optional
string
detail_info
=
9
;
}
}
// One memory record in a serialized profile.
// tools/timeline.py turns each record into two counter samples: +bytes at
// start_ns and -bytes at end_ns.
message MemEvent {
  // Kind of place the memory belongs to.
  enum Place {
    CUDAPlace = 0;
    CPUPlace = 1;
    CUDAPinnedPlace = 2;
  }
  // NOTE(review): presumably nanosecond timestamps of the allocation and of
  // the matching free -- confirm against the producer.
  optional uint64 start_ns = 1;
  optional uint64 end_ns = 2;
  // Size of the record; presumably in bytes, as the name suggests.
  optional uint64 bytes = 3;
  optional Place place = 4;
  optional uint64 thread_id = 5;
  // Together with `place`, selects the "memory usage on ..." row that
  // tools/timeline.py assigns the record to.
  optional uint32 device_id = 6;
  // NOTE(review): presumably the names of the profiler events active when
  // the memory was allocated / freed -- confirm against the producer.
  optional string alloc_in = 7;
  optional string free_in = 8;
}
message
Profile
{
message
Profile
{
repeated
Event
events
=
1
;
repeated
Event
events
=
1
;
optional
uint64
start_ns
=
2
;
optional
uint64
start_ns
=
2
;
optional
uint64
end_ns
=
3
;
optional
uint64
end_ns
=
3
;
repeated
MemEvent
mem_events
=
4
;
}
}
\ No newline at end of file
tools/timeline.py
浏览文件 @
09799566
...
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
...
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
event
[
'args'
]
=
args
event
[
'args'
]
=
args
self
.
_events
.
append
(
event
)
self
.
_events
.
append
(
event
)
def emit_counter(self, category, name, pid, timestamp, counter, value):
    """Appends one counter ('C' phase) record to the trace.

    The record is built with self._create_event using a fixed thread id
    of 0; its 'args' entry maps `counter` to `value`.

    Args:
        category: The event category as string
        name: The event name as string
        pid: Identifier of the process generating this event as integer
        timestamp: The timestamp of this event as long integer
        counter: Name of the counter series; used as the key in 'args'
        value: Value of the counter as integer
    """
    counter_event = self._create_event('C', category, name, pid, 0,
                                       timestamp)
    counter_event['args'] = {counter: value}
    self._events.append(counter_event)
def
format_to_string
(
self
,
pretty
=
False
):
def
format_to_string
(
self
,
pretty
=
False
):
"""Formats the chrome trace to a string.
"""Formats the chrome trace to a string.
...
@@ -117,6 +133,7 @@ class Timeline(object):
...
@@ -117,6 +133,7 @@ class Timeline(object):
self
.
_profile_dict
=
profile_dict
self
.
_profile_dict
=
profile_dict
self
.
_pid
=
0
self
.
_pid
=
0
self
.
_devices
=
dict
()
self
.
_devices
=
dict
()
self
.
_mem_devices
=
dict
()
self
.
_chrome_trace
=
_ChromeTraceFormatter
()
self
.
_chrome_trace
=
_ChromeTraceFormatter
()
def
_allocate_pid
(
self
):
def
_allocate_pid
(
self
):
...
@@ -143,6 +160,45 @@ class Timeline(object):
...
@@ -143,6 +160,45 @@ class Timeline(object):
self
.
_devices
[(
k
,
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_devices
[(
k
,
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"%s:gpu:%d"
%
self
.
_chrome_trace
.
emit_pid
(
"%s:gpu:%d"
%
(
k
,
event
.
device_id
),
pid
)
(
k
,
event
.
device_id
),
pid
)
for
mevent
in
profile_pb
.
mem_events
:
if
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CUDAPlace
:
if
(
k
,
mevent
.
device_id
,
"GPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"GPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:gpu:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
elif
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CPUPlace
:
if
(
k
,
mevent
.
device_id
,
"CPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cpu:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
elif
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CUDAPinnedPlace
:
if
(
k
,
mevent
.
device_id
,
"CUDAPinnedPlace"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"CUDAPinnedPlace"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cudapinnedplace:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
if
(
k
,
0
,
"CPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cpu:%d"
%
(
k
,
0
),
pid
)
if
(
k
,
0
,
"GPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"GPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:gpu:%d"
%
(
k
,
0
),
pid
)
if
(
k
,
0
,
"CUDAPinnedPlace"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"CUDAPinnedPlace"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cudapinnedplace:%d"
%
(
k
,
0
),
pid
)
def
_allocate_events
(
self
):
def
_allocate_events
(
self
):
for
k
,
profile_pb
in
six
.
iteritems
(
self
.
_profile_dict
):
for
k
,
profile_pb
in
six
.
iteritems
(
self
.
_profile_dict
):
...
@@ -163,9 +219,57 @@ class Timeline(object):
...
@@ -163,9 +219,57 @@ class Timeline(object):
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
def _allocate_memory_event(self):
    """Emits "Memory" counter samples for every profile's mem_events.

    Each mem_event contributes two samples: +bytes at start_ns and -bytes
    at end_ns. Samples are sorted by time and folded into a running total;
    one counter record is emitted per distinct timestamp, using the pid of
    the last sample at that timestamp.

    Raises:
        KeyError: if a mem_event's (profile key, device_id, place) has no
            pid registered in self._mem_devices.
    """
    place_to_str = {
        profiler_pb2.MemEvent.CPUPlace: "CPU",
        profiler_pb2.MemEvent.CUDAPlace: "GPU",
        profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
    }
    for k, profile_pb in six.iteritems(self._profile_dict):
        mem_list = []
        for mevent in profile_pb.mem_events:
            place = place_to_str.get(mevent.place, "UnDefine")
            pid = self._mem_devices[(k, mevent.device_id, place)]
            # One sample when the memory is taken, one when it is released.
            for time_ns, delta in ((mevent.start_ns, mevent.bytes),
                                   (mevent.end_ns, -mevent.bytes)):
                mem_list.append({
                    'time': time_ns,
                    'size': delta,
                    'place': place,
                    'pid': pid,
                    'thread_id': mevent.thread_id,
                    'device_id': mevent.device_id
                })

        # list.sort is stable, so samples sharing a timestamp keep their
        # insertion order.
        mem_list.sort(key=lambda sample: sample['time'])

        # NOTE(review): the running total is shared by all pids of this
        # profile, so each device's counter shows the combined usage --
        # confirm this is intended.
        total_size = 0
        for idx, sample in enumerate(mem_list):
            total_size += sample['size']
            is_last_at_time = (idx + 1 == len(mem_list) or
                               mem_list[idx + 1]['time'] != sample['time'])
            if is_last_at_time:
                self._chrome_trace.emit_counter("Memory", "Memory",
                                                sample['pid'],
                                                sample['time'], 0,
                                                total_size)
def
generate_chrome_trace
(
self
):
def
generate_chrome_trace
(
self
):
self
.
_allocate_pids
()
self
.
_allocate_pids
()
self
.
_allocate_events
()
self
.
_allocate_events
()
self
.
_allocate_memory_event
()
return
self
.
_chrome_trace
.
format_to_string
()
return
self
.
_chrome_trace
.
format_to_string
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录