Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
7235fd66
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7235fd66
编写于
3月 01, 2019
作者:
C
chengduo
提交者:
GitHub
3月 01, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add Event for TensorCopy (#15953)
Add Event for TensorCopy
上级
46c5e378
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
111 addition
and
23 deletion
+111
-23
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+2
-2
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+7
-0
paddle/fluid/memory/CMakeLists.txt
paddle/fluid/memory/CMakeLists.txt
+1
-1
paddle/fluid/memory/memcpy.cc
paddle/fluid/memory/memcpy.cc
+20
-0
paddle/fluid/operators/reader/buffered_reader.cc
paddle/fluid/operators/reader/buffered_reader.cc
+14
-9
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+54
-9
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+12
-1
tools/timeline.py
tools/timeline.py
+1
-1
未找到文件。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
7235fd66
...
@@ -38,10 +38,10 @@ if(WITH_GPU)
...
@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library
(
tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context
)
nv_library
(
tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context
)
add_dependencies
(
tensor tensor_util
)
add_dependencies
(
tensor tensor_util
)
else
()
else
()
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
)
nv_library
(
tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context
profiler
)
endif
(
WIN32
)
endif
(
WIN32
)
else
()
else
()
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
)
cc_library
(
tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context
profiler
)
endif
()
endif
()
cc_test
(
tensor_test SRCS tensor_test.cc DEPS tensor
)
cc_test
(
tensor_test SRCS tensor_test.cc DEPS tensor
)
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
7235fd66
...
@@ -14,8 +14,11 @@
...
@@ -14,8 +14,11 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <algorithm>
#include <limits>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
...
@@ -135,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
// NOLINT
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
platform
::
is_cpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:GPU->CPU"
);
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:CPU->GPU"
);
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:GPU->GPU"
);
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
<<
dst_place
;
...
@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
...
@@ -155,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cuda_pinned_place
(
src_place
)
&&
}
else
if
(
platform
::
is_cuda_pinned_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
RecordEvent
record_event
(
"TensorCopy:CUDAPinned->GPU"
);
auto
src_pinned_place
=
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
src_place
);
auto
src_pinned_place
=
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_pinned_place
,
src_ptr
,
size
,
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_pinned_place
,
src_ptr
,
size
,
...
...
paddle/fluid/memory/CMakeLists.txt
浏览文件 @
7235fd66
add_subdirectory
(
detail
)
add_subdirectory
(
detail
)
add_subdirectory
(
allocation
)
add_subdirectory
(
allocation
)
cc_library
(
malloc SRCS malloc.cc DEPS place enforce allocator_facade
)
cc_library
(
malloc SRCS malloc.cc DEPS place enforce allocator_facade
profiler
)
cc_library
(
memcpy SRCS memcpy.cc DEPS place
)
cc_library
(
memcpy SRCS memcpy.cc DEPS place
)
cc_library
(
memory
cc_library
(
memory
...
...
paddle/fluid/memory/memcpy.cc
浏览文件 @
7235fd66
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
namespace
memory
{
namespace
memory
{
...
@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
...
@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
static
constexpr
size_t
kMaxGpuAsyncCopyBytes
=
64
*
1024
;
// 64K
static
constexpr
size_t
kMaxGpuAsyncCopyBytes
=
64
*
1024
;
// 64K
// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
// because GpuMemcpySync issues the copying command to the default stream,
// which will make two commands from different streams cannot run concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template
<
>
template
<
>
void
Copy
<
platform
::
CPUPlace
,
platform
::
CUDAPlace
>
(
void
Copy
<
platform
::
CPUPlace
,
platform
::
CUDAPlace
>
(
platform
::
CPUPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
platform
::
CPUPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:GPU->CPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:GPU->CPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
// FIXME(zjl): do we really need it?
// FIXME(zjl): do we really need it?
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
...
@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
...
@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
const
void
*
src
,
size_t
num
,
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
dst_place
.
device
);
platform
::
SetDeviceId
(
dst_place
.
device
);
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:CPU->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:CPU->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
// FIXME(zjl): do we really need it?
// FIXME(zjl): do we really need it?
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
if
(
num
<=
kMaxGpuAsyncCopyBytes
)
{
...
@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
...
@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if
(
dst_place
==
src_place
)
{
if
(
dst_place
==
src_place
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync(same_gpu):GPU->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
,
stream
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync(same_gpu):GPU->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToDevice
);
}
}
}
else
{
}
else
{
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyPeerAsync:GPU->GPU"
);
platform
::
GpuMemcpyPeerAsync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
platform
::
GpuMemcpyPeerAsync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
num
,
stream
);
num
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyPeerSync:GPU->GPU"
);
platform
::
GpuMemcpyPeerSync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
platform
::
GpuMemcpyPeerSync
(
dst
,
dst_place
.
device
,
src
,
src_place
.
device
,
num
);
num
);
}
}
...
@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
...
@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t
stream
)
{
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
src_place
.
device
);
platform
::
SetDeviceId
(
src_place
.
device
);
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:GPU->CUDAPinned"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:GPU->CUDAPinned"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyDeviceToHost
);
}
}
}
}
...
@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
...
@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t
stream
)
{
cudaStream_t
stream
)
{
platform
::
SetDeviceId
(
dst_place
.
device
);
platform
::
SetDeviceId
(
dst_place
.
device
);
if
(
stream
)
{
if
(
stream
)
{
platform
::
RecordEvent
record_event
(
"GpuMemcpyAsync:CUDAPinned->GPU"
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
platform
::
GpuMemcpyAsync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
,
stream
);
}
else
{
}
else
{
platform
::
RecordEvent
record_event
(
"GpuMemcpySync:CUDAPinned->GPU"
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
platform
::
GpuMemcpySync
(
dst
,
src
,
num
,
cudaMemcpyHostToDevice
);
}
}
}
}
...
...
paddle/fluid/operators/reader/buffered_reader.cc
浏览文件 @
7235fd66
...
@@ -13,9 +13,11 @@
...
@@ -13,9 +13,11 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <memory>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
reader
{
namespace
reader
{
...
@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
...
@@ -49,9 +51,10 @@ BufferedReader::BufferedReader(
.
Get
(
place_
)))
.
Get
(
place_
)))
->
stream
();
->
stream
();
events
.
resize
(
buffer_size
);
events
.
resize
(
buffer_size
);
for
(
auto
&
event
:
events
)
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream
));
for
(
auto
&
event
:
events
)
{
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
PADDLE_ENFORCE
(
cudaStreamCreateWithFlags
(
&
stream
,
cudaStreamNonBlocking
));
}
}
}
#endif
#endif
cpu_buffer_
.
resize
(
buffer_size
);
cpu_buffer_
.
resize
(
buffer_size
);
...
@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
...
@@ -83,12 +86,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
// TensorCopySync would block other stream, because TensorCopySync
// issues the copying command to the default stream, it will make two
// commands from different streams cannot run concurrently.
if
(
platform
::
is_gpu_place
(
place_
))
{
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
PADDLE_ENFORCE
(
cudaStreamWaitEvent
(
stream
,
events
[
i
],
0
));
PADDLE_ENFORCE
(
cudaStreamWaitEvent
(
stream
,
events
[
i
],
0
));
TensorVec
&
gpu
=
gpu_buffer_
[
i
];
TensorVec
&
gpu
=
gpu_buffer_
[
i
];
gpu
.
resize
(
cpu
.
size
());
gpu
.
resize
(
cpu
.
size
());
platform
::
RecordEvent
record_event
(
"BufferedReader:MemoryCopy"
);
for
(
size_t
i
=
0
;
i
<
cpu
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
cpu
.
size
();
++
i
)
{
gpu
[
i
].
Resize
(
cpu
[
i
].
dims
());
gpu
[
i
].
Resize
(
cpu
[
i
].
dims
());
gpu
[
i
].
set_layout
(
cpu
[
i
].
layout
());
gpu
[
i
].
set_layout
(
cpu
[
i
].
layout
());
...
@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
...
@@ -97,20 +103,19 @@ void BufferedReader::ReadAsync(size_t i) {
auto
gpu_ptr
=
gpu
[
i
].
mutable_data
(
place_
,
cpu
[
i
].
type
());
auto
gpu_ptr
=
gpu
[
i
].
mutable_data
(
place_
,
cpu
[
i
].
type
());
auto
size
=
auto
size
=
cpu
[
i
].
numel
()
*
paddle
::
framework
::
SizeOfType
(
cpu
[
i
].
type
());
cpu
[
i
].
numel
()
*
paddle
::
framework
::
SizeOfType
(
cpu
[
i
].
type
());
if
(
platform
::
is_cuda_pinned_place
(
cpu_place
))
if
(
platform
::
is_cuda_pinned_place
(
cpu_place
))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
cpu_place
),
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
cpu_ptr
,
size
,
stream
);
else
if
((
platform
::
is_gpu_place
(
cpu_place
)))
}
else
if
((
platform
::
is_gpu_place
(
cpu_place
)))
{
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPlace
>
(
cpu_place
),
cpu_ptr
,
boost
::
get
<
platform
::
CUDAPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
size
,
stream
);
else
}
else
{
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
boost
::
get
<
platform
::
CPUPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
0
);
stream
);
}
gpu
[
i
].
set_lod
(
cpu
[
i
].
lod
());
gpu
[
i
].
set_lod
(
cpu
[
i
].
lod
());
}
}
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
7235fd66
...
@@ -30,7 +30,6 @@ limitations under the License. */
...
@@ -30,7 +30,6 @@ limitations under the License. */
#include "glog/logging.h"
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
...
@@ -222,19 +221,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
}
case
CUPTI_ACTIVITY_KIND_DRIVER
:
{
case
CUPTI_ACTIVITY_KIND_DRIVER
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
{
// -1 device id represents
CUDA
api call
// -1 device id represents
ActiveKind
api call
tracer
->
Add
CPU
Records
(
tracer
->
Add
ActiveKind
Records
(
DriverKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
DriverKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
GetThreadIdFromSystemThreadId
(
api
->
threadId
));
GetThreadIdFromSystemThreadId
(
api
->
threadId
),
api
->
correlationId
);
}
break
;
break
;
}
}
case
CUPTI_ACTIVITY_KIND_RUNTIME
:
{
case
CUPTI_ACTIVITY_KIND_RUNTIME
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
{
tracer
->
AddCPURecords
(
// -1 device id represents ActiveKind api call
tracer
->
AddActiveKindRecords
(
RuntimeKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
RuntimeKind
(
api
->
cbid
),
api
->
start
,
api
->
end
,
-
1
,
GetThreadIdFromSystemThreadId
(
api
->
threadId
));
GetThreadIdFromSystemThreadId
(
api
->
threadId
),
api
->
correlationId
);
}
break
;
break
;
}
}
default:
{
break
;
}
default:
{
break
;
}
...
@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -313,6 +317,25 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id
,
correlation_id
,
bytes
});
stream_id
,
correlation_id
,
bytes
});
}
}
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
{
if
(
anno
.
empty
())
{
VLOG
(
1
)
<<
"Empty timeline annotation."
;
return
;
}
thread_local
std
::
forward_list
<
ActiveKindRecord
>
*
local_active_kind_records
=
nullptr
;
if
(
local_active_kind_records
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
active_kind_records_
.
emplace_front
();
local_active_kind_records
=
&
active_kind_records_
.
front
();
}
// lock is not needed, only one thread call this function.
local_active_kind_records
->
push_front
(
ActiveKindRecord
{
anno
,
start_ns
,
end_ns
,
device_id
,
thread_id
,
correlation_id
});
}
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
int64_t
device_id
,
int64_t
stream_id
,
int64_t
device_id
,
int64_t
stream_id
,
uint32_t
correlation_id
)
{
uint32_t
correlation_id
)
{
...
@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -355,6 +378,7 @@ class DeviceTracerImpl : public DeviceTracer {
}
}
const
std
::
vector
<
int
>
cbids
{
const
std
::
vector
<
int
>
cbids
{
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020
,
...
@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -385,6 +409,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
.
clear
();
correlations_
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
active_kind_records_
)
tmp
.
clear
();
}
}
void
GenEventKernelCudaElapsedTime
()
{
void
GenEventKernelCudaElapsedTime
()
{
...
@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -437,7 +462,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
cpu_records_
)
for
(
auto
&
tmp
:
cpu_records_
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
CPU
);
event
->
set_type
(
proto
::
Event
::
CPU
);
...
@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -447,6 +472,24 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_sub_device_id
(
r
.
thread_id
);
event
->
set_sub_device_id
(
r
.
thread_id
);
event
->
set_device_id
(
r
.
device_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
}
for
(
auto
&
tmp
:
active_kind_records_
)
{
for
(
const
ActiveKindRecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
CPU
);
auto
c
=
correlations_
.
find
(
r
.
correlation_id
);
if
(
c
!=
correlations_
.
end
()
&&
c
->
second
!=
nullptr
)
{
event
->
set_name
(
c
->
second
->
name
());
event
->
set_detail_info
(
r
.
name
);
}
else
{
event
->
set_name
(
r
.
name
);
}
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_sub_device_id
(
r
.
thread_id
);
event
->
set_device_id
(
r
.
device_id
);
}
}
miss
=
find
=
0
;
miss
=
find
=
0
;
for
(
const
MemRecord
&
r
:
mem_records_
)
{
for
(
const
MemRecord
&
r
:
mem_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
auto
*
event
=
profile_pb
.
add_events
();
...
@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
...
@@ -510,6 +553,7 @@ class DeviceTracerImpl : public DeviceTracer {
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
ActiveKindRecord
>>
active_kind_records_
;
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
correlations_pairs
;
correlations_pairs
;
std
::
unordered_map
<
uint32_t
,
Event
*>
correlations_
;
std
::
unordered_map
<
uint32_t
,
Event
*>
correlations_
;
...
@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
...
@@ -613,6 +657,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR
(
cudaUnbindTexture_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaUnbindTexture_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaSetupArgument_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaSetupArgument_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunch_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunch_v3020
);
REGISTER_RUNTIME_CBID_STR
(
cudaDeviceGetPCIBusId_v4010
);
#if CUDA_VERSION >= 9000
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernel_v9000
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernel_v9000
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernelMultiDevice_v9000
);
REGISTER_RUNTIME_CBID_STR
(
cudaLaunchCooperativeKernelMultiDevice_v9000
);
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
7235fd66
...
@@ -63,7 +63,14 @@ class DeviceTracer {
...
@@ -63,7 +63,14 @@ class DeviceTracer {
uint32_t
correlation_id
;
uint32_t
correlation_id
;
uint64_t
bytes
;
uint64_t
bytes
;
};
};
struct
ActiveKindRecord
{
std
::
string
name
;
uint64_t
start_ns
;
uint64_t
end_ns
;
int64_t
device_id
;
int64_t
thread_id
;
uint32_t
correlation_id
;
};
virtual
~
DeviceTracer
()
{}
virtual
~
DeviceTracer
()
{}
// Needs to be called once before use.
// Needs to be called once before use.
virtual
void
Enable
()
=
0
;
virtual
void
Enable
()
=
0
;
...
@@ -85,6 +92,10 @@ class DeviceTracer {
...
@@ -85,6 +92,10 @@ class DeviceTracer {
virtual
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
virtual
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
=
0
;
int64_t
thread_id
)
=
0
;
virtual
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
// added before for human readability.
...
...
tools/timeline.py
浏览文件 @
7235fd66
...
@@ -131,7 +131,7 @@ class Timeline(object):
...
@@ -131,7 +131,7 @@ class Timeline(object):
if
(
k
,
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
if
(
k
,
event
.
device_id
,
"CPU"
)
not
in
self
.
_devices
:
pid
=
self
.
_allocate_pid
()
pid
=
self
.
_allocate_pid
()
self
.
_devices
[(
k
,
event
.
device_id
,
"CPU"
)]
=
pid
self
.
_devices
[(
k
,
event
.
device_id
,
"CPU"
)]
=
pid
# -1 device id represents CUDA
api call
# -1 device id represents CUDA
API(RunTime) call.(e.g. cudaLaunch, cudaMemcpy)
if
event
.
device_id
==
-
1
:
if
event
.
device_id
==
-
1
:
self
.
_chrome_trace
.
emit_pid
(
"%s:cuda_api"
%
k
,
pid
)
self
.
_chrome_trace
.
emit_pid
(
"%s:cuda_api"
%
k
,
pid
)
else
:
else
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录