Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
a51c3272
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a51c3272
编写于
2月 11, 2019
作者:
D
Dun
提交者:
GitHub
2月 11, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #15664 from cjld/my_checkpoint
Fix Pr #15296 async buffered reader
上级
bec68fa0
ceec1356
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
68 addition
and
6 deletion
+68
-6
paddle/fluid/memory/allocation/legacy_allocator.cc
paddle/fluid/memory/allocation/legacy_allocator.cc
+1
-1
paddle/fluid/memory/allocation/pinned_allocator.cc
paddle/fluid/memory/allocation/pinned_allocator.cc
+1
-1
paddle/fluid/memory/allocation/pinned_allocator.h
paddle/fluid/memory/allocation/pinned_allocator.h
+1
-1
paddle/fluid/memory/detail/system_allocator.cc
paddle/fluid/memory/detail/system_allocator.cc
+2
-2
paddle/fluid/operators/reader/buffered_reader.cc
paddle/fluid/operators/reader/buffered_reader.cc
+55
-1
paddle/fluid/operators/reader/buffered_reader.h
paddle/fluid/operators/reader/buffered_reader.h
+8
-0
未找到文件。
paddle/fluid/memory/allocation/legacy_allocator.cc
浏览文件 @
a51c3272
...
@@ -257,7 +257,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
...
@@ -257,7 +257,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void
*
ptr
=
buddy_allocator
->
Alloc
(
size
);
void
*
ptr
=
buddy_allocator
->
Alloc
(
size
);
if
(
ptr
==
nullptr
)
{
if
(
ptr
==
nullptr
)
{
LOG
(
WARNING
)
<<
"cuda
MallocHost
Cannot allocate "
<<
size
LOG
(
WARNING
)
<<
"cuda
HostAlloc
Cannot allocate "
<<
size
<<
" bytes in CUDAPinnedPlace"
;
<<
" bytes in CUDAPinnedPlace"
;
}
}
if
(
FLAGS_init_allocated_mem
)
{
if
(
FLAGS_init_allocated_mem
)
{
...
...
paddle/fluid/memory/allocation/pinned_allocator.cc
浏览文件 @
a51c3272
...
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
...
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
// "CPUPinnedAllocator should be used for Cross-Device Communication");
// "CPUPinnedAllocator should be used for Cross-Device Communication");
void
*
ptr
;
void
*
ptr
;
PADDLE_ENFORCE
(
cuda
MallocHost
(
&
ptr
,
siz
e
));
PADDLE_ENFORCE
(
cuda
HostAlloc
(
&
ptr
,
size
,
cudaHostAllocPortabl
e
));
return
new
CPUPinnedAllocation
(
ptr
,
size
);
return
new
CPUPinnedAllocation
(
ptr
,
size
);
}
}
}
// namespace allocation
}
// namespace allocation
...
...
paddle/fluid/memory/allocation/pinned_allocator.h
浏览文件 @
a51c3272
...
@@ -19,7 +19,7 @@ namespace paddle {
...
@@ -19,7 +19,7 @@ namespace paddle {
namespace
memory
{
namespace
memory
{
namespace
allocation
{
namespace
allocation
{
// Allocator uses `cuda
MallocHost
`
// Allocator uses `cuda
HostAlloc
`
class
CPUPinnedAllocation
:
public
Allocation
{
class
CPUPinnedAllocation
:
public
Allocation
{
public:
public:
CPUPinnedAllocation
(
void
*
ptr
,
size_t
size
)
CPUPinnedAllocation
(
void
*
ptr
,
size_t
size
)
...
...
paddle/fluid/memory/detail/system_allocator.cc
浏览文件 @
a51c3272
...
@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
...
@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void
*
p
;
void
*
p
;
// PINNED memory is visible to all CUDA contexts.
// PINNED memory is visible to all CUDA contexts.
cudaError_t
result
=
cuda
MallocHost
(
&
p
,
siz
e
);
cudaError_t
result
=
cuda
HostAlloc
(
&
p
,
size
,
cudaHostAllocPortabl
e
);
if
(
result
==
cudaSuccess
)
{
if
(
result
==
cudaSuccess
)
{
*
index
=
1
;
// PINNED memory
*
index
=
1
;
// PINNED memory
cuda_pinnd_alloc_size_
+=
size
;
cuda_pinnd_alloc_size_
+=
size
;
return
p
;
return
p
;
}
else
{
}
else
{
LOG
(
WARNING
)
<<
"cuda
MallocHost
failed."
;
LOG
(
WARNING
)
<<
"cuda
HostAlloc
failed."
;
return
nullptr
;
return
nullptr
;
}
}
...
...
paddle/fluid/operators/reader/buffered_reader.cc
浏览文件 @
a51c3272
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <vector>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() {
...
@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() {
position_
.
front
().
wait
();
position_
.
front
().
wait
();
position_
.
pop
();
position_
.
pop
();
}
}
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
PADDLE_ENFORCE
(
cudaStreamDestroy
(
stream
));
for
(
auto
&
event
:
events
)
PADDLE_ENFORCE
(
cudaEventDestroy
(
event
));
}
#endif
}
}
BufferedReader
::
BufferedReader
(
BufferedReader
::
BufferedReader
(
...
@@ -33,6 +41,19 @@ BufferedReader::BufferedReader(
...
@@ -33,6 +41,19 @@ BufferedReader::BufferedReader(
thread_pool_
(
1
),
thread_pool_
(
1
),
place_
(
place
),
place_
(
place
),
buffer_size_
(
buffer_size
)
{
buffer_size_
(
buffer_size
)
{
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
compute_stream
=
((
platform
::
CUDADeviceContext
*
)(
platform
::
DeviceContextPool
::
Instance
()
.
Get
(
place_
)))
->
stream
();
events
.
resize
(
buffer_size
);
for
(
auto
&
event
:
events
)
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
PADDLE_ENFORCE
(
cudaStreamCreateWithFlags
(
&
stream
,
cudaStreamNonBlocking
));
}
#endif
cpu_buffer_
.
resize
(
buffer_size
);
cpu_buffer_
.
resize
(
buffer_size
);
gpu_buffer_
.
resize
(
buffer_size
);
gpu_buffer_
.
resize
(
buffer_size
);
ReadTillBufferFullAsync
();
ReadTillBufferFullAsync
();
...
@@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() {
...
@@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() {
}
}
void
BufferedReader
::
ReadAsync
(
size_t
i
)
{
void
BufferedReader
::
ReadAsync
(
size_t
i
)
{
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
PADDLE_ENFORCE
(
cudaEventRecord
(
events
[
i
],
compute_stream
));
}
#endif
position_
.
emplace
(
thread_pool_
.
enqueue
([
this
,
i
]()
->
size_t
{
position_
.
emplace
(
thread_pool_
.
enqueue
([
this
,
i
]()
->
size_t
{
TensorVec
&
cpu
=
cpu_buffer_
[
i
];
TensorVec
&
cpu
=
cpu_buffer_
[
i
];
reader_
->
ReadNext
(
&
cpu
);
reader_
->
ReadNext
(
&
cpu
);
...
@@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) {
...
@@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) {
return
-
1UL
;
return
-
1UL
;
}
}
#ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream
if
(
platform
::
is_gpu_place
(
place_
))
{
if
(
platform
::
is_gpu_place
(
place_
))
{
platform
::
SetDeviceId
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
).
device
);
PADDLE_ENFORCE
(
cudaStreamWaitEvent
(
stream
,
events
[
i
],
0
));
TensorVec
&
gpu
=
gpu_buffer_
[
i
];
TensorVec
&
gpu
=
gpu_buffer_
[
i
];
gpu
.
resize
(
cpu
.
size
());
gpu
.
resize
(
cpu
.
size
());
for
(
size_t
i
=
0
;
i
<
cpu
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
cpu
.
size
();
++
i
)
{
framework
::
TensorCopySync
(
cpu
[
i
],
place_
,
&
gpu
[
i
]);
gpu
[
i
].
Resize
(
cpu
[
i
].
dims
());
gpu
[
i
].
set_layout
(
cpu
[
i
].
layout
());
auto
cpu_place
=
cpu
[
i
].
place
();
auto
cpu_ptr
=
cpu
[
i
].
data
<
void
>
();
auto
gpu_ptr
=
gpu
[
i
].
mutable_data
(
place_
,
cpu
[
i
].
type
());
auto
size
=
cpu
[
i
].
numel
()
*
paddle
::
framework
::
SizeOfType
(
cpu
[
i
].
type
());
if
(
platform
::
is_cuda_pinned_place
(
cpu_place
))
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
else
if
((
platform
::
is_gpu_place
(
cpu_place
)))
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CUDAPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
stream
);
else
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
gpu_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
cpu_place
),
cpu_ptr
,
size
,
0
);
gpu
[
i
].
set_lod
(
cpu
[
i
].
lod
());
gpu
[
i
].
set_lod
(
cpu
[
i
].
lod
());
}
}
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
}
}
#endif
return
i
;
return
i
;
}));
}));
}
}
...
...
paddle/fluid/operators/reader/buffered_reader.h
浏览文件 @
a51c3272
...
@@ -19,6 +19,9 @@
...
@@ -19,6 +19,9 @@
#include <vector>
#include <vector>
#include "ThreadPool.h"
#include "ThreadPool.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader {
...
@@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader {
std
::
vector
<
TensorVec
>
cpu_buffer_
;
std
::
vector
<
TensorVec
>
cpu_buffer_
;
std
::
vector
<
TensorVec
>
gpu_buffer_
;
std
::
vector
<
TensorVec
>
gpu_buffer_
;
size_t
prev_pos_
{
-
1UL
};
size_t
prev_pos_
{
-
1UL
};
#ifdef PADDLE_WITH_CUDA
cudaStream_t
stream
;
cudaStream_t
compute_stream
;
std
::
vector
<
cudaEvent_t
>
events
;
#endif
};
};
}
// namespace reader
}
// namespace reader
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录