PaddlePaddle / Paddle, commit b0d12d99 (unverified)

Add GetStream Interface for StreamSafeCUDAAllocator (#38195)

Authored Dec 17, 2021 by From00; committed via GitHub on Dec 17, 2021.
Parent commit: 20b7c99c
Showing 8 changed files with 341 additions and 223 deletions (+341, -223).
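The practical effect of the change: an allocation made through the stream-safe CUDA allocator can now be asked which stream owns it. A minimal usage sketch, assuming a CUDA build with FLAGS_use_stream_safe_cuda_allocator enabled; it is adapted from the test added in this commit and is not itself part of the diff:

    #include "paddle/fluid/memory/malloc.h"

    void Demo() {
      using namespace paddle;
      platform::CUDAPlace place;
      gpuStream_t stream;
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));

      // Allocate against an explicit stream; the allocation remembers it.
      std::shared_ptr<memory::allocation::Allocation> buf =
          memory::AllocShared(place, /*size=*/256, stream);

      // New in this commit: read the owning stream back from the allocation.
      const gpuStream_t& owner = memory::GetStream(buf);  // owner == stream
    }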
Changed files:

paddle/fluid/memory/allocation/CMakeLists.txt                 +1    -1
paddle/fluid/memory/allocation/allocator_facade.cc            +92   -32
paddle/fluid/memory/allocation/allocator_facade.h             +4    -1
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc  +111  -123
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h   +22   -24
paddle/fluid/memory/malloc.cc                                 +7    -1
paddle/fluid/memory/malloc.h                                  +4    -1
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu            +100  -40
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -42,7 +42,7 @@ endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 if (WITH_GPU OR WITH_ROCM)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
   if (CUDA_VERSION GREATER_EQUAL 10.2)
     list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
   endif()
 endif()
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -31,6 +31,7 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -175,12 +176,12 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
     if (FLAGS_use_stream_safe_cuda_allocator) {
+      default_streams_ =
+          std::vector<gpuStream_t>(platform::GetGPUDeviceCount(), nullptr);
       // TODO(Ruibiao): Support multi-stream allocator for other strategies
-      default_stream_ = nullptr;
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
            ++dev_id) {
-        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id),
-                                    default_stream_);
+        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
       }
     } else {
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
@@ -285,15 +286,51 @@ class AllocatorFacadePrivate {
     return stream_it->second;
   }
 
-  gpuStream_t GetDefaultStream() { return default_stream_; }
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) {
+    int dev_id = place.GetDeviceId();
+    gpuStream_t& default_stream = default_streams_[dev_id];
+    if (UNLIKELY(default_stream == nullptr)) {
+      /* NOTE(Ruibiao): Here if we set default_stream by code " default_stream =
+       * platform::stream::get_current_stream(place.GetDeviceId())->raw_stream()
+       * ", then it will fail to make target 'jit_kernel_benchmark', with an
+       * undefined reference to `paddle::platform::DeviceContextPool::Get(
+       * paddle::platform::Place const&)' in function
+       * `paddle::platform::stream::get_current_stream(int)'. However, target
+       * allocator_facade will not be affected. It seems a circular dependency
+       * problem between 'cuda_stream' and 'device_context' causes this
+       * strange bug.
+       */
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      default_stream =
+          static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
+      InitStreamSafeCUDAAllocator(place, default_stream);
+    }
+    return default_stream;
+  }
 
-  void RecordStream(Allocation* allocation, const gpuStream_t& stream) {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(allocation->place()), true,
-        platform::errors::InvalidArgument(
-            "Not allow to record stream for an allocation with place %s",
-            allocation->place()));
-    dynamic_cast<StreamSafeCUDAAllocation*>(allocation)->RecordStream(stream);
+  void RecordStream(std::shared_ptr<Allocation> allocation,
+                    const gpuStream_t& stream) {
+    StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
+        dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
+    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
+                            platform::errors::InvalidArgument(
+                                "Failed to dynamic cast %p from Allocation* to "
+                                "StreamSafeCUDAAllocation*",
+                                allocation.get()));
+    stream_safe_cuda_allocation->RecordStream(stream);
+  }
+
+  const gpuStream_t& GetStream(
+      const std::shared_ptr<Allocation>& allocation) const {
+    const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
+        dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
+    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
+                            platform::errors::InvalidArgument(
+                                "Failed to dynamic cast %p from Allocation* to "
+                                "StreamSafeCUDAAllocation*",
+                                allocation.get()));
+    return stream_safe_cuda_allocation->GetOwningStream();
   }
 
 #ifdef PADDLE_WITH_CUDA
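The default stream for a device is now resolved lazily: the first GetDefaultStream call for a place pulls the stream out of the DeviceContextPool and initializes the stream-safe allocator for that place on the spot. The shape of the pattern, stripped of the surrounding class (editor's sketch, not part of the commit):

    gpuStream_t& slot = default_streams[place.GetDeviceId()];
    if (UNLIKELY(slot == nullptr)) {  // first touch for this device
      auto* ctx = static_cast<platform::CUDADeviceContext*>(
          platform::DeviceContextPool::Instance().Get(place));
      slot = ctx->stream();                      // cache the device stream
      InitStreamSafeCUDAAllocator(place, slot);  // build the allocator once
    }
    return slot;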
@@ -705,7 +742,7 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
-  gpuStream_t default_stream_;
+  std::vector<gpuStream_t> default_streams_;
   SpinLock cuda_allocators_lock_;
 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
@@ -745,8 +782,9 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
   }
 #endif
-  return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
-                          m_->GetDefaultStream());
+  platform::CUDAPlace cuda_place =
+      BOOST_GET_CONST(platform::CUDAPlace, place);
+  return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
 }
 #endif
@@ -769,8 +807,9 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
   }
 #endif
-  return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
-               m_->GetDefaultStream());
+  platform::CUDAPlace cuda_place =
+      BOOST_GET_CONST(platform::CUDAPlace, place);
+  return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
 }
 #endif
@@ -789,8 +828,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
   }
 #endif
-  return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
-                 m_->GetDefaultStream());
+  platform::CUDAPlace cuda_place =
+      BOOST_GET_CONST(platform::CUDAPlace, place);
+  return Release(cuda_place, m_->GetDefaultStream(cuda_place));
 }
 #endif
   return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
@@ -804,9 +844,9 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'AllocaShared' function. "
-          "To enable it, you can enter 'export "
-          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
+          "multi-stream 'AllocaShared' function. To enable it, you can enter "
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
@@ -824,9 +864,9 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloca' function. "
-          "To enable it, you can enter 'export "
-          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
+          "multi-stream 'Alloc' function. To enable it, you can enter "
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
@@ -836,7 +876,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
 #endif
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true)
+    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
     return m_->GetAllocator(place, size)->Allocate(size);
@@ -849,9 +889,9 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Release' function. "
-          "To enable it, you can enter 'export "
-          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
+          "multi-stream 'Release' function. To enable it, you can enter "
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
@@ -863,15 +903,15 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
   return m_->GetAllocator(place, stream)->Release(place);
 }
 
-void AllocatorFacade::RecordStream(Allocation* allocation,
+void AllocatorFacade::RecordStream(std::shared_ptr<Allocation> allocation,
                                    const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'RecordStream' function. "
-          "To enable it, you can enter 'export "
-          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
+          "'RecordStream' function. To enable it, you can enter "
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {

@@ -883,6 +923,26 @@ void AllocatorFacade::RecordStream(Allocation* allocation,
   m_->RecordStream(allocation, stream);
 }
 
+const gpuStream_t& AllocatorFacade::GetStream(
+    const std::shared_ptr<Allocation>& allocation) const {
+  PADDLE_ENFORCE_EQ(
+      FLAGS_use_stream_safe_cuda_allocator, true,
+      platform::errors::Unimplemented(
+          "StreamSafeCUDAAllocator is disabled, you should not call this "
+          "'GetStream' function. To enable it, you can enter"
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
+  }
+#endif
+  return m_->GetStream(allocation);
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   return m_->PrepareMemoryPoolForCUDAGraph(id);
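One use the new facade call enables: code that only holds a shared_ptr<Allocation> can synchronize with whatever stream owns the memory before touching it from the host. A hypothetical helper (the name SyncWithOwningStream is the editor's, not Paddle's):

    void SyncWithOwningStream(const std::shared_ptr<Allocation>& a) {
      // GetStream() throws if FLAGS_use_stream_safe_cuda_allocator is off,
      // or while a CUDA graph is being captured, per the checks above.
      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(
          allocation::AllocatorFacade::Instance().GetStream(a)));
    }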
paddle/fluid/memory/allocation/allocator_facade.h

@@ -64,7 +64,10 @@ class AllocatorFacade {
   AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                       const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
-  void RecordStream(Allocation* allocation, const gpuStream_t& stream);
+  void RecordStream(std::shared_ptr<Allocation> allocation,
+                    const gpuStream_t& stream);
+  const gpuStream_t& GetStream(
+      const std::shared_ptr<Allocation>& allocation) const;
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
   void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
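The header now exposes a matched set of stream-aware entry points. How they compose over one allocation's lifetime, sketched under the same flag assumption as above (stream creation and error checks omitted; not part of the diff):

    auto& facade = allocation::AllocatorFacade::Instance();
    platform::CUDAPlace place(0);

    std::shared_ptr<Allocation> a = facade.AllocShared(place, 4096, stream);
    facade.RecordStream(a, other_stream);  // register a cross-stream consumer
    const gpuStream_t& owner = facade.GetStream(a);  // new; owner == stream
    a.reset();                      // free is deferred while events are live
    facade.Release(place, stream);  // return cached free blocks to the GPU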
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc

@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
-#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -24,36 +23,92 @@ StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
     : Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
                  underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(owning_stream),
-      recorded_streams_(std::make_shared<std::set<gpuStream_t>>()) {}
+      owning_stream_(std::move(owning_stream)) {}
 
-void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
-  VLOG(8) << "Record stream " << stream << " to " << ptr();
+void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
+  VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
+    VLOG(9) << "Record the same stream of " << stream;
     return;
   }
 
-  std::lock_guard<SpinLock> lock_guard(spin_lock_);
-  recorded_streams_->insert(stream);
+  std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
+  gpuEvent_t record_event;
+  auto it = outstanding_event_map_.find(stream);
+  if (it == outstanding_event_map_.end()) {
+    gpuEvent_t new_event;
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
+#endif
+    outstanding_event_map_[stream] = new_event;
+    record_event = new_event;
+    VLOG(9) << "Create a new event " << new_event;
+  } else {
+    record_event = it->second;
+    VLOG(9) << "Reuse event " << record_event;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+#endif
+  VLOG(8) << "Record event " << record_event << " to stream " << stream;
 }
 
-std::shared_ptr<std::set<gpuStream_t>>
-StreamSafeCUDAAllocation::GetRecordedStreams() {
-  return recorded_streams_;
+bool StreamSafeCUDAAllocation::CanBeFreed() {
+  // NOTE(Ruibiao): This function will not execute concurrently,
+  // so outstanding_event_lock_ is not required here
+  for (auto it = outstanding_event_map_.begin();
+       it != outstanding_event_map_.end(); ++it) {
+    gpuEvent_t& event = it->second;
+#ifdef PADDLE_WITH_CUDA
+    gpuError_t err = cudaEventQuery(event);
+    if (err == cudaErrorNotReady) {
+      VLOG(9) << "Event " << event << " for " << ptr() << " is not completed";
+      // Erase the completed event before "it"
+      outstanding_event_map_.erase(outstanding_event_map_.begin(), it);
+      return false;
+    }
+    PADDLE_ENFORCE_GPU_SUCCESS(err);
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event));
+#else
+    gpuError_t err = hipEventQuery(event);
+    if (err == hipErrorNotReady) {
+      VLOG(9) << "Event " << event << " for " << ptr() << " is not completed";
+      // Erase the completed event before "it"
+      outstanding_event_map_.erase(outstanding_event_map_.begin(), it);
+      return false;
+    }
+    PADDLE_ENFORCE_GPU_SUCCESS(err);
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event));
+#endif
+    VLOG(8) << "Destroy event " << event;
+  }
+  return true;
+}
+
+const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
+  return owning_stream_;
 }
 
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
-    const std::shared_ptr<Allocator>& underlying_allocator,
-    const platform::CUDAPlace& place, const gpuStream_t default_stream)
-    : underlying_allocator_(underlying_allocator),
-      place_(place),
-      default_stream_(default_stream) {
-  std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
-  allocators_map_[place].emplace_back(this);
+    std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
+    gpuStream_t default_stream)
+    : underlying_allocator_(std::move(underlying_allocator)),
+      place_(std::move(place)),
+      default_stream_(std::move(default_stream)) {
+  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+  allocator_map_[place].emplace_back(this);
 }
 
 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
+  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators =
-      allocators_map_[place_];
+      allocator_map_[place_];
   allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
                    allocators.end());
 }
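RecordStream now keeps one reusable event per recording stream rather than a set of raw stream handles: the first call for a stream creates an event, and every later call simply re-records it at the stream's current tail. The create-once/record-many pattern in isolation (plain CUDA, editor's sketch; error checking omitted):

    #include <cuda_runtime.h>
    #include <map>

    std::map<cudaStream_t, cudaEvent_t> outstanding_events;

    void record(cudaStream_t stream) {
      auto it = outstanding_events.find(stream);
      if (it == outstanding_events.end()) {  // first use: create the event
        cudaEvent_t e;
        cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
        it = outstanding_events.emplace(stream, e).first;
      }
      cudaEventRecord(it->second, stream);  // re-arm at the stream's tail
    }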
@@ -61,147 +116,80 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
 
 Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
-  ProcessEventsAndFree();
+  ProcessUnfreedAllocations();
   AllocationPtr underlying_allocation;
   try {
     underlying_allocation = underlying_allocator_->Allocate(size);
   } catch (BadAlloc&) {
-    VLOG(9) << "Allocation failed when allocating " << size << " bytes";
-    uint64_t release_size = ReleaseImpl(place_);
-    VLOG(9) << "Release " << release_size << " bytes memory from all streams";
+    VLOG(4) << "Allocation failed when allocating " << size << " bytes";
+    ReleaseImpl(place_);
     try {
       underlying_allocation = underlying_allocator_->Allocate(size);
     } catch (...) {
-      VLOG(9) << "Still allocation failed after release memory";
+      VLOG(3)
+          << "Still allocation failed after release memory from all streams";
       throw;
     }
   } catch (...) {
     throw;
   }
+
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       std::move(underlying_allocation), default_stream_);
+  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
+          << allocation->ptr();
   return allocation;
 }
 
 void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) {
-  if (dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
-          ->GetRecordedStreams()
-          ->empty()) {
-    delete allocation;
+  StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
+      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
+  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
+                          platform::errors::InvalidArgument(
+                              "Failed to dynamic cast %p from Allocation* to "
+                              "StreamSafeCUDAAllocation*",
+                              allocation));
+  VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
+  if (stream_safe_cuda_allocation->CanBeFreed()) {
+    delete stream_safe_cuda_allocation;
   } else {
-    std::lock_guard<SpinLock> lock_guard(outstanding_events_map_lock_);
-    FreeStreamSafeCUDAAllocation(allocation);
+    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
+    unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }
 
 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
-  std::lock_guard<SpinLock> lock_guard(allocators_map_lock_);
+  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators =
-      allocators_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
-  uint64_t release_size = 0;
+      allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
+  uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    release_size += allocator->ProcessEventsAndFreeWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
   }
-  VLOG(8) << "Release " << release_size << " bytes memory from all streams";
-  return release_size;
+  VLOG(8) << "Release " << released_size
+          << " bytes memory from all stream for place " << place;
+  return released_size;
 }
 
-void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream(
-    std::set<gpuStream_t>* recorded_streams,
-    std::deque<gpuEvent_t>* outstanding_events) {
-  for (gpuStream_t stream : *recorded_streams) {
-    gpuEvent_t event;
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&event, hipEventDisableTiming));
-    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream));
-#endif
-    outstanding_events->emplace_back(event);
-    VLOG(9) << "Record event " << event << " in stream " << stream;
-  }
-  recorded_streams->clear();
-}
-
-void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation(
-    Allocation* allocation) {
-  std::deque<gpuEvent_t>& outstanding_events =
-      outstanding_events_map_[allocation];
-  CreateEventForAllRecordedStream(
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
-          ->GetRecordedStreams()
-          .get(),
-      &outstanding_events);
-  if (!outstanding_events.empty()) {
-    VLOG(8) << allocation->ptr() << " is not ready to free";
-    return;
-  }
-  VLOG(8) << "Free " << allocation->ptr();
-  outstanding_events_map_.erase(allocation);
-  delete allocation;
-}
-
-void StreamSafeCUDAAllocator::ProcessEventsAndFree() {
-  std::lock_guard<SpinLock> lock_guard(outstanding_events_map_lock_);
-  for (auto map_it = outstanding_events_map_.begin();
-       map_it != outstanding_events_map_.end();) {
-    std::deque<gpuEvent_t>& outstanding_events = map_it->second;
-    VLOG(10) << "Check " << outstanding_events.size()
-             << " outstanding events for " << map_it->first->ptr();
-    auto deque_it = outstanding_events.begin();
-    while (deque_it != outstanding_events.end()) {
-#ifdef PADDLE_WITH_CUDA
-      gpuError_t err = cudaEventQuery(*deque_it);
-      if (err == cudaErrorNotReady) {
-        VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
-                 << " is not completed";
-        outstanding_events.erase(outstanding_events.begin(), deque_it);
-        break;
-      }
-      PADDLE_ENFORCE_GPU_SUCCESS(err);
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(*deque_it));
-#else
-      gpuError_t err = hipEventQuery(*deque_it);
-      if (err == hipErrorNotReady) {
-        VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
-                 << " is not completed";
-        // Erase the completed event before "deque_it"
-        outstanding_events.erase(outstanding_events.begin(), deque_it);
-        break;
-      }
-      PADDLE_ENFORCE_GPU_SUCCESS(err);
-      PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(*deque_it));
-#endif
-      ++deque_it;
-    }
-
-    if (deque_it == outstanding_events.end()) {
-      outstanding_events.clear();
-      Allocation* allocation = map_it->first;
-      // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation
-      auto next_it = ++map_it;
-      FreeStreamSafeCUDAAllocation(allocation);
-      map_it = next_it;
-    } else {
-      ++map_it;
+void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
+  for (auto it = unfreed_allocations_.begin();
+       it != unfreed_allocations_.end();) {
+    if ((*it)->CanBeFreed()) {
+      delete *it;
+      it = unfreed_allocations_.erase(it);
+    } else {
+      ++it;
     }
   }
 }
 
-uint64_t StreamSafeCUDAAllocator::ProcessEventsAndFreeWithRelease() {
-  ProcessEventsAndFree();
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+  ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
 
 std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
-    StreamSafeCUDAAllocator::allocators_map_;
-SpinLock StreamSafeCUDAAllocator::allocators_map_lock_;
+    StreamSafeCUDAAllocator::allocator_map_;
+SpinLock StreamSafeCUDAAllocator::allocator_map_lock_;
 
 }  // namespace allocation
 }  // namespace memory
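FreeImpl and ProcessUnfreedAllocations together implement a deferred-free queue: an allocation whose events are still pending is parked on unfreed_allocations_, and every later AllocateImpl sweeps the list, deleting entries whose CanBeFreed() has turned true. The sweep idiom, isolated (editor's sketch; erase() hands back the next iterator, so the loop stays valid either way):

    for (auto it = unfreed.begin(); it != unfreed.end();) {
      if ((*it)->CanBeFreed()) {  // all recorded events completed?
        delete *it;               // release the memory for real
        it = unfreed.erase(it);   // advance via the returned iterator
      } else {
        ++it;                     // still in flight; retry on the next sweep
      }
    }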
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h

@@ -13,21 +13,21 @@
 // limitations under the License.
 
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#else
+#include <hip/hip_runtime.h>
+#endif
+
 #include <deque>
+#include <list>
 #include <map>
-#include <memory>
 #include <mutex>
-#include <set>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-#else
-#include <hip/hip_runtime.h>
-#endif
 
 namespace paddle {
 namespace memory {
 namespace allocation {

@@ -36,21 +36,23 @@ class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(AllocationPtr underlying_allocation,
                            gpuStream_t owning_stream);
-  void RecordStream(gpuStream_t stream);
-  std::shared_ptr<std::set<gpuStream_t>> GetRecordedStreams();
+  void RecordStream(const gpuStream_t& stream);
+  bool CanBeFreed();
+  const gpuStream_t& GetOwningStream() const;
 
  private:
   AllocationPtr underlying_allocation_;
+  std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
-  std::shared_ptr<std::set<gpuStream_t>> recorded_streams_;
-  SpinLock spin_lock_;
+  SpinLock outstanding_event_map_lock_;
 };
 
 class StreamSafeCUDAAllocator : public Allocator {
  public:
-  StreamSafeCUDAAllocator(
-      const std::shared_ptr<Allocator>& underlying_allocator,
-      const platform::CUDAPlace& place, const gpuStream_t default_stream);
+  StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
+                          platform::CUDAPlace place,
+                          gpuStream_t default_stream);
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;

@@ -60,22 +62,18 @@ class StreamSafeCUDAAllocator : public Allocator {
   uint64_t ReleaseImpl(const platform::Place& place) override;
 
  private:
-  void CreateEventForAllRecordedStream(
-      std::set<gpuStream_t>* recorded_streams,
-      std::deque<gpuEvent_t>* outstanding_events);
-  void FreeStreamSafeCUDAAllocation(Allocation* allocation);
-  void ProcessEventsAndFree();
-  uint64_t ProcessEventsAndFreeWithRelease();
+  void ProcessUnfreedAllocations();
+  uint64_t ProcessUnfreedAllocationsWithRelease();
 
   static std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
-      allocators_map_;
-  static SpinLock allocators_map_lock_;
+      allocator_map_;
+  static SpinLock allocator_map_lock_;
 
   std::shared_ptr<Allocator> underlying_allocator_;
   platform::CUDAPlace place_;
   gpuStream_t default_stream_;
-  std::map<Allocation*, std::deque<gpuEvent_t>> outstanding_events_map_;
-  SpinLock outstanding_events_map_lock_;
+  std::list<StreamSafeCUDAAllocation*> unfreed_allocations_;
+  SpinLock unfreed_allocation_lock_;
 };
 
 }  // namespace allocation
paddle/fluid/memory/malloc.cc

@@ -50,10 +50,16 @@ uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
 }
 
-void RecordStream(Allocation* allocation, const gpuStream_t& stream) {
+void RecordStream(std::shared_ptr<Allocation> allocation,
+                  const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().RecordStream(allocation,
                                                               stream);
 }
 
+const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation) {
+  return allocation::AllocatorFacade::Instance().GetStream(allocation);
+}
+
 #endif
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/malloc.h

@@ -51,7 +51,10 @@ extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
 extern uint64_t Release(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
 
-void RecordStream(Allocation* allocation, const gpuStream_t& stream);
+void RecordStream(std::shared_ptr<Allocation> allocation,
+                  const gpuStream_t& stream);
+
+const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation);
 
 #endif
 }  // namespace memory
 }  // namespace paddle
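With the declarations above, cross-stream bookkeeping stays cheap: RecordStream returns early for the owning stream, so callers can register only genuinely foreign streams. A guard sketched with the new GetStream (editor's illustration, not part of the diff; buf and current_stream are assumed to exist):

    // Only cross-stream consumers need registering; same-stream use is free.
    if (GetStream(buf) != current_stream) {
      RecordStream(buf, current_stream);
    }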
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu

@@ -29,14 +29,16 @@
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace memory {
 
 __global__ void add_kernel(int* x, int n) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
-    atomicAdd(x + i, tid);
+  int thread_num = gridDim.x * blockDim.x;
+  int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = thread_id; i < n; i += thread_num) {
+    atomicAdd(x + i, thread_id);
   }
 }
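The rewritten kernel is a grid-stride loop: the thread with index thread_id touches elements thread_id, thread_id + thread_num, and so on, so one launch adds j % thread_num to x[j]. Every workspace receives one launch from each of the stream_num_ streams, so the final value is (j % thread_num) * stream_num_, which is exactly what CheckResult asserts below. As host-side arithmetic (editor's sketch):

    // Expected workspace contents after MultiStreamRun.
    int expected(size_t j, size_t thread_num, size_t stream_num) {
      return static_cast<int>((j % thread_num) * stream_num);
    }
    // With grid_num_ = 1, block_num_ = 32, stream_num_ = 64:
    // expected(33, 32, 64) == 1 * 64 == 64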
@@ -54,26 +56,21 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
     place_ = platform::CUDAPlace();
     stream_num_ = 64;
     grid_num_ = 1;
-    block_num_ = 64;
-    data_num_ = 64;
-    default_stream = nullptr;
+    block_num_ = 32;
+    data_num_ = 131072;
+    workspace_size_ = data_num_ * sizeof(int);
 
-    streams_.reserve(stream_num_);
-    streams_.emplace_back(default_stream);
-    for (size_t i = 1; i < stream_num_; ++i) {
+    // alloc workspace for each stream
+    for (size_t i = 0; i < stream_num_; ++i) {
       gpuStream_t stream;
 #ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
 #else
       PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
 #endif
-      streams_.emplace_back(stream);
-    }
 
-    for (size_t i = 0; i < stream_num_; ++i) {
-      size_t allocation_size = data_num_ * sizeof(int);
       std::shared_ptr<Allocation> allocation =
-          AllocShared(place_, allocation_size, streams_[i]);
+          AllocShared(place_, workspace_size_, stream);
 #ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(
           cudaMemset(allocation->ptr(), 0, allocation->size()));
@@ -81,25 +78,45 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
       PADDLE_ENFORCE_GPU_SUCCESS(
           hipMemset(allocation->ptr(), 0, allocation->size()));
 #endif
-      allocations_.emplace_back(allocation);
+      streams_.emplace_back(stream);
+      workspaces_.emplace_back(allocation);
     }
+
+    result_ = AllocShared(place_, stream_num_ * workspace_size_);
   }
 
   void SingleStreamRun(size_t idx) {
+    // for all stream i,
+    // stream idx launch a kernel to add (j % thread_num) to workspaces_[i][j]
     for (size_t i = 0; i < stream_num_; ++i) {
-      int* x = reinterpret_cast<int*>(allocations_[i]->ptr());
+      int* x = reinterpret_cast<int*>(workspaces_[i]->ptr());
       add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
       if (i != idx) {
-        RecordStream(allocations_[i].get(), streams_[idx]);
+        RecordStream(workspaces_[i], streams_[idx]);
       }
     }
   }
 
+  void CopyResultAsync() {
+    for (size_t i = 0; i < stream_num_; ++i) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
+          reinterpret_cast<int*>(result_->ptr()) + i * data_num_,
+          workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
+          reinterpret_cast<int*>(result_->ptr()) + i * data_num_,
+          workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
+#endif
+    }
+  }
+
   void MultiStreamRun() {
-    for (int i = 0; i < stream_num_; ++i) {
+    for (size_t i = 0; i < stream_num_; ++i) {
       SingleStreamRun(i);
     }
-    allocations_.clear();  // fast_gc
+    CopyResultAsync();
+    workspaces_.clear();  // fast_gc
     cudaDeviceSynchronize();
   }
 
   void MultiThreadMUltiStreamRun() {
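Note what MultiStreamRun relies on: workspaces_.clear() drops the last reference to every workspace (the fast_gc comment) while kernels launched from the other streams may still be running. That is only safe because SingleStreamRun called RecordStream for each cross-stream use, so the allocator defers the real free until the recorded events fire. The pattern in isolation (editor's sketch; buf, other_stream, grid, block, and n are assumed):

    add_kernel<<<grid, block, 0, other_stream>>>(
        static_cast<int*>(buf->ptr()), n);
    RecordStream(buf, other_stream);  // register the in-flight consumer
    buf.reset();  // deferred free; reclaimed once other_stream's work ends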
@@ -111,28 +128,30 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
     for (size_t i = 0; i < stream_num_; ++i) {
       threads[i].join();
     }
-    allocations_.clear();  // fast_gc
+    CopyResultAsync();
+    workspaces_.clear();  // fast_gc
     cudaDeviceSynchronize();
   }
 
   void CheckResult() {
-    auto host_x = std::unique_ptr<int[]>(new int[data_num_]);
-    size_t thread_num = grid_num_ * block_num_;
-    for (int i = 0; i < stream_num_; ++i) {
-      // tricky code, the allocations are still accessible even though
-      // allocations_.clear() has been called
+    auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
 #ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(host_x.get(), allocations_[i]->ptr(),
-                                            data_num_ * sizeof(int),
-                                            cudaMemcpyDeviceToHost));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
+                                          result_->size(),
+                                          cudaMemcpyDeviceToHost));
 #else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(host_x.get(), allocations_[i]->ptr(),
-                                           data_num_ * sizeof(int),
-                                           hipMemcpyDeviceToHost));
+    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
+                                         result_->size(),
+                                         hipMemcpyDeviceToHost));
 #endif
-      for (int j = 0; j < data_num_; ++j) {
-        EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_);
+    size_t thread_num = grid_num_ * block_num_;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      for (size_t j = 0; j < data_num_; ++j) {
+        EXPECT_TRUE(result_host[i * stream_num_ + j] ==
+                    (j % thread_num) * stream_num_);
       }
     }
+    result_.reset();
   }
 
   void TearDown() override {
@@ -160,10 +179,11 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
...
@@ -160,10 +179,11 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
size_t
grid_num_
;
size_t
grid_num_
;
size_t
block_num_
;
size_t
block_num_
;
size_t
data_num_
;
size_t
data_num_
;
size_t
workspace_size_
;
platform
::
CUDAPlace
place_
;
platform
::
CUDAPlace
place_
;
gpuStream_t
default_stream
;
std
::
vector
<
gpuStream_t
>
streams_
;
std
::
vector
<
gpuStream_t
>
streams_
;
std
::
vector
<
std
::
shared_ptr
<
Allocation
>>
allocations_
;
std
::
vector
<
std
::
shared_ptr
<
Allocation
>>
workspaces_
;
std
::
shared_ptr
<
Allocation
>
result_
;
};
};
TEST_F
(
StreamSafeCUDAAllocTest
,
CUDAMutilStreamTest
)
{
TEST_F
(
StreamSafeCUDAAllocTest
,
CUDAMutilStreamTest
)
{
@@ -187,7 +207,10 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
   void* address = allocation_implicit_stream->ptr();
   allocation_implicit_stream.reset();
-  gpuStream_t default_stream = nullptr;
+  gpuStream_t default_stream =
+      dynamic_cast<platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(place))
+          ->stream();
   allocation::AllocationPtr allocation_unique =
       Alloc(place, alloc_size, default_stream);
   EXPECT_GE(allocation_unique->size(), alloc_size);
@@ -220,6 +243,41 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
   CheckMemLeak(place);
 }
 
+TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
+  platform::CUDAPlace place = platform::CUDAPlace();
+  size_t alloc_size = 256;
+
+  gpuStream_t default_stream =
+      dynamic_cast<platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(place))
+          ->stream();
+  std::shared_ptr<Allocation> allocation_implicit_stream =
+      AllocShared(place, alloc_size);
+  EXPECT_EQ(GetStream(allocation_implicit_stream), default_stream);
+
+  gpuStream_t new_stream;
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&new_stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&new_stream));
+#endif
+
+  std::shared_ptr<Allocation> allocation_new_stream =
+      AllocShared(place, alloc_size, new_stream);
+  EXPECT_EQ(GetStream(allocation_new_stream), new_stream);
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(new_stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(new_stream));
+#endif
+
+  allocation_implicit_stream.reset();
+  allocation_new_stream.reset();
+  Release(place);
+  CheckMemLeak(place);
+}
+
 #ifdef PADDLE_WITH_CUDA
 TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
@@ -237,8 +295,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
   EXPECT_THROW(Alloc(place, alloc_size, nullptr),
                paddle::platform::EnforceNotMet);
   EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(RecordStream(allocation.get(), nullptr),
+  EXPECT_THROW(RecordStream(allocation, nullptr),
                paddle::platform::EnforceNotMet);
+  EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
 
   platform::EndCUDAGraphCapture();
   allocation.reset();
@@ -258,7 +317,8 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream2));
 #endif
   size_t available_size = platform::GpuAvailableMemToAlloc();
-  // alloc_size < available_size < 2 * alloc_size
+  // alloc_size < available_size < 2 * alloc_size,
+  // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;
   std::shared_ptr<Allocation> allocation1 =