Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
69875dc4
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
69875dc4
编写于
2月 01, 2021
作者:
Q
Qi Li
提交者:
GitHub
2月 01, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid memory for rocm35 (part1), test=develop (#30758)
上级
5b1ab51c
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
138 addition
and
62 deletion
+138
-62
paddle/fluid/memory/CMakeLists.txt
paddle/fluid/memory/CMakeLists.txt
+6
-0
paddle/fluid/memory/allocation/allocator_facade.cc
paddle/fluid/memory/allocation/allocator_facade.cc
+7
-7
paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
...luid/memory/allocation/allocator_facade_abs_flags_test.cc
+3
-3
paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
...uid/memory/allocation/allocator_facade_frac_flags_test.cc
+3
-3
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
.../allocation/auto_growth_best_fit_allocator_facade_test.cc
+4
-4
paddle/fluid/memory/allocation/cuda_allocator.cc
paddle/fluid/memory/allocation/cuda_allocator.cc
+9
-1
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
...e/fluid/memory/allocation/cuda_device_context_allocator.h
+18
-6
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+17
-13
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
.../fluid/memory/allocation/naive_best_fit_allocator_test.cc
+1
-1
paddle/fluid/memory/allocation/pinned_allocator.cc
paddle/fluid/memory/allocation/pinned_allocator.cc
+8
-0
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+2
-2
paddle/fluid/memory/detail/CMakeLists.txt
paddle/fluid/memory/detail/CMakeLists.txt
+5
-3
paddle/fluid/memory/detail/buddy_allocator.cc
paddle/fluid/memory/detail/buddy_allocator.cc
+2
-2
paddle/fluid/memory/detail/buddy_allocator_test.cc
paddle/fluid/memory/detail/buddy_allocator_test.cc
+20
-2
paddle/fluid/memory/detail/system_allocator.cc
paddle/fluid/memory/detail/system_allocator.cc
+20
-6
paddle/fluid/memory/detail/system_allocator.h
paddle/fluid/memory/detail/system_allocator.h
+1
-1
paddle/fluid/memory/detail/system_allocator_test.cc
paddle/fluid/memory/detail/system_allocator_test.cc
+5
-1
paddle/fluid/memory/memcpy.cc
paddle/fluid/memory/memcpy.cc
+5
-5
paddle/fluid/memory/memcpy.h
paddle/fluid/memory/memcpy.h
+2
-2
未找到文件。
paddle/fluid/memory/CMakeLists.txt
浏览文件 @
69875dc4
...
...
@@ -19,6 +19,12 @@ if (WITH_GPU)
DEPS device_context malloc
)
endif
()
if
(
WITH_ROCM
)
hip_test
(
malloc_test
SRCS malloc_test.cu
DEPS device_context malloc
)
endif
()
#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif()
paddle/fluid/memory/allocation/allocator_facade.cc
浏览文件 @
69875dc4
...
...
@@ -31,7 +31,7 @@
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...
...
@@ -70,7 +70,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator
(
platform
::
XPUPlace
(
dev_id
));
}
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetCUDADeviceCount
();
++
dev_id
)
{
InitNaiveBestFitCUDAAllocator
(
platform
::
CUDAPlace
(
dev_id
));
...
...
@@ -87,7 +87,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator
(
platform
::
XPUPlace
(
dev_id
));
}
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetCUDADeviceCount
();
++
dev_id
)
{
InitAutoGrowthCUDAAllocator
(
platform
::
CUDAPlace
(
dev_id
));
...
...
@@ -104,7 +104,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator
(
platform
::
XPUPlace
(
dev_id
));
}
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for
(
int
dev_id
=
0
;
dev_id
<
platform
::
GetCUDADeviceCount
();
++
dev_id
)
{
InitThreadLocalCUDAAllocator
(
platform
::
CUDAPlace
(
dev_id
));
...
...
@@ -152,7 +152,7 @@ class AllocatorFacadePrivate {
system_allocators_
[
p
]
=
std
::
make_shared
<
NaiveBestFitAllocator
>
(
p
);
}
#endif
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
system_allocators_
[
platform
::
CUDAPinnedPlace
()]
=
std
::
make_shared
<
CPUPinnedAllocator
>
();
int
device_count
=
platform
::
GetCUDADeviceCount
();
...
...
@@ -168,7 +168,7 @@ class AllocatorFacadePrivate {
std
::
make_shared
<
NaiveBestFitAllocator
>
(
platform
::
CPUPlace
());
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void
InitNaiveBestFitCUDAPinnedAllocator
()
{
allocators_
[
platform
::
CUDAPinnedPlace
()]
=
std
::
make_shared
<
NaiveBestFitAllocator
>
(
platform
::
CUDAPinnedPlace
());
...
...
@@ -215,7 +215,7 @@ class AllocatorFacadePrivate {
void
InitZeroSizeAllocators
()
{
std
::
vector
<
platform
::
Place
>
places
;
places
.
emplace_back
(
platform
::
CPUPlace
());
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int
device_count
=
platform
::
GetCUDADeviceCount
();
for
(
int
dev_id
=
0
;
dev_id
<
device_count
;
++
dev_id
)
{
places
.
emplace_back
(
platform
::
CUDAPlace
(
dev_id
));
...
...
paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
浏览文件 @
69875dc4
...
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DECLARE_double
(
fraction_of_cuda_pinned_memory_to_use
);
DECLARE_uint64
(
initial_gpu_memory_in_mb
);
...
...
@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ
(
cpu_allocation
->
size
(),
size
);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
place
=
platform
::
CUDAPlace
(
0
);
size
=
1024
;
...
...
@@ -81,7 +81,7 @@ void AllocateTestCases() {
}
TEST
(
Allocator
,
SpecifyGpuMemory
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb
FLAGS_fraction_of_gpu_memory_to_use
=
0.0
;
...
...
paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
浏览文件 @
69875dc4
...
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DECLARE_double
(
fraction_of_cuda_pinned_memory_to_use
);
DECLARE_uint64
(
initial_gpu_memory_in_mb
);
...
...
@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ
(
cpu_allocation
->
size
(),
size
);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
place
=
platform
::
CUDAPlace
(
0
);
size
=
1024
;
...
...
@@ -81,7 +81,7 @@ void AllocateTestCases() {
}
TEST
(
Allocator
,
Allocator
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use
=
0.01
;
FLAGS_gpu_allocator_retry_time
=
500
;
FLAGS_fraction_of_cuda_pinned_memory_to_use
=
0.5
;
...
...
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc
浏览文件 @
69875dc4
...
...
@@ -22,7 +22,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/gpu_info.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DECLARE_double
(
fraction_of_cuda_pinned_memory_to_use
);
DECLARE_int64
(
gpu_allocator_retry_time
);
...
...
@@ -40,7 +40,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) {
}
TEST
(
allocator
,
allocator
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use
=
0.01
;
FLAGS_gpu_allocator_retry_time
=
500
;
FLAGS_fraction_of_cuda_pinned_memory_to_use
=
0.5
;
...
...
@@ -62,7 +62,7 @@ TEST(allocator, allocator) {
ASSERT_EQ
(
cpu_allocation
->
size
(),
AlignedSize
(
size
,
1024
));
}
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
{
place
=
platform
::
CUDAPlace
(
0
);
size
=
1024
;
...
...
@@ -101,7 +101,7 @@ TEST(allocator, allocator) {
TEST
(
multithread_allocate
,
test_segfault
)
{
FLAGS_allocator_strategy
=
"auto_growth"
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std
::
mutex
mtx
;
std
::
condition_variable
cv
;
bool
flag
=
false
;
...
...
paddle/fluid/memory/allocation/cuda_allocator.cc
浏览文件 @
69875dc4
...
...
@@ -13,8 +13,16 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -39,7 +47,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
void
*
ptr
;
auto
result
=
platform
::
RecordedCudaMalloc
(
&
ptr
,
size
,
place_
.
device
);
if
(
LIKELY
(
result
==
cuda
Success
))
{
if
(
LIKELY
(
result
==
gpu
Success
))
{
return
new
Allocation
(
ptr
,
size
,
platform
::
Place
(
place_
));
}
...
...
paddle/fluid/memory/allocation/cuda_device_context_allocator.h
浏览文件 @
69875dc4
...
...
@@ -14,8 +14,6 @@
#pragma once
#include <cuda_runtime.h>
#include <map>
#include <memory>
#include <utility>
...
...
@@ -79,17 +77,26 @@ class CUDADeviceContextAllocation : public Allocation {
class
CUDADeviceContextAllocator
:
public
Allocator
{
public:
explicit
CUDADeviceContextAllocator
(
platform
::
CUDAPlace
place
,
cuda
Stream_t
default_stream
)
gpu
Stream_t
default_stream
)
:
place_
(
place
),
default_stream_
(
default_stream
)
{
platform
::
CUDADeviceGuard
guard
(
place_
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventCreateWithFlags
(
&
event_
,
hipEventDisableTiming
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreate
(
&
event_
,
cudaEventDisableTiming
));
#endif
}
~
CUDADeviceContextAllocator
()
{
if
(
event_
)
{
platform
::
CUDADeviceGuard
guard
(
place_
.
device
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventDestroy
(
event_
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventDestroy
(
event_
));
#endif
}
}
...
...
@@ -102,10 +109,15 @@ class CUDADeviceContextAllocator : public Allocator {
platform
::
CUDADeviceGuard
guard
(
place_
.
device
);
auto
allocation
=
new
CUDADeviceContextAllocation
(
memory
::
Alloc
(
place_
,
size
));
// Wait for the event on stream
// Wait for the event on stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipEventRecord
(
event_
,
default_stream_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipStreamWaitEvent
(
default_stream_
,
event_
,
0
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event_
,
default_stream_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaStreamWaitEvent
(
default_stream_
,
event_
,
0
));
#endif
return
allocation
;
}
...
...
@@ -113,8 +125,8 @@ class CUDADeviceContextAllocator : public Allocator {
private:
platform
::
CUDAPlace
place_
;
cuda
Event_t
event_
{
nullptr
};
cuda
Stream_t
default_stream_
{
nullptr
};
gpu
Event_t
event_
{
nullptr
};
gpu
Stream_t
default_stream_
{
nullptr
};
};
/**
...
...
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
浏览文件 @
69875dc4
...
...
@@ -26,7 +26,7 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
...
...
@@ -216,7 +216,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class
GPUBuddyAllocatorList
{
private:
GPUBuddyAllocatorList
()
:
devices_
(
platform
::
GetSelectedDevices
())
{
...
...
@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
template
<
>
size_t
Used
<
platform
::
CUDAPlace
>
(
const
platform
::
CUDAPlace
&
place
)
{
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
return
GetGPUBuddyAllocator
(
place
.
device
)
->
Used
();
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -294,7 +294,7 @@ size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
template
<
>
void
*
Alloc
<
platform
::
CUDAPlace
>
(
const
platform
::
CUDAPlace
&
place
,
size_t
size
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
*
buddy_allocator
=
GetGPUBuddyAllocator
(
place
.
device
);
auto
*
ptr
=
buddy_allocator
->
Alloc
(
size
);
if
(
ptr
==
nullptr
)
{
...
...
@@ -311,7 +311,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
string
::
HumanReadableSize
(
Used
<
platform
::
CUDAPlace
>
(
place
))));
}
else
{
if
(
FLAGS_init_allocated_mem
)
{
#ifdef PADDLE_WITH_HIP
hipMemset
(
ptr
,
0xEF
,
size
);
#else
cudaMemset
(
ptr
,
0xEF
,
size
);
#endif
}
}
return
ptr
;
...
...
@@ -324,7 +328,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
template
<
>
void
Free
<
platform
::
CUDAPlace
>
(
const
platform
::
CUDAPlace
&
place
,
void
*
p
,
size_t
size
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator
(
place
.
device
)
->
Free
(
p
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -334,7 +338,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
template
<
>
uint64_t
Release
<
platform
::
CUDAPlace
>
(
const
platform
::
CUDAPlace
&
place
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return
GetGPUBuddyAllocator
(
place
.
device
)
->
Release
();
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -342,7 +346,7 @@ uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#endif
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
BuddyAllocator
*
GetCUDAPinnedBuddyAllocator
()
{
static
std
::
once_flag
init_flag
;
static
BuddyAllocator
*
ba
=
nullptr
;
...
...
@@ -360,7 +364,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
template
<
>
size_t
Used
<
platform
::
CUDAPinnedPlace
>
(
const
platform
::
CUDAPinnedPlace
&
place
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return
GetCUDAPinnedBuddyAllocator
()
->
Used
();
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -371,7 +375,7 @@ size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
template
<
>
void
*
Alloc
<
platform
::
CUDAPinnedPlace
>
(
const
platform
::
CUDAPinnedPlace
&
place
,
size_t
size
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto
*
buddy_allocator
=
GetCUDAPinnedBuddyAllocator
();
void
*
ptr
=
buddy_allocator
->
Alloc
(
size
);
...
...
@@ -392,7 +396,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template
<
>
void
Free
<
platform
::
CUDAPinnedPlace
>
(
const
platform
::
CUDAPinnedPlace
&
place
,
void
*
p
,
size_t
size
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator
()
->
Free
(
p
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -403,7 +407,7 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template
<
>
uint64_t
Release
<
platform
::
CUDAPinnedPlace
>
(
const
platform
::
CUDAPinnedPlace
&
place
)
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return
GetCUDAPinnedBuddyAllocator
()
->
Release
();
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -449,7 +453,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const {
}
size_t
Usage
::
operator
()(
const
platform
::
CUDAPlace
&
gpu
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return
Used
(
gpu
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
@@ -458,7 +462,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
}
size_t
Usage
::
operator
()(
const
platform
::
CUDAPinnedPlace
&
cuda_pinned
)
const
{
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return
Used
(
cuda_pinned
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
...
...
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
浏览文件 @
69875dc4
...
...
@@ -41,7 +41,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) {
alloc
.
Release
(
platform
::
CPUPlace
());
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST
(
NaiveBestFitAllocatorTest
,
GpuAlloc
)
{
NaiveBestFitAllocator
alloc
{
platform
::
CUDAPlace
(
0
)};
{
...
...
paddle/fluid/memory/allocation/pinned_allocator.cc
浏览文件 @
69875dc4
...
...
@@ -19,12 +19,20 @@ namespace memory {
namespace
allocation
{
bool
CPUPinnedAllocator
::
IsAllocThreadSafe
()
const
{
return
true
;
}
void
CPUPinnedAllocator
::
FreeImpl
(
Allocation
*
allocation
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipHostFree
(
allocation
->
ptr
()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaFreeHost
(
allocation
->
ptr
()));
#endif
delete
allocation
;
}
Allocation
*
CPUPinnedAllocator
::
AllocateImpl
(
size_t
size
)
{
void
*
ptr
;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipHostMalloc
(
&
ptr
,
size
,
hipHostMallocPortable
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaHostAlloc
(
&
ptr
,
size
,
cudaHostAllocPortable
));
#endif
return
new
Allocation
(
ptr
,
size
,
platform
::
CUDAPinnedPlace
());
}
}
// namespace allocation
...
...
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
69875dc4
...
...
@@ -26,7 +26,7 @@
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif
...
...
@@ -127,7 +127,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
}
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
platform
::
CUDAPlace
p
(
0
);
RetryAllocator
allocator
(
std
::
make_shared
<
CUDAAllocator
>
(
p
),
retry_ms
);
...
...
paddle/fluid/memory/detail/CMakeLists.txt
浏览文件 @
69875dc4
...
...
@@ -2,11 +2,13 @@ include(ExternalProject)
cc_library
(
memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place
)
if
(
${
WITH_GPU
}
)
if
(
WITH_GPU
)
nv_library
(
system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place
)
else
(
${
WITH_GPU
}
)
elseif
(
WITH_ROCM
)
hip_library
(
system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place
)
else
()
cc_library
(
system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place
)
endif
(
${
WITH_GPU
}
)
endif
()
cc_test
(
system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator
)
...
...
paddle/fluid/memory/detail/buddy_allocator.cc
浏览文件 @
69875dc4
...
...
@@ -18,7 +18,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "glog/logging.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_uint64
(
reallocate_gpu_memory_in_mb
);
#endif
...
...
@@ -220,7 +220,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t
allocate_bytes
=
max_chunk_size_
;
size_t
index
=
0
;
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
system_allocator_
->
UseGpu
())
{
if
((
total_used_
+
total_free_
)
==
0
)
{
// Compute the allocation size for gpu for the first allocation.
...
...
paddle/fluid/memory/detail/buddy_allocator_test.cc
浏览文件 @
69875dc4
...
...
@@ -23,7 +23,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <fstream>
#include <string>
...
...
@@ -76,7 +76,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
return
nullptr
;
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST
(
BuddyAllocator
,
GpuFraction
)
{
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use
=
0.01
;
...
...
@@ -195,8 +195,13 @@ TEST(BuddyAllocator, AllocFromAvailable) {
// Take half of available GPU
void
*
p
;
#ifdef PADDLE_WITH_HIP
hipError_t
result
=
hipMalloc
(
&
p
,
available
>>
1
);
EXPECT_TRUE
(
result
==
hipSuccess
);
#else
cudaError_t
result
=
cudaMalloc
(
&
p
,
available
>>
1
);
EXPECT_TRUE
(
result
==
cudaSuccess
);
#endif
// BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator
buddy_allocator
(
...
...
@@ -209,7 +214,11 @@ TEST(BuddyAllocator, AllocFromAvailable) {
TestBuddyAllocator
(
&
buddy_allocator
,
static_cast
<
size_t
>
(
1
<<
30
));
if
(
p
)
{
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE
(
hipFree
(
p
)
==
hipSuccess
);
#else
EXPECT_TRUE
(
cudaFree
(
p
)
==
cudaSuccess
);
#endif
}
}
...
...
@@ -219,7 +228,12 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_reallocate_gpu_memory_in_mb
=
0
;
void
*
p
=
nullptr
;
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE
(
hipMalloc
(
&
p
,
static_cast
<
size_t
>
(
1
)
<<
30
)
==
hipSuccess
);
#else
EXPECT_TRUE
(
cudaMalloc
(
&
p
,
static_cast
<
size_t
>
(
1
)
<<
30
)
==
cudaSuccess
);
#endif
// BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator
buddy_allocator
(
...
...
@@ -230,7 +244,11 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
TestBuddyAllocator
(
&
buddy_allocator
,
static_cast
<
size_t
>
(
1
)
<<
30
);
if
(
p
)
{
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE
(
hipFree
(
p
)
==
hipSuccess
);
#else
EXPECT_TRUE
(
cudaFree
(
p
)
==
cudaSuccess
);
#endif
}
}
...
...
paddle/fluid/memory/detail/system_allocator.cc
浏览文件 @
69875dc4
...
...
@@ -35,7 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
...
...
@@ -111,7 +111,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
bool
CPUAllocator
::
UseGpu
()
const
{
return
false
;
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void
*
GPUAllocator
::
Alloc
(
size_t
*
index
,
size_t
size
)
{
// CUDA documentation doesn't explain if cudaMalloc returns nullptr
...
...
@@ -121,7 +121,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
void
*
p
;
auto
result
=
platform
::
RecordedCudaMalloc
(
&
p
,
size
,
gpu_id_
);
if
(
result
==
cuda
Success
)
{
if
(
result
==
gpu
Success
)
{
*
index
=
0
;
gpu_alloc_size_
+=
size
;
return
p
;
...
...
@@ -193,10 +193,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
}
void
*
p
;
// PINNED memory is visible to all CUDA contexts.
// PINNED memory is visible to all CUDA contexts.
#ifdef PADDLE_WITH_HIP
hipError_t
result
=
hipHostMalloc
(
&
p
,
size
);
#else
cudaError_t
result
=
cudaHostAlloc
(
&
p
,
size
,
cudaHostAllocPortable
);
#endif
if
(
result
==
cuda
Success
)
{
if
(
result
==
gpu
Success
)
{
*
index
=
1
;
// PINNED memory
cuda_pinnd_alloc_size_
+=
size
;
return
p
;
...
...
@@ -209,7 +213,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
}
void
CUDAPinnedAllocator
::
Free
(
void
*
p
,
size_t
size
,
size_t
index
)
{
cuda
Error_t
err
;
gpu
Error_t
err
;
PADDLE_ENFORCE_EQ
(
index
,
1
,
platform
::
errors
::
InvalidArgument
(
"The index should be 1, but got %d"
,
index
));
...
...
@@ -219,6 +223,15 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"allocated cuda pinned memory (%d)"
,
size
,
cuda_pinnd_alloc_size_
));
cuda_pinnd_alloc_size_
-=
size
;
#ifdef PADDLE_WITH_HIP
err
=
hipHostFree
(
p
);
if
(
err
!=
hipErrorDeinitialized
)
{
PADDLE_ENFORCE_EQ
(
err
,
hipSuccess
,
platform
::
errors
::
Fatal
(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d"
,
err
));
}
#else
err
=
cudaFreeHost
(
p
);
// Purposefully allow cudaErrorCudartUnloading, because
...
...
@@ -233,6 +246,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d"
,
err
));
}
#endif
}
bool
CUDAPinnedAllocator
::
UseGpu
()
const
{
return
false
;
}
...
...
paddle/fluid/memory/detail/system_allocator.h
浏览文件 @
69875dc4
...
...
@@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator {
virtual
bool
UseGpu
()
const
;
};
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class
GPUAllocator
:
public
SystemAllocator
{
public:
explicit
GPUAllocator
(
int
gpu_id
)
:
gpu_id_
(
gpu_id
)
{}
...
...
paddle/fluid/memory/detail/system_allocator_test.cc
浏览文件 @
69875dc4
...
...
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
TestAllocator
(
&
a
,
0
);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST
(
GPUAllocator
,
Alloc
)
{
paddle
::
memory
::
detail
::
GPUAllocator
a
(
0
);
TestAllocator
(
&
a
,
2048
);
...
...
@@ -77,7 +77,11 @@ TEST(GPUAllocator, AllocFailure) {
allocator
.
Alloc
(
&
index
,
alloc_size
);
ASSERT_TRUE
(
false
);
}
catch
(
paddle
::
memory
::
allocation
::
BadAlloc
&
)
{
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipGetLastError
());
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaGetLastError
());
#endif
}
}
#endif
paddle/fluid/memory/memcpy.cc
浏览文件 @
69875dc4
...
...
@@ -222,7 +222,7 @@ inline void SyncCUDAStream() {
template
<
>
void
Copy
<
platform
::
CPUPlace
,
platform
::
CUDAPlace
>
(
platform
::
CPUPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
)
{
const
void
*
src
,
size_t
num
,
gpu
Stream_t
stream
)
{
if
(
UNLIKELY
(
num
==
0
))
return
;
platform
::
SetDeviceId
(
src_place
.
device
);
...
...
@@ -244,7 +244,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template
<
>
void
Copy
<
platform
::
CUDAPlace
,
platform
::
CPUPlace
>
(
platform
::
CUDAPlace
dst_place
,
void
*
dst
,
platform
::
CPUPlace
src_place
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
)
{
const
void
*
src
,
size_t
num
,
gpu
Stream_t
stream
)
{
if
(
UNLIKELY
(
num
==
0
))
return
;
platform
::
SetDeviceId
(
dst_place
.
device
);
...
...
@@ -266,7 +266,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template
<
>
void
Copy
<
platform
::
CUDAPlace
,
platform
::
CUDAPlace
>
(
platform
::
CUDAPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
)
{
const
void
*
src
,
size_t
num
,
gpu
Stream_t
stream
)
{
if
(
UNLIKELY
(
num
==
0
))
return
;
VLOG
(
4
)
<<
"memory::Copy "
<<
num
<<
" Bytes from "
<<
src_place
<<
" to "
...
...
@@ -327,7 +327,7 @@ template <>
void
Copy
<
platform
::
CUDAPinnedPlace
,
platform
::
CUDAPlace
>
(
platform
::
CUDAPinnedPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPlace
src_place
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
)
{
gpu
Stream_t
stream
)
{
if
(
UNLIKELY
(
num
==
0
))
return
;
platform
::
SetDeviceId
(
src_place
.
device
);
VLOG
(
4
)
<<
"memory::Copy "
<<
num
<<
" Bytes from "
<<
src_place
<<
" to "
...
...
@@ -345,7 +345,7 @@ template <>
void
Copy
<
platform
::
CUDAPlace
,
platform
::
CUDAPinnedPlace
>
(
platform
::
CUDAPlace
dst_place
,
void
*
dst
,
platform
::
CUDAPinnedPlace
src_place
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
)
{
gpu
Stream_t
stream
)
{
if
(
UNLIKELY
(
num
==
0
))
return
;
platform
::
SetDeviceId
(
dst_place
.
device
);
...
...
paddle/fluid/memory/memcpy.h
浏览文件 @
69875dc4
...
...
@@ -33,7 +33,7 @@ namespace memory {
template
<
typename
DstPlace
,
typename
SrcPlace
>
void
Copy
(
DstPlace
,
void
*
dst
,
SrcPlace
,
const
void
*
src
,
size_t
num
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* \brief Copy memory from one place to another place.
...
...
@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
*/
template
<
typename
DstPlace
,
typename
SrcPlace
>
void
Copy
(
DstPlace
,
void
*
dst
,
SrcPlace
,
const
void
*
src
,
size_t
num
,
cuda
Stream_t
stream
);
gpu
Stream_t
stream
);
#endif
}
// namespace memory
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录