PaddlePaddle / Paddle

Commit 2002e71d
Authored Oct 19, 2018 by sneaxiy
fix pinned allocator
Parent: 21fdf8e8

Showing 18 changed files with 184 additions and 70 deletions (+184 -70)
paddle/fluid/framework/tensor_util.cc                        +1  -2
paddle/fluid/memory/allocation/CMakeLists.txt                +6  -4
paddle/fluid/memory/allocation/allocator_facade.cc           +75 -38
paddle/fluid/memory/allocation/allocator_facade_test.cc      +39 -6
paddle/fluid/memory/allocation/auto_increment_allocator.h    +1  -0
paddle/fluid/memory/allocation/locked_allocator.cc           +1  -0
paddle/fluid/memory/allocation/locked_allocator.h            +1  -0
paddle/fluid/memory/allocation/pinned_allocator.cc           +3  -3
paddle/fluid/memory/allocation/pinned_allocator.h            +1  -1
paddle/fluid/memory/detail/system_allocator.cc               +1  -6
paddle/fluid/memory/malloc.cc                                +27 -2
paddle/fluid/memory/memcpy.cc                                +10 -0
paddle/fluid/platform/cpu_info.cc                            +8  -1
paddle/fluid/platform/cpu_info.h                             +2  -0
paddle/fluid/platform/device_context.cc                      +1  -1
paddle/fluid/platform/init.cc                                +2  -0
paddle/fluid/pybind/tensor_py.h                              +1  -2
python/paddle/fluid/__init__.py                              +4  -4
paddle/fluid/framework/tensor_util.cc

@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
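This pairs with the pinned-allocator rework below: TensorCopySync no longer tags its destination with memory::Allocator::kCrossDevice, since cross-device staging memory is now requested through its own place. A minimal sketch of how a caller could obtain pinned staging memory after this commit, assuming the legacy memory::Alloc template declared in malloc.h; AllocStagingBuffer is a hypothetical helper, not part of this commit:

#include <cstddef>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"

// Hypothetical helper: page-locked host memory now comes from the
// CUDAPinnedPlace allocator instead of a kCrossDevice-tagged CPU allocation.
void* AllocStagingBuffer(size_t num_bytes) {
  return paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), num_bytes);
}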
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()
@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
         auto_increment_allocator
         zero_size_allocator
         conditional_allocator
-        retry_allocator
-        cuda_device_guard)
+        retry_allocator)
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif

-DEFINE_int32(
+DEFINE_int64(
     gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }

   bool IsAllocThreadSafe() const override { return true; }

  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };

-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));

     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
-
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;
@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
       default_allocator_.reset(cond_allocator);
     }
   }

-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();
@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));

-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared<AlignedAllocator<64u>>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }

   bool IsAllocThreadSafe() const override { return true; }

- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };

+#ifdef PADDLE_WITH_CUDA
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
+#endif

 class AllocatorFacadePrivate {
@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }
@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }

+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
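The heart of this file's change: the chunking and retry logic that CUDAManagedAllocator used to hard-code is hoisted into a reusable ChunkedManagedAllocator base, so the new pinned-memory allocator can share it. Each derived allocator computes its constructor arguments (system allocator, max chunk size, capacity, retry time) through private static helpers, which lets device-specific setup such as a CUDADeviceGuard run before delegating to the base constructor. A self-contained sketch of that construction pattern with simplified stand-in types; none of these names are the actual Paddle classes:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>

// Simplified stand-in for the allocator interface.
struct SystemAllocator {
  virtual ~SystemAllocator() = default;
  virtual void* Alloc(size_t n) = 0;
};

// Base class owning the chunking policy; parameters arrive fully computed.
class ChunkedAllocatorBase {
 public:
  ChunkedAllocatorBase(std::unique_ptr<SystemAllocator> system_allocator,
                       size_t max_chunk_size, size_t capacity = 1,
                       int64_t retry_time = -1)
      : system_allocator_(std::move(system_allocator)),
        max_chunk_size_(max_chunk_size),
        capacity_(capacity),
        retry_time_(retry_time) {}

 protected:
  std::unique_ptr<SystemAllocator> system_allocator_;
  size_t max_chunk_size_;
  size_t capacity_;
  int64_t retry_time_;  // <= 0 means "never retry"
};

struct FakeGpuAllocator : SystemAllocator {
  void* Alloc(size_t n) override { return ::operator new(n); }
};

// Derived class: static helpers compute constructor arguments, mirroring
// CUDAManagedAllocator::GetMaxChunkSize / GetCapcity / GetRetryTime above.
class GpuChunkedAllocator : public ChunkedAllocatorBase {
 public:
  explicit GpuChunkedAllocator(int dev_id)
      : ChunkedAllocatorBase(std::make_unique<FakeGpuAllocator>(),
                             GetMaxChunkSize(dev_id), GetCapacity(dev_id)) {}

 private:
  // In Paddle these would query the device under a CUDADeviceGuard.
  static size_t GetMaxChunkSize(int /*dev_id*/) { return 1 << 20; }
  static size_t GetCapacity(int /*dev_id*/) { return 8; }
};

int main() {
  GpuChunkedAllocator alloc(0);
  std::cout << "constructed\n";
}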
paddle/fluid/memory/allocation/allocator_facade_test.cc

@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>

 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
 #endif

 namespace paddle {
 namespace memory {
 namespace allocation {

 TEST(allocator, allocator) {
 #ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
 #endif

   auto& instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;

   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }

 #ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
 #endif
 }

 }  // namespace allocation
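The updated test repeats the same four checks (non-null allocation, non-null ptr(), matching place(), and size() at least as large as requested) once per place. A hypothetical helper that would consolidate the pattern, assuming only the AllocatorFacade API exercised above; CheckAllocation is illustrative, not part of the commit:

#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"

namespace paddle {
namespace memory {
namespace allocation {

// Illustrative only: factors out the assertion pattern used in this test.
template <typename PlaceT>
void CheckAllocation(AllocatorFacade* instance, const PlaceT& place,
                     size_t size) {
  auto allocation = instance->Alloc(place, size);
  ASSERT_NE(allocation, nullptr);
  ASSERT_NE(allocation->ptr(), nullptr);
  ASSERT_EQ(allocation->place(), platform::Place(place));
  // Chunked allocators may round sizes up, so require only a lower bound.
  ASSERT_GE(allocation->size(), size);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle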
paddle/fluid/memory/allocation/auto_increment_allocator.h

@@ -17,6 +17,7 @@
 #include <atomic>      // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>       // NOLINT
 #include <thread>      // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
paddle/fluid/memory/allocation/locked_allocator.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT

 namespace paddle {
 namespace memory {
paddle/fluid/memory/allocation/locked_allocator.h

@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+#include <mutex>   // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
paddle/fluid/memory/allocation/pinned_allocator.cc

@@ -22,9 +22,9 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // PADDLE_ENFORCE_EQ(
+  //     attr, kCrossDevice,
+  //     "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
paddle/fluid/memory/allocation/pinned_allocator.h

@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };

 class CPUPinnedAllocator : public UnmanagedAllocator {
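This one-line change is the core of the fix: a cudaMallocHost'd buffer now reports platform::CUDAPinnedPlace() instead of platform::CPUPlace(), so anything that dispatches on an allocation's place treats it as page-locked host memory rather than ordinary pageable memory. A toy illustration of why the reported place matters, with self-contained stand-ins rather than Paddle types:

#include <cstddef>
#include <iostream>
#include <string>

// Toy stand-ins for platform::CPUPlace / platform::CUDAPinnedPlace.
enum class Place { kCPU, kCUDAPinned };

struct Allocation {
  void* ptr;
  size_t size;
  Place place;  // what this allocation reports about itself
};

// Copy routing keyed on the reported place: a pinned buffer can feed
// async DMA transfers, a pageable one cannot.
std::string PickCopyPath(const Allocation& a) {
  return a.place == Place::kCUDAPinned ? "cudaMemcpyAsync (DMA, pinned host)"
                                       : "cudaMemcpy (staged, pageable host)";
}

int main() {
  Allocation pinned{nullptr, 1024, Place::kCUDAPinned};
  Allocation pageable{nullptr, 1024, Place::kCPU};
  // Before this fix, a cudaMallocHost'd buffer reported kCPU and would be
  // routed down the pageable path.
  std::cout << PickCopyPath(pinned) << "\n" << PickCopyPath(pageable) << "\n";
}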
paddle/fluid/memory/detail/system_allocator.cc

@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the amount
-// of memory available to the system for paging. So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);

 namespace paddle {
 namespace memory {
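The flag definition moves to cpu_info.cc (see below) because a gflags flag may be defined in exactly one translation unit; every other user keeps only a declaration. A minimal sketch of the DEFINE/DECLARE split, assuming gflags, with both halves shown in one file for brevity:

#include <gflags/gflags.h>

// In exactly one translation unit (after this commit: cpu_info.cc):
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");

// In every other user (after this commit: system_allocator.cc) only a
// declaration, which resolves to the single definition at link time:
DECLARE_bool(use_pinned_memory);

bool ShouldLockPages() { return FLAGS_use_pinned_memory; }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  return ShouldLockPages() ? 0 : 1;
}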
paddle/fluid/memory/malloc.cc

@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
 }

 #ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;
@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif

 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;
@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   return ba;
 }
+#endif

 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);
@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
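The reorganization compiles each place-specific specialization in every build mode and moves the PADDLE_WITH_CUDA guard inside the function body, so a CPU-only build still links Used/Alloc/Free for CUDA places but fails at runtime with a clear message instead of at link time. A self-contained sketch of the idiom, with std::runtime_error standing in for PADDLE_THROW:

#include <cstddef>
#include <stdexcept>

struct CUDAPinnedPlace {};

#ifdef PADDLE_WITH_CUDA
void* AllocPinned(size_t size);  // would wrap cudaMallocHost
#endif

template <typename Place>
void* Alloc(const Place& place, size_t size);

// The specialization is always compiled and linked; whether the place is
// actually usable is decided inside the body, mirroring malloc.cc above.
template <>
void* Alloc<CUDAPinnedPlace>(const CUDAPinnedPlace& /*place*/, size_t size) {
#ifdef PADDLE_WITH_CUDA
  return AllocPinned(size);
#else
  (void)size;
  throw std::runtime_error(
      "'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}

int main() {
  try {
    Alloc(CUDAPinnedPlace{}, 1024);
  } catch (const std::exception&) {
    // A CPU-only build lands here at runtime instead of failing to link.
  }
  return 0;
}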
paddle/fluid/memory/memcpy.cc

@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 }

 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
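The new kMaxGpuAsyncCopyBytes constant makes the synchronous fallback path additionally synchronize the legacy default stream for transfers of at most 64 KiB; the FIXME records doubt about whether this is required. A sketch of the thresholded copy using the CUDA runtime API (requires a CUDA toolchain); CopyDeviceToHost is a simplified stand-in for the Copy specializations above:

#include <cstddef>
#include <cuda_runtime.h>

static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K

// Device-to-host copy: async when the caller provides a stream, otherwise a
// blocking memcpy plus, for small transfers, a default-stream synchronize,
// mirroring the branch added in memcpy.cc.
void CopyDeviceToHost(void* dst, const void* src, size_t num,
                      cudaStream_t stream) {
  if (stream) {
    cudaMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
  } else {
    cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
    if (num <= kMaxGpuAsyncCopyBytes) {
      cudaStreamSynchronize(0);  // FIXME in the original: is this needed?
    }
  }
}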
paddle/fluid/platform/cpu_info.cc

@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the amount
+// of memory available to the system for paging. So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {

-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
paddle/fluid/platform/cpu_info.h

@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
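CpuTotalPhysicalMemory() loses its inline specifier and gains this header declaration because CUDAPinnedManagedAllocator now calls it from another translation unit; an inline function defined only inside one .cc typically leaves no definition visible where it is odr-used elsewhere, which usually produces an undefined reference at link time. A minimal illustration with hypothetical names (TotalPhysicalMemory is not a Paddle symbol):

// my_info.h -- hypothetical header mirroring cpu_info.h after this commit:
// #pragma once
// #include <cstddef>
// size_t TotalPhysicalMemory();  // plain declaration: external linkage

// my_info.cc -- the definition, deliberately *without* `inline`. Keeping it
// `inline` here, with no definition visible to other translation units,
// would typically fail to link for callers in other .cc files.
#include <cstddef>
size_t TotalPhysicalMemory() {
  return static_cast<size_t>(16) << 30;  // placeholder value for the sketch
}

// other_user.cc -- e.g. a pinned allocator computing its chunk capacity:
// size_t PinnedCapacity(size_t max_chunk) {
//   return max_chunk == 0 ? 0 : TotalPhysicalMemory() / max_chunk;
// }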
paddle/fluid/platform/device_context.cc

@@ -13,11 +13,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>

-#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

 namespace paddle {
paddle/fluid/platform/init.cc

@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
paddle/fluid/pybind/tensor_py.h

@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 #ifdef PADDLE_WITH_CUDA
         auto* src_ptr = static_cast<const void*>(tensor.data<CUR_TYPE>());
         auto* dst_ptr = static_cast<void*>(dst_tensor.mutable_data<CUR_TYPE>(
-            tensor.dims(), platform::CPUPlace(),
-            memory::Allocator::kCrossDevice));
+            tensor.dims(), platform::CPUPlace()));

         paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                         sizeof(CUR_TYPE) * tensor.numel(),
python/paddle/fluid/__init__.py

@@ -110,10 +110,10 @@ def __bootstrap__():
         os.environ['OMP_NUM_THREADS'] = str(num_threads)

     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')