Commit 2002e71d: fix pinned allocator

Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Author: sneaxiy
Authored: Oct 19, 2018
Parent: 21fdf8e8

Showing 18 changed files with 184 additions and 70 deletions (+184 -70)
Changed files:

  paddle/fluid/framework/tensor_util.cc                      +1   -2
  paddle/fluid/memory/allocation/CMakeLists.txt              +6   -4
  paddle/fluid/memory/allocation/allocator_facade.cc         +75  -38
  paddle/fluid/memory/allocation/allocator_facade_test.cc    +39  -6
  paddle/fluid/memory/allocation/auto_increment_allocator.h  +1   -0
  paddle/fluid/memory/allocation/locked_allocator.cc         +1   -0
  paddle/fluid/memory/allocation/locked_allocator.h          +1   -0
  paddle/fluid/memory/allocation/pinned_allocator.cc         +3   -3
  paddle/fluid/memory/allocation/pinned_allocator.h          +1   -1
  paddle/fluid/memory/detail/system_allocator.cc             +1   -6
  paddle/fluid/memory/malloc.cc                              +27  -2
  paddle/fluid/memory/memcpy.cc                              +10  -0
  paddle/fluid/platform/cpu_info.cc                          +8   -1
  paddle/fluid/platform/cpu_info.h                           +2   -0
  paddle/fluid/platform/device_context.cc                    +1   -1
  paddle/fluid/platform/init.cc                              +2   -0
  paddle/fluid/pybind/tensor_py.h                            +1   -2
  python/paddle/fluid/__init__.py                            +4   -4
paddle/fluid/framework/tensor_util.cc

@@ -112,8 +112,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   dst->set_layout(src.layout());
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type(),
-                                   memory::Allocator::kCrossDevice);
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -2,7 +2,10 @@ cc_library(allocator SRCS allocator.cc DEPS place)
 cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
-nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+if (WITH_GPU)
+  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
+endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

@@ -29,7 +32,7 @@ cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocato
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
+  set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
 else ()
   set(AllocatorFacadeDeps)
 endif()

@@ -48,8 +51,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
         auto_increment_allocator
         zero_size_allocator
         conditional_allocator
-        retry_allocator
-        cuda_device_guard)
+        retry_allocator)
 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -25,17 +25,18 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
-#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #endif

-DEFINE_int32(
+DEFINE_int64(
     gpu_allocator_retry_time, 0,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");

@@ -49,51 +50,34 @@ class CPUManagedAllocator : public ManagedAllocator {
  public:
   CPUManagedAllocator()
       : normal_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUAllocator()))),
-        communication_allocator_(NaiveManagedAllocator::Create(
-            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+            std::unique_ptr<Allocator>(new CPUAllocator()))) {}

   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->Allocate(size, attr);
-    } else {
-      return normal_allocator_->Allocate(size, attr);
-    }
+    return normal_allocator_->Allocate(size, attr);
   }

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
-    if (attr == kCrossDevice) {
-      return communication_allocator_->AllocateShared(size, attr);
-    } else {
-      return normal_allocator_->AllocateShared(size, attr);
-    }
+    return normal_allocator_->AllocateShared(size, attr);
   }

   bool IsAllocThreadSafe() const override { return true; }

  private:
   std::shared_ptr<ManagedAllocator> normal_allocator_;
-  std::shared_ptr<ManagedAllocator> communication_allocator_;
 };

-#ifdef PADDLE_WITH_CUDA
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
-class CUDAManagedAllocator : public ManagedAllocator {
+class ChunkedManagedAllocator : public ManagedAllocator {
  public:
-  explicit CUDAManagedAllocator(int dev_id) {
-    platform::CUDADeviceGuard guard(dev_id);
-    max_chunk_size_ = platform::GpuMaxChunkSize();
-    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-        new CUDAAllocator(platform::CUDAPlace(dev_id))));
+  explicit ChunkedManagedAllocator(std::unique_ptr<Allocator> system_allocator,
+                                   size_t max_chunk_size, size_t capacity = 1,
+                                   int64_t retry_time = -1)
+      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
+    raw_allocator_ = NaiveManagedAllocator::Create(std::move(system_allocator));

     if (max_chunk_size_ == 0) {
       default_allocator_ = raw_allocator_;
     } else {
-      size_t available, total;
-      platform::GpuMemoryUsage(&available, &total);
-      size_t capacity = available / max_chunk_size_;
       if (capacity == 1) {
         VLOG(10) << "Create BestFitAllocator with chunk_size "
                  << max_chunk_size_;

@@ -119,7 +103,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
       default_allocator_.reset(cond_allocator);
     }
   }

-  ~CUDAManagedAllocator() {
+  ~ChunkedManagedAllocator() {
     // Specify destruct order.
     default_allocator_.reset();
     chunks_.clear();

@@ -140,27 +124,71 @@ class CUDAManagedAllocator : public ManagedAllocator {
     std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
         std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));

-    if (FLAGS_gpu_allocator_retry_time <= 0) {
+    if (retry_time_ <= 0) {
       VLOG(10) << "Create NaiveManagedAllocator without retry";
       return std::make_shared<AlignedAllocator<64u>>(
           NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
     } else {
-      VLOG(10) << "Create RetryAllocator with retry_time "
-               << FLAGS_gpu_allocator_retry_time << "ms";
+      VLOG(10) << "Create RetryAllocator with retry_time " << retry_time_
+               << "ms";
       return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
-          std::move(unmanaged_allocator),
-          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+          std::move(unmanaged_allocator), static_cast<size_t>(retry_time_)));
     }
   }

   bool IsAllocThreadSafe() const override { return true; }

- private:
+ protected:
   size_t max_chunk_size_;
+  int64_t retry_time_;
   std::vector<std::unique_ptr<Allocation>> chunks_;
   std::shared_ptr<ManagedAllocator> raw_allocator_;
   std::shared_ptr<ManagedAllocator> default_allocator_;
 };

+#ifdef PADDLE_WITH_CUDA
+class CUDAManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  explicit CUDAManagedAllocator(int dev_id)
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(
+                new CUDAAllocator(platform::CUDAPlace(dev_id))),
+            GetMaxChunkSize(dev_id), GetCapcity(dev_id), GetRetryTime()) {}
+
+ private:
+  static size_t GetMaxChunkSize(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    return platform::GpuMaxChunkSize();
+  }
+
+  static size_t GetCapcity(int dev_id) {
+    platform::CUDADeviceGuard guard(dev_id);
+    size_t available, total;
+    platform::GpuMemoryUsage(&available, &total);
+    size_t max_chunk_size = platform::GpuMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
+  }
+
+  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
+};
+
+class CUDAPinnedManagedAllocator : public ChunkedManagedAllocator {
+ public:
+  CUDAPinnedManagedAllocator()
+      : ChunkedManagedAllocator(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
+            platform::CUDAPinnedMaxChunkSize(), GetCapacity(), -1) {
+  }  // never retry
+
+ private:
+  static size_t GetCapacity() {
+    size_t total = platform::CpuTotalPhysicalMemory();
+    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
+    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
+  }
+};
+#endif
+
 class AllocatorFacadePrivate {

@@ -173,6 +201,7 @@ class AllocatorFacadePrivate {
   AllocatorFacadePrivate() {
     InitCPUAllocator();
     InitCUDAAllocator();
+    InitCUDAPinnedAllocator();
     WrapZeroSizeAllocator();
   }

@@ -183,13 +212,21 @@ class AllocatorFacadePrivate {
   void InitCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
-    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
       allocators_[platform::CUDAPlace(dev_id)] =
           std::make_shared<CUDAManagedAllocator>(dev_id);
     }
 #endif
   }

+  void InitCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<CUDAPinnedManagedAllocator>();
+#endif
+  }
+
   void WrapZeroSizeAllocator() {
     for (auto& pair : allocators_) {
       pair.second =
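The core of this refactor is that ChunkedManagedAllocator now owns the chunking policy, while CUDAManagedAllocator and CUDAPinnedManagedAllocator merely supply a system allocator, a chunk size, a capacity, and a retry time. A minimal sketch of the capacity rule those constructors apply follows; it is illustrative only, and Strategy and ChooseStrategy are hypothetical names, not Paddle types.

// Illustrative sketch, not Paddle's actual code: max_chunk_size == 0 means
// "serve from the raw system allocator"; otherwise the chunk capacity decides
// whether one pre-sized best-fit chunk suffices (capacity == 1, per the VLOG
// message in the diff) or chunks must be created on demand.
#include <cstddef>
#include <cstdio>

enum class Strategy { kRaw, kSingleBestFitChunk, kGrowingChunks };

Strategy ChooseStrategy(size_t max_chunk_size, size_t capacity) {
  if (max_chunk_size == 0) return Strategy::kRaw;
  return capacity == 1 ? Strategy::kSingleBestFitChunk
                       : Strategy::kGrowingChunks;
}

int main() {
  // CUDAPinnedManagedAllocator computes capacity = total_host_mem / chunk_size
  // (hypothetical numbers below) and passes retry_time = -1: never retry.
  size_t total = 64ull << 30;  // pretend 64 GiB of host RAM
  size_t chunk = 4ull << 30;   // pretend 4 GiB pinned chunks
  size_t capacity = chunk == 0 ? 0 : total / chunk;
  std::printf("capacity=%zu strategy=%d\n", capacity,
              static_cast<int>(ChooseStrategy(chunk, capacity)));
  return 0;
}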
paddle/fluid/memory/allocation/allocator_facade_test.cc

@@ -16,37 +16,70 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>

 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_int32(gpu_allocator_retry_time);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_int64(gpu_allocator_retry_time);
 #endif

 namespace paddle {
 namespace memory {
 namespace allocation {

 TEST(allocator, allocator) {
 #ifdef PADDLE_WITH_CUDA
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
   FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
 #endif

   auto& instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;

   {
-    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
   }

 #ifdef PADDLE_WITH_CUDA
   {
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

   {
     // Allocate 2GB gpu memory
-    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
-                                         2 * static_cast<size_t>(1 << 30));
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
     ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
   }

-  {}
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
 #endif
 }

 }  // namespace allocation
paddle/fluid/memory/allocation/auto_increment_allocator.h

@@ -17,6 +17,7 @@
 #include <atomic>      // NOLINT
 #include <functional>
 #include <memory>
+#include <mutex>   // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
paddle/fluid/memory/allocation/locked_allocator.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#include <mutex>  // NOLINT

 namespace paddle {
 namespace memory {
paddle/fluid/memory/allocation/locked_allocator.h

@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
 #include <memory>
+#include <mutex>   // NOLINT
 #include <thread>  // NOLINT
 #include "paddle/fluid/memory/allocation/allocator.h"
paddle/fluid/memory/allocation/pinned_allocator.cc

@@ -22,9 +22,9 @@ namespace allocation {
 std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
                                                          Allocator::Attr attr) {
-  PADDLE_ENFORCE_EQ(
-      attr, kCrossDevice,
-      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  //  PADDLE_ENFORCE_EQ(
+  //      attr, kCrossDevice,
+  //      "CPUPinnedAllocator should be used for Cross-Device Communication");
   void* ptr;
   PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
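For context, cudaMallocHost, the call CPUPinnedAllocator wraps above, returns page-locked host memory that the GPU can DMA to and from directly. A minimal standalone sketch, with plain error handling in place of PADDLE_ENFORCE:

// Minimal sketch of pinned host allocation, assuming a CUDA toolchain.
// CPUPinnedAllocator wraps this same cudaMallocHost/cudaFreeHost pair.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  void* ptr = nullptr;
  const size_t size = 1 << 20;  // 1 MiB staging buffer
  if (cudaMallocHost(&ptr, size) != cudaSuccess) {  // page-locked allocation
    std::fprintf(stderr, "cudaMallocHost failed\n");
    return 1;
  }
  // ... stage data here and hand it to cudaMemcpyAsync ...
  cudaFreeHost(ptr);  // must pair with cudaMallocHost, not free()
  return 0;
}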
paddle/fluid/memory/allocation/pinned_allocator.h

@@ -23,7 +23,7 @@ namespace allocation {
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void* ptr, size_t size)
-      : Allocation(ptr, size, platform::CPUPlace()) {}
+      : Allocation(ptr, size, platform::CUDAPinnedPlace()) {}
 };

 class CPUPinnedAllocator : public UnmanagedAllocator {
paddle/fluid/memory/detail/system_allocator.cc

@@ -30,12 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"

-// If use_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the amount
-// of memory available to the system for paging.  So, by default, we
-// should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);

 namespace paddle {
 namespace memory {
paddle/fluid/memory/malloc.cc

@@ -98,7 +98,6 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace& place) {
 }

 #ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator** a_arr = nullptr;

@@ -128,15 +127,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   platform::SetDeviceId(gpu_id);
   return a_arr[gpu_id];
 }
+#endif

 template <>
 size_t Used<platform::CUDAPlace>(const platform::CUDAPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetGPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
                                  size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {

@@ -156,13 +161,21 @@ void* Alloc<platform::CUDAPlace>(const platform::CUDAPlace& place,
     cudaMemset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPlace>(const platform::CUDAPlace& place, void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
 }

+#ifdef PADDLE_WITH_CUDA
 BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator* ba = nullptr;

@@ -176,15 +189,21 @@ BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
   return ba;
 }
+#endif

 template <>
 size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place) {
+#ifdef PADDLE_WITH_CUDA
   return GetCUDAPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                        size_t size) {
+#ifdef PADDLE_WITH_CUDA
   auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void* ptr = buddy_allocator->Alloc(size);

@@ -196,14 +215,20 @@ void* Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
     memset(ptr, 0xEF, size);
   }
   return ptr;
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace& place,
                                      void* p) {
+#ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
 }

 struct AllocVisitor : public boost::static_visitor<void*> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
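The buddy-allocator getters above rely on a lazily initialized singleton: the visible lines declare a std::once_flag, so initialization presumably goes through std::call_once in the elided lines. A stripped-down sketch of that pattern, with BuddyAllocatorStub as a hypothetical stand-in for detail::BuddyAllocator:

// Sketch of the lazy-initialization pattern used by GetCUDAPinnedBuddyAllocator:
// std::call_once guarantees the allocator is constructed exactly once even
// when several threads make the first call concurrently.
#include <mutex>

struct BuddyAllocatorStub {};

BuddyAllocatorStub* GetBuddyAllocatorSketch() {
  static std::once_flag init_flag;
  static BuddyAllocatorStub* ba = nullptr;
  std::call_once(init_flag, [] { ba = new BuddyAllocatorStub(); });
  return ba;
}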
paddle/fluid/memory/memcpy.cc

@@ -27,6 +27,8 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 }

 #ifdef PADDLE_WITH_CUDA
+static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
+
 template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,

@@ -36,6 +38,10 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }

@@ -48,6 +54,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+    // FIXME(zjl): do we really need it?
+    if (num <= kMaxGpuAsyncCopyBytes) {
+      cudaStreamSynchronize(0);
+    }
   }
 }
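The memcpy.cc change reads as a dispatch rule: with a stream, copy asynchronously; without one, copy synchronously and, for transfers up to kMaxGpuAsyncCopyBytes, also synchronize the legacy default stream (the FIXME in the diff questions whether that last step is needed). A hedged sketch follows, using raw cudaMemcpy/cudaMemcpyAsync on the assumption that Paddle's GpuMemcpySync/GpuMemcpyAsync wrappers behave similarly:

// Sketch of the dispatch implemented above, assuming a CUDA toolchain.
#include <cuda_runtime.h>
#include <cstddef>

static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K, as in the diff

void CopyDeviceToHost(void* dst, const void* src, size_t num,
                      cudaStream_t stream) {
  if (stream) {
    cudaMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
  } else {
    cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
    if (num <= kMaxGpuAsyncCopyBytes) {
      cudaStreamSynchronize(0);  // mirrors the FIXME(zjl) guard above
    }
  }
}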
paddle/fluid/platform/cpu_info.cc

@@ -56,10 +56,17 @@ DEFINE_double(
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");

+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the amount
+// of memory available to the system for paging.  So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+
 namespace paddle {
 namespace platform {

-inline size_t CpuTotalPhysicalMemory() {
+size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
   int mib[2];
   mib[0] = CTL_HW;
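CpuTotalPhysicalMemory(), now exported so CUDAPinnedManagedAllocator can size its chunk capacity, queries the OS for installed RAM. The diff shows only the start of the __APPLE__ branch; the sketch below fills in a typical implementation (sysctl on macOS, sysconf elsewhere) as an assumption, not Paddle's exact code:

// Hedged sketch of a total-physical-memory query in the spirit of
// CpuTotalPhysicalMemory(); the non-Apple branch is an assumption since the
// diff elides it.
#include <cstddef>
#include <cstdint>
#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/types.h>
#else
#include <unistd.h>
#endif

size_t TotalPhysicalMemorySketch() {
#ifdef __APPLE__
  int mib[2] = {CTL_HW, HW_MEMSIZE};  // the diff's branch starts this way
  uint64_t size = 0;
  size_t len = sizeof(size);
  if (sysctl(mib, 2, &size, &len, nullptr, 0) == 0) {
    return static_cast<size_t>(size);
  }
  return 0;
#else
  long pages = sysconf(_SC_PHYS_PAGES);
  long page_size = sysconf(_SC_PAGE_SIZE);
  if (pages > 0 && page_size > 0) {
    return static_cast<size_t>(pages) * static_cast<size_t>(page_size);
  }
  return 0;
#endif
}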
paddle/fluid/platform/cpu_info.h

@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+size_t CpuTotalPhysicalMemory();
+
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
paddle/fluid/platform/device_context.cc

@@ -13,11 +13,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>

-#include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

 namespace paddle {
paddle/fluid/platform/init.cc

@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
paddle/fluid/pybind/tensor_py.h

@@ -63,8 +63,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 #ifdef PADDLE_WITH_CUDA
         auto* src_ptr = static_cast<const void*>(tensor.data<CUR_TYPE>());
         auto* dst_ptr = static_cast<void*>(dst_tensor.mutable_data<CUR_TYPE>(
-            tensor.dims(), platform::CPUPlace(),
-            memory::Allocator::kCrossDevice));
+            tensor.dims(), platform::CPUPlace()));

         paddle::platform::GpuMemcpySync(
             dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
python/paddle/fluid/__init__.py

@@ -110,10 +110,10 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)

     read_env_flags = [
-        'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
-        'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
-        'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
-        'eager_delete_tensor_gb', 'use_legacy_allocator'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem', 'paddle_num_threads', "dist_threadpool_size",
+        'cpu_deterministic', 'eager_delete_tensor_gb', 'use_legacy_allocator'
     ]

     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')