Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
84b0ecdc
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
84b0ecdc
编写于
1月 14, 2019
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into fuse/second_order_mul_sub
test=develop
上级
7035f051
e33427da
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
201 addition
and
99 deletion
+201
-99
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+1
-1
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+2
-2
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+4
-5
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+85
-64
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+1
-1
paddle/fluid/platform/temporary_allocator.cc
paddle/fluid/platform/temporary_allocator.cc
+49
-14
paddle/fluid/platform/temporary_allocator.h
paddle/fluid/platform/temporary_allocator.h
+5
-5
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+52
-6
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+2
-1
未找到文件。
paddle/fluid/framework/operator.h
浏览文件 @
84b0ecdc
...
...
@@ -391,7 +391,7 @@ class ExecutionContext {
PADDLE_ENFORCE
(
dynamic_cast
<
platform
::
TemporaryAllocation
*>
(
allocation_ptr
)
!=
nullptr
,
"The AllocationPtr must be TemporaryAllocation."
);
PADDLE_ENFORCE_
EQ
(
allocation_ptr
->
size
(),
PADDLE_ENFORCE_
GE
(
allocation_ptr
->
size
(),
framework
::
product
(
dim
)
*
sizeof
(
T
));
paddle
::
framework
::
Tensor
temp_tensor
(
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
84b0ecdc
...
...
@@ -100,14 +100,14 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if
(
NOT EXISTS
${
OCR_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
endif
()
inference_analysis_api_test_with_refer_result
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
inference_analysis_api_test_with_refer_result
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
SERIAL
)
# mobilenet with transpose op
set
(
MOBILENET_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/mobilenet"
)
if
(
NOT EXISTS
${
MOBILENET_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
MOBILENET_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Fmobilenet.tar.gz"
)
endif
()
inference_analysis_api_test_with_refer_result
(
test_analyzer_mobilenet_transpose
${
MOBILENET_INSTALL_DIR
}
analyzer_vis_tester.cc
)
inference_analysis_api_test_with_refer_result
(
test_analyzer_mobilenet_transpose
${
MOBILENET_INSTALL_DIR
}
analyzer_vis_tester.cc
SERIAL
)
# resnet50
inference_analysis_api_test_with_fake_data
(
test_analyzer_resnet50
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
84b0ecdc
...
...
@@ -313,13 +313,12 @@ void CompareDeterministic(
int
num_times
=
FLAGS_repeat
;
auto
predictor
=
CreateTestPredictor
(
config
,
FLAGS_use_analysis
);
// warmup run
std
::
vector
<
PaddleTensor
>
warmup_outputs
,
outputs
;
predictor
->
Run
(
inputs
[
0
],
&
warmup_outputs
,
batch_size
);
// run num_times to Compare Deterministic Result.
for
(
int
i
=
0
;
i
<
num_times
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
inputs
.
size
();
j
++
)
{
for
(
size_t
j
=
0
;
j
<
inputs
.
size
();
j
++
)
{
// warmup run
predictor
->
Run
(
inputs
[
j
],
&
warmup_outputs
,
batch_size
);
for
(
int
i
=
0
;
i
<
num_times
;
i
++
)
{
predictor
->
Run
(
inputs
[
j
],
&
outputs
,
batch_size
);
CompareResult
(
outputs
,
warmup_outputs
);
}
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
84b0ecdc
...
...
@@ -137,7 +137,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t
algo
;
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
bool
half_float
=
false
;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
...
...
@@ -158,6 +157,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
VLOG
(
5
)
<<
"NOT use cudnn_tensor_op_math"
;
}
#endif
Tensor
cudnn_workspace
;
void
*
cudnn_workspace_ptr
=
nullptr
;
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
...
...
@@ -180,21 +181,26 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_limit
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
...
...
@@ -219,17 +225,23 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
// Allocate on GPU memory
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_in_bytes
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_output_desc
,
output_data
+
i
*
group_offset_out
));
}
}
};
...
...
@@ -353,10 +365,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_limit
=
max_user_size
*
1024
*
1024
;
}
Tensor
cudnn_workspace
;
void
*
cudnn_workspace_ptr
=
nullptr
;
if
((
input_data
||
filter_data
)
&&
exhaustive_search
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_limit
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
...
...
@@ -374,25 +396,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
data_algo
=
data_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
kNUM_CUDNN_BWD_DATA_ALGS
>
data_perf_stat
;
auto
cudnn_find_bd_data_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
handle
,
cudnn_filter_desc
,
filter_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_grad_data
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
data_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_data_func
,
workspace_size_limit
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
handle
,
cudnn_filter_desc
,
filter_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_grad_data
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
data_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
...
...
@@ -443,25 +462,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
kNUM_CUDNN_BWD_FILTER_ALGS
>
filter_perf_stat
;
auto
cudnn_find_bd_f_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_grad_data
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
filter_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_f_func
,
workspace_size_limit
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_grad_data
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
filter_perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
return
filter_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"cuDNN backward filter algo "
<<
filter_algo
;
...
...
@@ -482,6 +499,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
// ------------------- cudnn conv workspace ---------------------
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
workspace_size_in_bytes
)}),
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
if
(
input_grad
)
{
...
...
@@ -489,15 +516,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_input_desc
,
input_grad_data
+
i
*
group_offset_in
));
}
}
// ------------------- cudnn conv backward filter ---------------------
...
...
@@ -505,15 +529,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset filter_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
};
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size_in_bytes
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace_ptr
,
workspace_size_in_bytes
,
&
beta
,
cudnn_filter_desc
,
filter_grad_data
+
i
*
group_offset_filter
));
}
}
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
84b0ecdc
...
...
@@ -61,7 +61,7 @@ namespace platform {
* the allocations of temp_allocation_queue:
* - when the Stream calls cudaStreamSynchronize;
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_t
emporary
_allocation).
* (defined by FLAGS_limit_of_t
mp
_allocation).
*
* */
class
DeviceTemporaryAllocator
{
...
...
paddle/fluid/platform/temporary_allocator.cc
浏览文件 @
84b0ecdc
...
...
@@ -15,8 +15,15 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_double
(
limit_of_temporary_allocation
,
-
1
,
"The up limit of temporary_allocation size."
);
DEFINE_int64
(
limit_of_tmp_allocation
,
-
1
,
"The up limit of temporary_allocation size."
);
DEFINE_double
(
times_excess_than_required_tmp_allocation
,
2
,
"times_excess_than_required_tmp_allocation indicates the "
"max size the TemporaryAllocator can return. For example, "
"if the required memory size is N, and "
"times_excess_than_required_tmp_allocation is 2.0, "
"the TemporaryAllocator will return the available allocation "
"that the range of size is N ~ 2*N."
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -29,24 +36,25 @@ TemporaryAllocation::TemporaryAllocation(
underlying_allocation_
(
std
::
move
(
underlying_allocation
))
{}
TemporaryAllocator
::
TemporaryAllocator
(
platform
::
Place
place
)
:
place_
(
place
)
{
temp_mem_
queue_
.
reset
(
new
std
::
deque
<
TemporaryAllocation
*>
());
temp_mem_
map_
.
reset
(
new
std
::
multimap
<
size_t
,
TemporaryAllocation
*>
());
}
bool
TemporaryAllocator
::
IsAllocThreadSafe
()
const
{
return
true
;
}
void
TemporaryAllocator
::
Release
(
const
std
::
function
<
void
()
>
&
callback
)
{
std
::
shared_ptr
<
std
::
deque
<
TemporaryAllocation
*>>
t_allocations
;
std
::
unique_ptr
<
std
::
multimap
<
size_t
,
TemporaryAllocation
*>>
t_allocations
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
callback
();
t_allocations
=
temp_mem_queue_
;
temp_mem_
queue_
.
reset
(
new
std
::
deque
<
TemporaryAllocation
*>
());
t_allocations
.
swap
(
temp_mem_map_
)
;
temp_mem_
map_
.
reset
(
new
std
::
multimap
<
size_t
,
TemporaryAllocation
*>
());
wait_delete_mem_
=
0
;
}
for
(
auto
tmp
:
*
t_allocations
)
{
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
tmp
->
ptr
()
<<
" size: "
<<
tmp
->
size
();
delete
tmp
;
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
tmp
.
second
->
ptr
()
<<
" size: "
<<
tmp
.
second
->
size
();
delete
tmp
.
second
;
}
}
...
...
@@ -54,28 +62,34 @@ void TemporaryAllocator::Free(alloc::Allocation *allocation) {
auto
*
temp_allocation
=
dynamic_cast
<
TemporaryAllocation
*>
(
allocation
);
PADDLE_ENFORCE_NOT_NULL
(
temp_allocation
);
if
(
platform
::
is_gpu_place
(
temp_allocation
->
place
()))
{
PADDLE_ENFORCE
(
platform
::
is_same_place
(
temp_allocation
->
place
(),
place_
),
"The place should be the same."
);
size_t
wait_delete_mem
=
0
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
temp_mem_
queue_
->
emplace_back
(
temp_allocation
);
temp_mem_
map_
->
emplace
(
temp_allocation
->
size
(),
temp_allocation
);
wait_delete_mem_
+=
temp_allocation
->
size
();
wait_delete_mem
=
wait_delete_mem_
;
VLOG
(
10
)
<<
"Move temporary allocation: "
<<
temp_allocation
->
ptr
()
<<
" to delete queue: "
<<
temp_allocation
->
size
()
<<
"; "
<<
"wait_delete_mem: "
<<
wait_delete_mem
_
;
<<
"wait_delete_mem: "
<<
wait_delete_mem
;
}
if
(
FLAGS_limit_of_temporary_allocation
>
0
&&
wait_delete_mem
>
FLAGS_limit_of_temporary_allocation
)
{
if
(
FLAGS_limit_of_tmp_allocation
>
0
&&
wait_delete_mem
>
static_cast
<
size_t
>
(
FLAGS_limit_of_tmp_allocation
))
{
PADDLE_ENFORCE
(
callback_
!=
nullptr
,
"The callback is non-initialized."
);
Release
(
callback_
);
}
return
;
}
VLOG
(
10
)
<<
"Delete temporary allocation "
<<
temp_allocation
->
ptr
()
<<
" size: "
<<
temp_allocation
->
size
();
delete
temp_allocation
;
}
size_t
TemporaryAllocator
::
TemporaryAllocationQueueSize
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
return
temp_mem_
queue_
?
temp_mem_queue
_
->
size
()
:
0
;
return
temp_mem_
map_
?
temp_mem_map
_
->
size
()
:
0
;
}
void
TemporaryAllocator
::
SetCallback
(
const
std
::
function
<
void
()
>
&
callback
)
{
...
...
@@ -84,6 +98,27 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) {
alloc
::
Allocation
*
TemporaryAllocator
::
AllocateImpl
(
size_t
size
,
alloc
::
Allocator
::
Attr
attr
)
{
{
// Find available allocation in temp_mem_map.
std
::
unique_lock
<
std
::
mutex
>
lock
(
mtx_
);
if
(
temp_mem_map_
->
size
())
{
auto
it
=
temp_mem_map_
->
lower_bound
(
size
);
// FIXME(zcd): Not sure the best value of excess fraction.
if
(
it
!=
temp_mem_map_
->
end
()
&&
it
->
first
<
static_cast
<
size_t
>
(
size
*
FLAGS_times_excess_than_required_tmp_allocation
))
{
auto
tmp_ptr
=
it
->
second
;
temp_mem_map_
->
erase
(
it
);
wait_delete_mem_
-=
tmp_ptr
->
size
();
VLOG
(
10
)
<<
"Reuse temporary allocation: "
<<
tmp_ptr
->
ptr
()
<<
": "
<<
tmp_ptr
->
size
();
return
tmp_ptr
;
}
}
}
// If not find the the available allocation, get allocation from
// AllocatorFacadeInstance.
auto
raw_allocation
=
alloc
::
AllocatorFacade
::
Instance
().
Alloc
(
place_
,
size
,
attr
);
auto
temp_mem
=
new
TemporaryAllocation
(
std
::
move
(
raw_allocation
));
...
...
paddle/fluid/platform/temporary_allocator.h
浏览文件 @
84b0ecdc
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <condition_variable> // NOLINT
#include <deque>
#include <map>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
...
...
@@ -39,7 +40,7 @@ class TemporaryAllocation : public memory::allocation::Allocation {
*
* There is one opportunity to free the allocations of temp_allocation_queue:
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_t
emporary
_allocation).
* (defined by FLAGS_limit_of_t
mp
_allocation).
*
* */
class
TemporaryAllocator
:
public
memory
::
allocation
::
Allocator
{
...
...
@@ -62,11 +63,10 @@ class TemporaryAllocator : public memory::allocation::Allocator {
private:
platform
::
Place
place_
;
// When the allocation is not held by any variable, it should be placed
// to temp_mem_
queue
immediately.
std
::
shared_ptr
<
std
::
deque
<
TemporaryAllocation
*>>
temp_mem_queue_
{
nullptr
};
// to temp_mem_
map
immediately.
std
::
unique_ptr
<
std
::
multimap
<
size_t
,
TemporaryAllocation
*>>
temp_mem_map_
{
nullptr
};
std
::
mutex
mtx_
;
size_t
wait_delete_mem_
{
0
};
std
::
function
<
void
()
>
callback_
;
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
84b0ecdc
...
...
@@ -18,7 +18,8 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
DECLARE_double
(
limit_of_temporary_allocation
);
DECLARE_int64
(
limit_of_tmp_allocation
);
DECLARE_double
(
times_excess_than_required_tmp_allocation
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -35,7 +36,7 @@ class DummyOp : public framework::OperatorBase {
const
platform
::
Place
&
place
)
const
override
{}
};
TEST
(
temporary_allocator
,
te
mporary_allocator
)
{
TEST
(
temporary_allocator
,
te
st_base_function
)
{
platform
::
CPUPlace
cpu_place
;
TemporaryAllocator
alloc
(
cpu_place
);
alloc
.
Allocate
(
100
);
...
...
@@ -59,10 +60,10 @@ TEST(temporary_allocator, temporary_allocator) {
#endif
}
TEST
(
temporary_allocator
,
add_callback
)
{
TEST
(
temporary_allocator
,
test_flags_function
)
{
#ifdef PADDLE_WITH_CUDA
const
double
limit
=
FLAGS_limit_of_temporary
_allocation
;
FLAGS_limit_of_t
emporary
_allocation
=
10
;
const
int64_t
limit
=
FLAGS_limit_of_tmp
_allocation
;
FLAGS_limit_of_t
mp
_allocation
=
10
;
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
...
...
@@ -78,7 +79,52 @@ TEST(temporary_allocator, add_callback) {
});
{
gpu_alloc
.
Allocate
(
100
);
}
PADDLE_ENFORCE
(
deleted
);
FLAGS_limit_of_temporary_allocation
=
limit
;
FLAGS_limit_of_tmp_allocation
=
limit
;
#endif
}
TEST
(
temporary_allocator
,
test_reuse_tmp_allocation
)
{
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
gpu_alloc
.
SetCallback
([]()
{});
void
*
tmp_allocation_ptr1
=
nullptr
;
{
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
auto
tmp_allocation1
=
gpu_alloc
.
Allocate
(
100
);
tmp_allocation_ptr1
=
tmp_allocation1
->
ptr
();
}
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
auto
tmp_allocation2
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr2
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr2
);
auto
tmp_allocation3
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr3
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr3
);
#endif
}
TEST
(
temporary_allocator
,
test_times_excess_than_required_tmp_allocation
)
{
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
gpu_alloc
.
SetCallback
([]()
{});
double
excess_fraction
=
FLAGS_times_excess_than_required_tmp_allocation
;
void
*
tmp_allocation_ptr1
=
nullptr
;
{
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
auto
tmp_allocation1
=
gpu_alloc
.
Allocate
(
static_cast
<
size_t
>
(
100
*
excess_fraction
-
1
));
tmp_allocation_ptr1
=
tmp_allocation1
->
ptr
();
}
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
auto
tmp_allocation2
=
gpu_alloc
.
Allocate
(
100
);
void
*
tmp_allocation_ptr2
=
tmp_allocation2
->
ptr
();
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
PADDLE_ENFORCE_EQ
(
tmp_allocation_ptr1
,
tmp_allocation_ptr2
);
#endif
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
84b0ecdc
...
...
@@ -155,7 +155,8 @@ def __bootstrap__():
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
,
'sync_nccl_allreduce'
'sync_nccl_allreduce'
,
'limit_of_tmp_allocation'
,
'times_excess_than_required_tmp_allocation'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录