Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
SummerGao.
Paddle
提交
0a96ec69
P
Paddle
项目概览
SummerGao.
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
0a96ec69
编写于
6月 10, 2019
作者:
Z
Zeng Jinle
提交者:
GitHub
6月 10, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix conv v7 workspace size limit error, test=develop (#17902)
上级
4d5f6937
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
103 additions
and
35 deletions
+103
-35
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+103
-35
未找到文件。
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
0a96ec69
...
...
@@ -165,6 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// TODO(dangqingqing) simplify the following code by SearchAlgorithm in
// conv_cudnn_helper.h
bool
has_got_workspace_size
=
false
;
if
((
!
exhaustive_search
)
&&
(
!
half_float
))
{
#if CUDNN_VERSION >= 7001
using
perf_t
=
cudnnConvolutionFwdAlgoPerf_t
;
...
...
@@ -176,11 +177,29 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_output_desc
,
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
#else
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
// NOTE(zjl): cudnnGetConvolutionForwardAlgorithm_v7 cannot limit
// workspace size. If the workspace size found by v7 exceeds the limit,
// we should fall back to the non-v7 method to find another algorithm.
if
(
workspace_size_in_bytes
>
workspace_size_limit
)
{
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size_in_bytes
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
#endif
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#if CUDNN_VERSION >= 7001
}
else
{
has_got_workspace_size
=
true
;
}
#endif
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
...
...
@@ -219,10 +238,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
"cuDNN exhaustive search doesn't support half float."
);
}
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
if
(
!
has_got_workspace_size
)
{
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
}
// It is possible for float16 on Volta GPU to allocate more memory than
// the limit because the algo is overridden to use tensor core.
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
...
...
@@ -366,6 +388,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
auto
handle
=
dev_ctx
.
cudnn_handle
();
bool
has_got_bwd_data_ws_size
=
false
;
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
...
...
@@ -431,28 +455,49 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
))
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
#else
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
// dyDesc: Handle to the previously initialized input
// differential
// tensor descriptor.
cudnn_output_grad_desc
,
cudnn_conv_desc
,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_desc
,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
data_algo
));
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
data_algo
,
&
tmp_size
));
auto
new_workspace_size
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
if
(
new_workspace_size
>
workspace_size_limit
)
{
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
new_workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
#endif
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
// dyDesc: Handle to the previously initialized input
// differential
// tensor descriptor.
cudnn_output_grad_desc
,
cudnn_conv_desc
,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_desc
,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
data_algo
));
#if CUDNN_VERSION >= 7001
}
else
{
workspace_size_in_bytes
=
new_workspace_size
;
has_got_bwd_data_ws_size
=
true
;
}
#endif
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
data_algo
,
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
if
(
!
has_got_bwd_data_ws_size
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
data_algo
,
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
}
bool
has_got_bwd_filter_ws_size
=
false
;
if
(
filter_grad
)
{
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
...
...
@@ -495,22 +540,45 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
cudnn_filter_desc
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
perf_count
,
perf_results
.
get
()));
filter_algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
#else
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilter
Algorithm
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilter
WorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
filter_algo
));
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_algo
,
&
tmp_size
));
auto
new_workspace_size
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
if
(
new_workspace_size
>
workspace_size_limit
)
{
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
new_workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
#endif
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
filter_algo
));
#if CUDNN_VERSION >= 7001
}
else
{
workspace_size_in_bytes
=
new_workspace_size
;
has_got_bwd_filter_ws_size
=
true
;
}
#endif
}
if
(
!
has_got_bwd_filter_ws_size
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_algo
,
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_algo
,
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
PADDLE_ENFORCE_LE
(
workspace_size_in_bytes
,
workspace_size_limit
,
"workspace_size to be allocated exceeds the limit"
);
// ------------------- cudnn conv workspace ---------------------
if
(
!
cudnn_workspace_ptr
)
{
cudnn_workspace
=
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录