Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
358bc06c
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
358bc06c
编写于
8月 05, 2020
作者:
Z
Zhaolong Xing
提交者:
GitHub
8月 05, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[CUDNN8 support] : support CUDNN8 (#25664)
* cunn8 support test=develop * fix ci error test=develop
上级
5970871a
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
49 addition
and
56 deletion
+49
-56
cmake/cudnn.cmake
cmake/cudnn.cmake
+11
-7
paddle/fluid/operators/conv_cudnn_helper.h
paddle/fluid/operators/conv_cudnn_helper.h
+3
-38
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/conv_fusion_op.cu
+9
-3
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+11
-4
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+15
-4
未找到文件。
cmake/cudnn.cmake
浏览文件 @
358bc06c
...
@@ -60,9 +60,8 @@ else()
...
@@ -60,9 +60,8 @@ else()
set
(
CUDNN_FOUND OFF
)
set
(
CUDNN_FOUND OFF
)
endif
()
endif
()
if
(
CUDNN_FOUND
)
macro
(
find_cudnn_version cudnn_header_file
)
file
(
READ
${
CUDNN_INCLUDE_DIR
}
/cudnn.h CUDNN_VERSION_FILE_CONTENTS
)
file
(
READ
${
cudnn_header_file
}
CUDNN_VERSION_FILE_CONTENTS
)
get_filename_component
(
CUDNN_LIB_PATH
${
CUDNN_LIBRARY
}
DIRECTORY
)
get_filename_component
(
CUDNN_LIB_PATH
${
CUDNN_LIBRARY
}
DIRECTORY
)
string
(
REGEX MATCH
"define CUDNN_VERSION +([0-9]+)"
string
(
REGEX MATCH
"define CUDNN_VERSION +([0-9]+)"
...
@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
...
@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
math
(
EXPR CUDNN_VERSION
math
(
EXPR CUDNN_VERSION
"
${
CUDNN_MAJOR_VERSION
}
* 1000 +
"
${
CUDNN_MAJOR_VERSION
}
* 1000 +
${
CUDNN_MINOR_VERSION
}
* 100 +
${
CUDNN_PATCHLEVEL_VERSION
}
"
)
${
CUDNN_MINOR_VERSION
}
* 100 +
${
CUDNN_PATCHLEVEL_VERSION
}
"
)
endif
()
message
(
STATUS
"Current cuDNN header is
${
cudnn_header_file
}
"
message
(
STATUS
"Current cuDNN header is
${
CUDNN_INCLUDE_DIR
}
/cudnn.h. "
"Current cuDNN version is v
${
CUDNN_MAJOR_VERSION
}
.
${
CUDNN_MINOR_VERSION
}
. "
)
"Current cuDNN version is v
${
CUDNN_MAJOR_VERSION
}
.
${
CUDNN_MINOR_VERSION
}
. "
)
endif
()
endif
()
endmacro
()
if
(
CUDNN_FOUND
)
find_cudnn_version
(
${
CUDNN_INCLUDE_DIR
}
/cudnn.h
)
if
(
NOT CUDNN_MAJOR_VERSION
)
find_cudnn_version
(
${
CUDNN_INCLUDE_DIR
}
/cudnn_version.h
)
endif
()
endif
()
endif
()
endif
()
paddle/fluid/operators/conv_cudnn_helper.h
浏览文件 @
358bc06c
...
@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
...
@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
workspace_size
>
workspace_size_limit
)
{
has_got_workspace_size
=
false
;
workspace_size_limit
=
workspace_size
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
}
#else
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
...
@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
#endif
#endif
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
workspace_size
>
workspace_size_limit
)
{
workspace_size_limit
=
workspace_size
;
has_got_workspace_size
=
false
;
has_got_workspace_size
=
false
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
}
#else
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
...
@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
workspace_size
>
workspace_size_limit
)
{
has_got_workspace_size
=
false
;
workspace_size
=
workspace_size_limit
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
}
#else
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
paddle/fluid/operators/fused/conv_fusion_op.cu
浏览文件 @
358bc06c
...
@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
...
@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto
x_dims
=
framework
::
vectorize
(
transformed_input
.
dims
());
auto
x_dims
=
framework
::
vectorize
(
transformed_input
.
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
if
(
!
exhaustive_search
)
{
if
(
!
exhaustive_search
)
{
int
perf_count
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
_v7
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
cudnn_output_desc
,
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
workspace_size_limit
,
&
algo
));
perf_results
.
get
()));
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
{
}
else
{
std
::
function
<
cudnnConvolutionFwdAlgo_t
()
>
search_func
=
std
::
function
<
cudnnConvolutionFwdAlgo_t
()
>
search_func
=
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
358bc06c
...
@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
...
@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
out_desc
[
i
],
cudnn_dtype
,
4
,
out_dims
[
i
].
data
(),
out_desc
[
i
],
cudnn_dtype
,
4
,
out_dims
[
i
].
data
(),
out_strides
[
i
].
data
()));
out_strides
[
i
].
data
()));
int
perf_count
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
_v7
(
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
workspace_size_limit
,
&
algo
[
i
]))
;
algo
[
i
]
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
size_t
tmp_size
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
algo
[
i
],
&
tmp_size
));
algo
[
i
],
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
}
cudnnActivationDescriptor_t
cudnn_act_desc
=
cudnnActivationDescriptor_t
cudnn_act_desc
=
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
358bc06c
...
@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
...
@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
...
@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
...
@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
...
@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...
@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 3000
#if CUDNN_VERSION >= 3000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
CUDNN_DNN_ROUTINE_EACH_AFTER_R3
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
#endif
// APIs available after R3:
#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
// APIs available after R4:
// APIs available after R4:
#if CUDNN_VERSION >= 4007
#if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
...
@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...
@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
#endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8);
CUDNN_DNN_ROUTINE_EACH_R8
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
}
// namespace dynload
}
// namespace dynload
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录