Commit ba8414d3 (unverified) — BaiXuePrincess/Paddle, forked from PaddlePaddle/Paddle

Authored on Jan 07, 2020 by Chen Weihang; committed via GitHub on Jan 07, 2020.
replace CUDNN_ENFORCE with PADDLE_ENFORCE_CUDA_SUCCESS, test=develop (#22109)
Parent: cce9af0e

Showing 15 changed files with 510 additions and 421 deletions (+510 −421).
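Background: CUDNN_ENFORCE was a cuDNN-specific check macro, while PADDLE_ENFORCE_CUDA_SUCCESS is the unified check for CUDA-family status codes, so every call site in this commit only swaps the wrapper around an unchanged platform::dynload call. As a rough illustration of what such a status-checking macro does — a minimal sketch assuming a plain std::runtime_error report rather than Paddle's actual error machinery, with the hypothetical name CHECK_CUDNN_SUCCESS:

#include <cudnn.h>

#include <stdexcept>
#include <string>

// Hypothetical stand-in for a cuDNN status check: evaluate the call once
// and throw with the decoded status string if it did not succeed.
#define CHECK_CUDNN_SUCCESS(expr)                                  \
  do {                                                             \
    cudnnStatus_t status__ = (expr);                               \
    if (status__ != CUDNN_STATUS_SUCCESS) {                        \
      throw std::runtime_error(std::string("cuDNN error: ") +      \
                               cudnnGetErrorString(status__));     \
    }                                                              \
  } while (0)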
paddle/fluid/operators/batch_norm_op.cu                                +100  -91
paddle/fluid/operators/conv_cudnn_helper.h                              +55  -42
paddle/fluid/operators/conv_cudnn_op.cu                                 +58  -48
paddle/fluid/operators/conv_transpose_cudnn_op.cu                       +42  -33
paddle/fluid/operators/cudnn_lstm_op.cu.cc                               +4   -4
paddle/fluid/operators/cudnn_rnn_cache.h                                +81  -58
paddle/fluid/operators/fused/conv_fusion_op.cu                          +23  -19
paddle/fluid/operators/fused/fusion_conv_inception_op.cu                +55  -45
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc   +11   -7
paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc                       +8   -7
paddle/fluid/operators/instance_norm_op.cu                              +46  -37
paddle/fluid/operators/math/softmax.cu                                   +2   -2
paddle/fluid/operators/pool_cudnn_op.cu.cc                               +2   -2
paddle/fluid/platform/cudnn_desc.h                                      +22  -17
paddle/fluid/platform/cudnn_helper.h                                     +1   -9
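Every hunk below follows the same mechanical pattern; as a representative example (identifiers taken from the batch_norm hunks), a call site changes like this:

// Before: cuDNN-specific enforce macro.
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));

// After: unified CUDA-status enforce macro; the wrapped dynload call and
// its arguments are unchanged.
PADDLE_ENFORCE_CUDA_SUCCESS(
    platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));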
paddle/fluid/operators/batch_norm_op.cu

@@ -94,8 +94,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -124,12 +125,14 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       dims = {N, C, H, W, D};
       strides = {H * W * D * C, 1, W * D * C, D * C, C};
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
     // Note: PERSISTENT not implemented for inference
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
+        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -149,7 +152,8 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
         PADDLE_ENFORCE_EQ(est_mean->dims()[0], C);
         PADDLE_ENFORCE_EQ(est_var->dims()[0], C);
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationForwardInference(
             handle,
             // Note: PERSISTENT not implemented for inference
             CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
@@ -214,7 +218,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
             "The argument ReserveSpace of batch_norm op is not found."));
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                     /*handle=*/handle,
@@ -228,7 +232,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                     /*sizeInBytes=*/&workspace_size));
         // -------------- cudnn batchnorm reserve space --------------
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::
                 cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
                     /*handle=*/handle,
@@ -242,7 +246,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
             ctx.GetPlace(), transformed_x.type(), reserve_space_size);
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
                 handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
                 CudnnDataType<T>::kZero(),
@@ -264,7 +268,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTraining(
                 handle, mode_, CudnnDataType<T>::kOne(),
                 CudnnDataType<T>::kZero(), data_desc_,
@@ -292,8 +296,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           ctx, &transformed_y, y);
     }
     // clean when exit.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
   }
 };
@@ -516,9 +521,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     cudnnTensorDescriptor_t bn_param_desc_;
     cudnnBatchNormMode_t mode_;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
     if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
       LOG(ERROR) << "Provided epsilon is smaller than "
@@ -536,11 +541,12 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
+        bn_param_desc_, data_desc_, mode_));
     const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
     const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
@@ -559,7 +565,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         Tensor workspace_tensor;
         auto reserve_space_size = reserve_space->memory_size();
         // --------------- cudnn batchnorm workspace ---------------
-        CUDNN_ENFORCE(platform::dynload::
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::
                 cudnnGetBatchNormalizationBackwardExWorkspaceSize(
                     /*handle=*/dev_ctx.cudnn_handle(),
                     /*mode=*/mode_,
@@ -576,7 +583,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
         workspace_ptr = workspace_tensor.mutable_data(
             ctx.GetPlace(), transformed_x.type(), workspace_size);
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackwardEx(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackwardEx(
             /*handle=*/dev_ctx.cudnn_handle(),
             /*mode=*/mode_,
             /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
@@ -616,7 +624,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
       }
 #endif
       if (!called) {
-        CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnBatchNormalizationBackward(
             dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
             CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
             CudnnDataType<T>::kZero(), data_desc_,
@@ -658,9 +667,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
       }
       // clean when exit.
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
     } else {
       const auto *running_mean = ctx.Input<Tensor>("Mean");
paddle/fluid/operators/conv_cudnn_helper.h

@@ -133,12 +133,14 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -148,10 +150,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       int perf_count;
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]);
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count,
-          perf_results.get()));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
+              &perf_count, perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
       workspace_size = GetWorkspaceSize(args, algo);
@@ -163,16 +166,19 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-            args.handle, args.idesc.desc(), args.wdesc.desc(),
-            args.cdesc.desc(), args.odesc.desc(),
-            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-            workspace_size_limit, &algo));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+                args.handle, args.idesc.desc(), args.wdesc.desc(),
+                args.cdesc.desc(), args.odesc.desc(),
+                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-          args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.wdesc.desc(),
+              args.cdesc.desc(), args.odesc.desc(),
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
       VLOG(3) << "choose algo " << algo;
@@ -197,7 +203,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
               args.handle, args.idesc.desc(), args.x->data<T>(),
               args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
@@ -223,9 +229,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-        args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(),
-        args.odesc.desc(), algo, &workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+            args.handle, args.idesc.desc(), args.wdesc.desc(),
+            args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size));
     return workspace_size;
   }
 };
@@ -249,12 +256,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -265,7 +274,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
           new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
               args.handle, args.wdesc.desc(), args.odesc.desc(),
               args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
@@ -294,7 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                 << workspace_size_limit << ")";
       }
       if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                args.handle, args.wdesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.idesc.desc(),
@@ -302,9 +311,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
                workspace_size_limit, &algo));
       }
 #else
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-          args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(),
-          args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              args.handle, args.wdesc.desc(), args.odesc.desc(),
+              args.cdesc.desc(), args.idesc.desc(),
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
 #endif
     } else if (deterministic) {
@@ -330,7 +341,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
               args.handle, args.wdesc.desc(), args.w->data<T>(),
@@ -359,7 +370,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             args.handle, args.wdesc.desc(), args.odesc.desc(),
             args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
@@ -385,12 +396,14 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_TENSOR_OP_MATH));
       VLOG(5) << "use cudnn_tensor_op_math";
     } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          args.cdesc.desc(), CUDNN_DEFAULT_MATH));
       VLOG(5) << "NOT use cudnn_tensor_op_math";
     }
 #endif
@@ -403,7 +416,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
       int best_algo_idx = 0;
       std::unique_ptr<perf_t[]> perf_results(
          new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]);
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
@@ -418,7 +431,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                << workspace_size_limit << ")";
      }
      if (!has_got_workspace_size) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                args.handle, args.idesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.wdesc.desc(),
@@ -426,7 +439,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
                workspace_size_limit, &algo));
      }
 #else
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(),
@@ -455,7 +468,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     int returned_algo_count;
     std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
     auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
              args.handle, args.idesc.desc(), args.x->data<T>(),
@@ -483,7 +496,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
   static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
     size_t workspace_size = 0;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
            args.handle, args.idesc.desc(), args.odesc.desc(),
            args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
paddle/fluid/operators/conv_cudnn_op.cu

@@ -237,8 +237,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it manually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        args.cdesc.desc(), groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
+                                                         groups));
     groups = 1;
 #endif
     args.idesc.set(transformed_input, layout_format);
@@ -276,12 +277,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
-            CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                handle, &alpha, args.idesc.desc(),
-                input_data + i * group_offset_in, args.wdesc.desc(),
-                filter_data + i * group_offset_filter, args.cdesc.desc(),
-                algo, workspace_ptr, workspace_size, &beta, args.odesc.desc(),
-                output_data + i * group_offset_out));
+            PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
+                handle, &alpha, args.idesc.desc(),
+                input_data + i * group_offset_in, args.wdesc.desc(),
+                filter_data + i * group_offset_filter, args.cdesc.desc(),
+                algo, workspace_ptr, workspace_size, &beta, args.odesc.desc(),
+                output_data + i * group_offset_out));
           },
           workspace_size);
     }
@@ -596,12 +598,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
        workspace_handle.RunFunc(
            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args1.wdesc.desc(),
-                  filter_data + i * group_offset_filter, args1.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args1.cdesc.desc(),
-                  data_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args1.idesc.desc(),
+              PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionBackwardData(
+                  handle, &alpha, args1.wdesc.desc(),
+                  filter_data + i * group_offset_filter, args1.odesc.desc(),
+                  output_grad_data + i * group_offset_out, args1.cdesc.desc(),
+                  data_algo, cudnn_workspace_ptr, workspace_size, &beta,
+                  args1.idesc.desc(),
                   transformed_input_grad_data + i * group_offset_in));
            },
            workspace_size);
@@ -639,12 +642,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       for (int i = 0; i < groups; i++) {
        workspace_handle.RunFunc(
            [&](void* cudnn_workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args2.idesc.desc(),
-                  input_data + i * group_offset_in, args2.odesc.desc(),
-                  output_grad_data + i * group_offset_out, args2.cdesc.desc(),
-                  filter_algo, cudnn_workspace_ptr, workspace_size, &beta,
-                  args2.wdesc.desc(),
+              PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionBackwardFilter(
+                  handle, &alpha, args2.idesc.desc(),
+                  input_data + i * group_offset_in, args2.odesc.desc(),
+                  output_grad_data + i * group_offset_out, args2.cdesc.desc(),
+                  filter_algo, cudnn_workspace_ptr, workspace_size, &beta,
+                  args2.wdesc.desc(),
                   filter_grad_data + i * group_offset_filter));
            },
            workspace_size);
@@ -993,11 +997,13 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
        for (int i = 0; i < groups; i++) {
          wkspace_handle.RunFunc(
              [&](void* workspace_ptr) {
-                CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                    handle, &alpha, args1.idesc.desc(),
-                    ddx + i * group_offset_in, args1.wdesc.desc(),
-                    w + i * group_offset_filter, args1.cdesc.desc(),
-                    fwd_algo1, workspace_ptr, workspace_size, &beta,
-                    args1.odesc.desc(),
+                PADDLE_ENFORCE_CUDA_SUCCESS(
+                    platform::dynload::cudnnConvolutionForward(
+                        handle, &alpha, args1.idesc.desc(),
+                        ddx + i * group_offset_in, args1.wdesc.desc(),
+                        w + i * group_offset_filter, args1.cdesc.desc(),
+                        fwd_algo1, workspace_ptr, workspace_size, &beta,
+                        args1.odesc.desc(),
                         transformed_ddy_channel + i * group_offset_out));
              },
              workspace_size);
@@ -1007,11 +1013,13 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
        for (int i = 0; i < groups; i++) {
          wkspace_handle.RunFunc(
              [&](void* workspace_ptr) {
-                CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-                    handle, &alpha, args2.idesc.desc(), x + i * group_offset_in,
-                    args2.wdesc.desc(), ddw + i * group_offset_filter,
-                    args2.cdesc.desc(), fwd_algo2, workspace_ptr,
-                    workspace_size, &alpha, args2.odesc.desc(),
+                PADDLE_ENFORCE_CUDA_SUCCESS(
+                    platform::dynload::cudnnConvolutionForward(
+                        handle, &alpha, args2.idesc.desc(),
+                        x + i * group_offset_in, args2.wdesc.desc(),
+                        ddw + i * group_offset_filter, args2.cdesc.desc(),
+                        fwd_algo2, workspace_ptr, workspace_size, &alpha,
+                        args2.odesc.desc(),
                         transformed_ddy_channel + i * group_offset_out));
              },
              workspace_size);
@@ -1028,9 +1036,10 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args3.idesc.desc(),
-                  ddx + i * group_offset_in, args3.odesc.desc(),
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::cudnnConvolutionBackwardFilter(
+                      handle, &alpha, args3.idesc.desc(),
+                      ddx + i * group_offset_in, args3.odesc.desc(),
                       transformed_dy_channel + i * group_offset_out,
                       args3.cdesc.desc(), filter_algo, workspace_ptr,
                       workspace_size, &beta, args3.wdesc.desc(),
@@ -1045,12 +1054,13 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
-              CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-                  handle, &alpha, args4.wdesc.desc(),
-                  ddw + i * group_offset_filter, args4.odesc.desc(),
-                  transformed_dy_channel + i * group_offset_out,
-                  args4.cdesc.desc(), data_algo, workspace_ptr, workspace_size,
-                  &beta, args4.idesc.desc(),
+              PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionBackwardData(
+                  handle, &alpha, args4.wdesc.desc(),
+                  ddw + i * group_offset_filter, args4.odesc.desc(),
+                  transformed_dy_channel + i * group_offset_out,
+                  args4.cdesc.desc(), data_algo, workspace_ptr, workspace_size,
+                  &beta, args4.idesc.desc(),
                   transformed_dx + i * group_offset_in));
            },
            workspace_size);
paddle/fluid/operators/conv_transpose_cudnn_op.cu

@@ -236,11 +236,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
         // dxDesc: Handle to the previously initialized output tensor
         // descriptor.
-        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        cudnn_output_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
     if (algo == 0 && FLAGS_cudnn_deterministic) {
@@ -248,7 +250,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     }
     // get workspace size able to allocate
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -263,11 +265,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
     for (int g = 0; g < groups; g++) {
       auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, transformed_output_data + output_offset * g));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBackwardData(
+                handle, &alpha, cudnn_filter_desc,
+                filter_data + filter_offset * g, cudnn_input_desc,
+                input_data + input_offset * g, cudnn_conv_desc, algo,
+                cudnn_workspace, workspace_size_in_bytes, &beta,
+                cudnn_output_desc,
+                transformed_output_data + output_offset * g));
       };
       workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
     }
@@ -466,11 +471,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &data_algo));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_input_desc, data_algo, &fwd_ws_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
@@ -478,7 +485,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       // choose backward algorithm for filter
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc,
@@ -486,7 +493,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
               workspace_size_limit, &filter_algo));
       // get workspace for backwards filter algorithm
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -507,12 +514,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
         auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-              input_grad_data + input_offset * g));
+          PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+              input_grad_data + input_offset * g));
         };
         workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
       }
@@ -543,7 +551,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
         auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionBackwardFilter(
               handle, &alpha, cudnn_output_desc,
               output_grad_data + output_grad_offset * g, cudnn_input_desc,
               input_data + input_offset * g, cudnn_conv_desc, filter_algo,
paddle/fluid/operators/cudnn_lstm_op.cu.cc

@@ -91,7 +91,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     if (is_test) {
       // for inference
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -101,7 +101,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
           cudnn_rnn_cache->workspace_size_));
     } else {
       // for train
-      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
           handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
           cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
           init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
@@ -230,7 +230,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto run_seq_len = input_dims[0];
     PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
                       "cudnn running seq_len CAN not greater max_lengh");
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
         out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
@@ -242,7 +242,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
         cudnn_rnn_cache->workspace_size_, reserve_data,
         cudnn_rnn_cache->reserve_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
         handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
         cudnn_rnn_cache->x_desc_, input->data<T>(),
         cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
        cudnn_rnn_cache->y_desc_, out->data<T>(),
paddle/fluid/operators/cudnn_rnn_cache.h

@@ -92,13 +92,13 @@ struct CudnnRNNCache {
     int stride_a[3];
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
       dim_a[0] = batch_size_;
       dim_a[1] = input_size_;
@@ -107,9 +107,9 @@ struct CudnnRNNCache {
       stride_a[0] = dim_a[2] * dim_a[1];
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
       dim_a[0] = batch_size_;
@@ -120,9 +120,9 @@ struct CudnnRNNCache {
       stride_a[1] = dim_a[2];
       stride_a[2] = 1;
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
     }
@@ -134,63 +134,74 @@ struct CudnnRNNCache {
     stride_a[1] = dim_a[2];
     stride_a[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
     size_t state_size;
-    CUDNN_ENFORCE(platform::dynload::cudnnDropoutGetStatesSize(
-        handle, &state_size);
-        dropout_state_.Resize({static_cast<int64_t>(state_size)}));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+    dropout_state_.Resize({static_cast<int64_t>(state_size)});
     auto* dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor(
         dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
         seed_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
 #if CUDNN_VERSION >= 6000
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
         handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
 #else
-    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
         rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
         CUDNN_LINEAR_INPUT,
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         CUDNN_DATA_FLOAT));
 #endif
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
         handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
     PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
@@ -199,14 +210,15 @@ struct CudnnRNNCache {
     dim_w[0] = weights_size_ / sizeof(float);
     dim_w[1] = 1;
     dim_w[2] = 1;
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
         w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
         dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
         handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
-    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNTrainingReserveSize(
         handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
     reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
@@ -218,13 +230,13 @@ struct CudnnRNNCache {
   void release() {
     for (size_t i = 0; i < max_length_; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
     }
@@ -233,21 +245,32 @@ struct CudnnRNNCache {
     delete[] dx_desc_;
     delete[] dy_desc_;
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
   }
 };
paddle/fluid/operators/fused/conv_fusion_op.cu

@@ -164,8 +164,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(padding_common, strides, dilations);
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
-        cudnn_conv_desc, groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc,
+                                                         groups));
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize<int>(transformed_input.dims()));
@@ -196,13 +197,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
     if (!exhaustive_search) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
@@ -213,7 +215,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
           fwd_perf_stat;
       auto cudnn_find_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                handle, cudnn_input_desc, input_data, cudnn_filter_desc,
                filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
@@ -248,7 +250,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       VLOG(3) << "choose algo " << algo;
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
@@ -262,13 +265,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------- cudnn conv forward and bias add ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
           filter_data, cudnn_conv_desc, algo, cudnn_workspace,
           workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
     };
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
-    CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor(
         handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
         output_data));
   } else {
@@ -279,12 +282,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> alpha1 = 1.0f;
     ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
     auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-          output_data));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnConvolutionBiasActivationForward(
+              handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+              filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+              workspace_size_in_bytes, &alpha2, cudnn_output_desc,
+              residual_data, cudnn_bias_desc, bias_data, cudnn_act_desc,
+              cudnn_output_desc, output_data));
     };
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
   }
paddle/fluid/operators/fused/fusion_conv_inception_op.cu

@@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t* conv_desc =
         new cudnnConvolutionDescriptor_t[4];
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
     }
@@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < 4; ++i) {
       filter_dims.push_back(framework::vectorize<int>(filters[i]->dims()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
           filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
       bias_dims.push_back({1, filter_dims[i][0], 1, 1});
       bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(),
           bias_strides[i].data()));
       in_dims.push_back({n, filter_dims[i][1], h, w});
@@ -140,22 +140,25 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
       out_strides.push_back({oc * h * w, h * w, w, 1});
       if (i < 2) {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionNdDescriptor(
+                conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(),
+                CUDNN_CROSS_CORRELATION, compute_type));
       } else {
-        CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor(
-            conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
-            CUDNN_CROSS_CORRELATION, compute_type));
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnSetConvolutionNdDescriptor(
+                conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(),
+                CUDNN_CROSS_CORRELATION, compute_type));
       }
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          conv_desc[i], CUDNN_DEFAULT_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
+          conv_desc[i], CUDNN_DEFAULT_MATH));
     }
     in_dims[2][1] *= 2;
     in_strides[2][0] = oc * h * w;
     out_strides[2][0] = filter_dims[2][0] * h * w;
     // this out is continuous.
     in_strides[3][0] = filter_dims[2][0] * h * w;
-    CUDNN_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
     cudnnConvolutionFwdAlgo_t algo[4];
@@ -171,17 +174,19 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     }
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
           out_desc[i], cudnn_dtype, 4, out_dims[i].data(),
           out_strides[i].data()));
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
-          &algo[i]));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo[i]));
       size_t tmp_size = 0;
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-          handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
-          algo[i], &tmp_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+              handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i],
+              algo[i], &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
@@ -196,7 +201,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     // branch1: pool + 1x1 conv
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         pool_out_desc, temp_data));
@@ -218,7 +223,8 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < 4; ++i) {
       auto func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnConvolutionBiasActivationForward(
             handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
             static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
             algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
@@ -232,31 +238,35 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     cudnnTensorDescriptor_t x_desc;
     cudnnTensorDescriptor_t y_desc;
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
         handle, CudnnDataType<T>::kOne(), x_desc,
         static_cast<const void*>(out_datas[2]), CudnnDataType<T>::kZero(),
         y_desc, static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
-      CUDNN_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
   }
 };
 #endif
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
浏览文件 @
ba8414d3
...
...
@@ -45,8 +45,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
    cudnnTensorDescriptor_t in_desc;
    cudnnTensorDescriptor_t out_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
...
...
@@ -85,12 +87,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
          dims_y[i] = 1;
        }
      }
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor(
          handle, CudnnDataType<T>::kOne(), in_desc,
          static_cast<const void*>(ins[k]->data<T>()),
          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
...
...
@@ -101,8 +103,10 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
        odata += flat_shape[1];
      }
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
  }
};
...
...
paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
View file @ ba8414d3
...
...
@@ -59,7 +59,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize<int>(output->dims()));
    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward(
        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
        output_data));
...
...
@@ -111,7 +111,8 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
        output_grad_desc.descriptor<T>(
            DataLayout::kNCHW, framework::vectorize<int>(output_grad->dims()));
    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSpatialTfSamplerBackward(
        handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
        input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
        input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
...
...
paddle/fluid/operators/instance_norm_op.cu
View file @ ba8414d3
...
...
@@ -94,8 +94,9 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
...
...
@@ -113,10 +114,11 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
    const auto *scale = ctx.Input<Tensor>("Scale");
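For reference, cudnnDeriveBNTensorDescriptor with CUDNN_BATCHNORM_SPATIAL derives a 1 x C x 1 x 1 descriptor from the data descriptor, which is the shape shared by scale, bias, and the saved statistics. The equivalent explicit setup, as a hedged sketch (helper name and float dtype are assumptions):

#include <cudnn.h>

// What the derived parameter descriptor amounts to in spatial mode:
// one scale/bias/mean/variance value per channel.
cudnnStatus_t MakeSpatialParamDesc(cudnnTensorDescriptor_t param_desc, int c) {
  return cudnnSetTensor4dDescriptor(param_desc, CUDNN_TENSOR_NCHW,
                                    CUDNN_DATA_FLOAT, 1, c, 1, 1);
}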
...
...
@@ -152,19 +154,22 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTraining(
        handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
        CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
        data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
        scale_tmp.template data<BatchNormParamType<T>>(),
        bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
        epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
        data_desc_, y->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
        scale_tmp.template data<BatchNormParamType<T>>(),
        bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
        epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
            ctx.GetPlace()),
        saved_variance->template mutable_data<BatchNormParamType<T>>(
            ctx.GetPlace())));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
  }
};
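Note the `0, nullptr, nullptr` triple in the call above: the exponential-average factor is zero and no running-mean/variance buffers are passed, so cuDNN updates no running statistics; it only writes the per-call saved mean and inverse variance that the backward pass reuses. An annotated sketch of the call shape (float-only, with illustrative parameter names, not Paddle code):

#include <cudnn.h>

cudnnStatus_t InstanceNormForward(
    cudnnHandle_t handle, cudnnTensorDescriptor_t x_desc, const float* x,
    cudnnTensorDescriptor_t y_desc, float* y,
    cudnnTensorDescriptor_t param_desc, const float* scale, const float* bias,
    double epsilon, float* saved_mean, float* saved_inv_var) {
  const float one = 1.0f, zero = 0.0f;
  return cudnnBatchNormalizationForwardTraining(
      handle, CUDNN_BATCHNORM_SPATIAL, &one, &zero, x_desc, x, y_desc, y,
      param_desc, scale, bias,
      /*exponentialAverageFactor=*/0.0,
      /*resultRunningMean=*/nullptr,       // running stats deliberately skipped
      /*resultRunningVariance=*/nullptr,
      epsilon, saved_mean, saved_inv_var);  // consumed by the backward pass
}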
...
...
@@ -291,8 +296,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
...
...
@@ -301,10 +307,11 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
        in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
...
...
@@ -314,7 +321,8 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();
    if (d_scale && d_bias) {
      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnBatchNormalizationBackward(
          dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
          CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
...
...
@@ -342,8 +350,9 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
          d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
  }
};
...
...
paddle/fluid/operators/math/softmax.cu
View file @ ba8414d3
...
...
@@ -49,7 +49,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
  cudnnTensorDescriptor_t cudnn_y_desc =
      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
      X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
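CUDNN_SOFTMAX_ACCURATE selects the numerically stable variant that subtracts the per-instance maximum before exponentiating, so exp() cannot overflow. What that mode computes, sketched on the CPU for a single instance (illustrative, not Paddle code):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> AccurateSoftmax(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  std::vector<float> y(x.size());
  float sum = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = std::exp(x[i] - m);  // shifted by the max for stability
    sum += y[i];
  }
  for (float& v : y) v /= sum;
  return y;
}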
...
...
@@ -80,7 +80,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
  cudnnTensorDescriptor_t cudnn_ygrad_desc =
      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
      Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
...
...
paddle/fluid/operators/pool_cudnn_op.cu.cc
View file @ ba8414d3
...
...
@@ -156,7 +156,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
    auto handle = ctx.cuda_device_context().cudnn_handle();
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward(
        handle, cudnn_pool_desc, &alpha, cudnn_input_desc,
        tranformed_input_data, &beta, cudnn_output_desc,
        tranformed_output_data));
...
...
@@ -312,7 +312,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
      T *input_grad_data = transformed_input_grad.mutable_data<T>(
          transformed_input_grad.dims(), ctx.GetPlace());
      // Because beta is zero, it is unnecessary to reset input_grad.
      CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward(
          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
          &beta, cudnn_input_desc, input_grad_data));
...
...
paddle/fluid/platform/cudnn_desc.h
View file @ ba8414d3
...
...
@@ -83,19 +83,21 @@ class ActivationDescriptor {
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) {
        CUDNN_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
        PADDLE_ENFORCE_CUDA_SUCCESS(
            dynload::cudnnDestroyActivationDescriptor(t));
        t = nullptr;
      }
    }
  };
  ActivationDescriptor() {
    T* raw_ptr;
    CUDNN_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::cudnnCreateActivationDescriptor(&raw_ptr));
    desc_.reset(raw_ptr);
  }
  template <typename T>
  void set(cudnnActivationMode_t mode, const T& coef) {
    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
        desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast<double>(coef)));
  }
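These descriptor classes all follow the same RAII idiom: the raw cuDNN handle lives in a unique_ptr whose Deleter funnels the destroy call through the status check, so an exception or early return can never leak a descriptor. A self-contained sketch of the idiom (names are illustrative, not Paddle's; the status is merely asserted here):

#include <cudnn.h>
#include <cassert>
#include <memory>
#include <type_traits>

struct ActivationDescDeleter {
  void operator()(cudnnActivationDescriptor_t p) const {
    if (p != nullptr) {
      cudnnStatus_t st = cudnnDestroyActivationDescriptor(p);
      assert(st == CUDNN_STATUS_SUCCESS);
      (void)st;
    }
  }
};

using ActivationDesc =
    std::unique_ptr<std::remove_pointer<cudnnActivationDescriptor_t>::type,
                    ActivationDescDeleter>;

ActivationDesc MakeActivationDesc() {
  cudnnActivationDescriptor_t raw = nullptr;
  cudnnStatus_t st = cudnnCreateActivationDescriptor(&raw);
  assert(st == CUDNN_STATUS_SUCCESS);
  (void)st;
  return ActivationDesc(raw);
}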
...
...
@@ -112,14 +114,14 @@ class TensorDescriptor {
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) {
        CUDNN_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t));
        t = nullptr;
      }
    }
  };
  TensorDescriptor() {
    T* raw_ptr;
    CUDNN_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
    desc_.reset(raw_ptr);
  }
  T* desc() { return desc_.get(); }
...
...
@@ -135,7 +137,7 @@ class TensorDescriptor {
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
    }
    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
        desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
        dims_with_group.data(), strides.data()));
  }
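The division by `groups` implements grouped convolution: each group sees only C/groups of the channels, so the descriptor advertises the per-group shape. A trivial illustration of the dimension bookkeeping (assumed NCHW layout; not Paddle code):

#include <cassert>
#include <vector>

std::vector<int> PerGroupDims(std::vector<int> dims, int groups) {
  assert(groups >= 1 && dims.size() > 1 && dims[1] % groups == 0);
  dims[1] /= groups;  // the channel axis is dims[1] in NCHW
  return dims;
}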
...
...
@@ -148,7 +150,7 @@ class TensorDescriptor {
    } else {
      transformed_dims = dims;
    }
    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptorEx(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx(
        desc_.get(), format, ToCudnnDataType(tensor.type()),
        transformed_dims.size(), transformed_dims.data()));
  }
...
...
@@ -163,14 +165,14 @@ class FilterDescriptor {
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) {
        CUDNN_ENFORCE(dynload::cudnnDestroyFilterDescriptor(t));
        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t));
        t = nullptr;
      }
    }
  };
  FilterDescriptor() {
    T* raw_ptr;
    CUDNN_ENFORCE(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
    desc_.reset(raw_ptr);
  }
  T* desc() { return desc_.get(); }
...
...
@@ -188,7 +190,7 @@ class FilterDescriptor {
    if (groups > 1) {
      transformed_dims[1] = transformed_dims[1] / groups;
    }
    CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
        desc_.get(), ToCudnnDataType(tensor.type()), format,
        transformed_dims.size(), transformed_dims.data()));
  }
...
...
@@ -203,14 +205,16 @@ class ConvolutionDescriptor {
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) {
        CUDNN_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(t));
        PADDLE_ENFORCE_CUDA_SUCCESS(
            dynload::cudnnDestroyConvolutionDescriptor(t));
        t = nullptr;
      }
    }
  };
  ConvolutionDescriptor() {
    T* raw_ptr;
    CUDNN_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
    desc_.reset(raw_ptr);
  }
  T* desc() { return desc_.get(); }
...
...
@@ -222,18 +226,19 @@ class ConvolutionDescriptor {
    cudnnDataType_t compute_type =
        (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
    T* desc = desc_.get();
    CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
        desc, pads.size(), pads.data(), strides.data(), dilations.data(),
        CUDNN_CROSS_CORRELATION, compute_type));
#if CUDNN_VERSION_MIN(7, 0, 1)
    CUDNN_ENFORCE(
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSetConvolutionGroupCount(desc, groups));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        desc, CUDNN_DEFAULT_MATH));
    if (dtype == CUDNN_DATA_HALF) {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          desc, CUDNN_TENSOR_OP_MATH));
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
          desc, CUDNN_TENSOR_OP_MATH));
    }
#endif
#endif
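The math-type dance above opts into Tensor Core kernels only for half precision: default math is set first, then CUDNN_TENSOR_OP_MATH overrides it when the dtype is FP16. The same logic in isolation, as a hedged sketch (helper name is an assumption; requires cuDNN 7+):

#include <cudnn.h>

cudnnStatus_t SelectConvMathType(cudnnConvolutionDescriptor_t desc,
                                 cudnnDataType_t dtype) {
  cudnnStatus_t st = cudnnSetConvolutionMathType(desc, CUDNN_DEFAULT_MATH);
  if (st == CUDNN_STATUS_SUCCESS && dtype == CUDNN_DATA_HALF) {
    // Tensor Cores give large speedups for half-precision convolutions.
    st = cudnnSetConvolutionMathType(desc, CUDNN_TENSOR_OP_MATH);
  }
  return st;
}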
...
...
paddle/fluid/platform/cudnn_helper.h
View file @ ba8414d3
...
...
@@ -60,14 +60,6 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))

#define CUDNN_ENFORCE(condition)                                     \
  do {                                                               \
    auto status = condition;                                         \
    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
    }                                                                \
  } while (false)

enum class DataLayout {  // Not use
  kNHWC,
  kNCHW,
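The macro removed here wrapped each call in the status check shown above; PADDLE_ENFORCE_CUDA_SUCCESS, defined elsewhere in the platform enforce utilities and not shown in this diff, centralizes the same check for the CUDA-family status types. A minimal standalone analogue of what such a check does for cudnnStatus_t, purely for illustration and not Paddle's actual definition:

#include <cudnn.h>
#include <sstream>
#include <stdexcept>

#define CHECK_CUDNN(call)                                       \
  do {                                                          \
    cudnnStatus_t status__ = (call);                            \
    if (status__ != CUDNN_STATUS_SUCCESS) {                     \
      std::ostringstream oss__;                                 \
      oss__ << "cuDNN error: " << cudnnGetErrorString(status__) \
            << " at " << __FILE__ << ":" << __LINE__;           \
      throw std::runtime_error(oss__.str());                    \
    }                                                           \
  } while (false)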
...
...
@@ -467,7 +459,7 @@ class ScopedActivationDescriptor {
        PADDLE_THROW("unrecognized activation mode: %d .",
                     static_cast<int>(activation_mode));
    }
    CUDNN_ENFORCE(dynload::cudnnSetActivationDescriptor(
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
        desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
    return desc_;
  }
...
...