PaddlePaddle / Paddle, commit 3d5aa9d1 (unverified)
Authored Mar 12, 2021 by Qi Li; committed via GitHub on Mar 12, 2021
[ROCM] fix conv2d and conv3d op, test=develop (#31553)
Parent: f302bb4f
Showing 7 changed files with 298 additions and 249 deletions.
paddle/fluid/operators/conv_cudnn_op.cu (+114, -101)
paddle/fluid/operators/conv_miopen_helper.h (+107, -124)
paddle/fluid/operators/conv_transpose_cudnn_op.cu (+32, -8)
paddle/fluid/platform/miopen_desc.h (+15, -10)
python/paddle/fluid/tests/unittests/test_conv2d_op.py (+11, -4)
python/paddle/fluid/tests/unittests/test_conv3d_op.py (+14, -0)
python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py (+5, -2)
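The same restructuring repeats across the three C++ files below: MIOpen's find-based search benchmarks real kernels and therefore needs the workspace before the search, so the HIP branches now compute GetWorkspaceSize(args) first and pass the result into Find, while the cuDNN branches keep the original order (Find first, then GetWorkspaceSize(args, algo)). A minimal compilable sketch of that ordering, with invented stand-in types in place of Paddle's SearchAlgorithm, ConvArgs, and ExecutionContext (the stand-ins and their return values are illustrative only):

// Sketch of the search/workspace ordering this commit standardizes on.
// Stub types stand in for Paddle internals; only the control flow mirrors
// the diff.
#include <algorithm>
#include <cstddef>
#include <iostream>

struct ConvArgs {};
struct ExecutionContext {};

struct MiopenFwdSearch {  // HIP-side stand-in
  static size_t GetWorkspaceSize(const ConvArgs&) { return 1 << 20; }
  template <typename T>
  static int Find(const ConvArgs&, bool, bool, size_t ws,
                  const ExecutionContext&) {
    // MIOpen benchmarks real kernels, so the workspace must exist already.
    return ws > 0 ? 1 : 0;
  }
};

struct CudnnFwdSearch {  // CUDA-side stand-in
  template <typename T>
  static int Find(const ConvArgs&, bool, bool, const ExecutionContext&) {
    return 2;  // heuristic pick, no workspace needed up front
  }
  static size_t GetWorkspaceSize(const ConvArgs&, int /*algo*/) {
    return 1 << 21;  // sized for the chosen algorithm
  }
};

int main() {
  ConvArgs args;
  ExecutionContext ctx;
  size_t workspace_size = 0;
  int algo = 0;
#ifdef PADDLE_WITH_HIP
  // New HIP order: size the workspace first, then search with it.
  workspace_size =
      std::max(workspace_size, MiopenFwdSearch::GetWorkspaceSize(args));
  algo = MiopenFwdSearch::Find<float>(args, /*exhaustive=*/false,
                                      /*deterministic=*/false,
                                      workspace_size, ctx);
#else
  // cuDNN order (unchanged): pick the algorithm, then size for it.
  algo = CudnnFwdSearch::Find<float>(args, false, false, ctx);
  workspace_size =
      std::max(workspace_size, CudnnFwdSearch::GetWorkspaceSize(args, algo));
#endif
  std::cout << "algo=" << algo << " workspace=" << workspace_size << "\n";
}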
paddle/fluid/operators/conv_cudnn_op.cu
@@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       args.handle = handle;
 #ifdef PADDLE_WITH_HIP
+      // MIOPEN need to set groups in cdesc in miopen_desc.h
       args.cdesc.set(dtype, padding_common, strides, dilations,
                      platform::AllowTF32Cudnn(), groups);
 #else
@@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
                                                            groups));
       groups = 1;
 #endif
+#ifdef PADDLE_WITH_HIP
+      // MIOPEN do not set groups in wdesc after set groups in cdesc
+      groups = 1;
+#endif
       args.idesc.set(transformed_input, layout_format);
       args.wdesc.set(transformed_filter_channel, layout_format, groups);
@@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       miopenConvFwdAlgorithm_t algo{};
       using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size = search::GetWorkspaceSize(args);
+      algo = search::Find<T>(args, exhaustive_search, false, workspace_size,
+                             ctx);
 #else
       cudnnConvolutionFwdAlgo_t algo{};
       using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       algo = search::Find<T>(args, exhaustive_search, false, ctx);
       workspace_size = search::GetWorkspaceSize(args, algo);
+#endif

 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
       // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
@@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }

     if (filter_grad) {
@@ -673,13 +684,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }

     // ------------------- cudnn conv backward data ---------------------
@@ -688,23 +703,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
     if (input_grad) {
-      for (int i = 0; i < groups; i++) {
-        // When beta is 0, it is unnecessary to reset input_grad.
-        // When beta is 1, the output cannot be reset since addt strategy used.
+      // When beta is 0, it is unnecessary to reset input_grad.
+      // When beta is 1, the output cannot be reset since addt strategy used.
 #ifdef PADDLE_WITH_HIP
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args1.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args1.wdesc.desc(),
-                      filter_data + i * group_offset_filter,
-                      args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(),
-                      transformed_input_grad_data + i * group_offset_in,
-                      cudnn_workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardData(
+                    handle, &alpha, args1.odesc.desc(), output_grad_data,
+                    args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
+                    data_algo, &beta, args1.idesc.desc(),
+                    transformed_input_grad_data, cudnn_workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -717,9 +731,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       transformed_input_grad_data + i * group_offset_in));
             },
             workspace_size);
-#endif
       }
+#endif
       if (!is_sys_pad) {
         std::vector<int> starts(transformed_input_channel.dims().size(), 0);
         std::vector<int> axes(transformed_input_channel.dims().size(), 0);
@@ -751,23 +764,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
-      for (int i = 0; i < groups; i++) {
-        // Because beta is zero, it is unnecessary to reset filter_grad.
+      // Because beta is zero, it is unnecessary to reset filter_grad.
 #ifdef PADDLE_WITH_HIP
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardWeights(
-                      handle, &alpha, args2.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args2.idesc.desc(), input_data + i * group_offset_in,
-                      args2.cdesc.desc(), filter_algo, &beta,
-                      args2.wdesc.desc(),
-                      filter_grad_data + i * group_offset_filter,
-                      cudnn_workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardWeights(
+                    handle, &alpha, args2.odesc.desc(), output_grad_data,
+                    args2.idesc.desc(), input_data, args2.cdesc.desc(),
+                    filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
+                    cudnn_workspace_ptr, workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -780,8 +790,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       filter_grad_data + i * group_offset_filter));
             },
             workspace_size);
-#endif
       }
+#endif
       if (compute_format == DataLayout::kNHWC) {
         TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
@@ -1080,32 +1090,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
         using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+        workspace_size = search1::GetWorkspaceSize(args1);
+        fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
+                                     workspace_size, ctx);
 #else
         using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
         fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
         workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
+#endif
       }

       if (ddW) {
         ddw = ddW->data<T>();
         args2.handle = handle;
         args2.idesc.set(transformed_X, iwo_group);
         args2.wdesc.set(*ddW, layout, iwo_group);
         args2.odesc.set(transformed_ddO_channel, iwo_group);
         args2.cdesc.set(dtype, padding_common, strides, dilations,
                         platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
         using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+        workspace_size =
+            std::max(workspace_size, search2::GetWorkspaceSize(args2));
+        fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
+                                     workspace_size, ctx);
 #else
         using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
         fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
         workspace_size = std::max(
             workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2));
+#endif
       }
     }
@@ -1114,21 +1129,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.handle = handle;
       args3.idesc.set(transformed_ddX, iwo_group);
       args3.wdesc.set(*dW, layout, iwo_group);
       args3.odesc.set(transformed_dO_channel, iwo_group);
       args3.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }

     if (ddW && dX) {
@@ -1143,13 +1160,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }

     int i_n, i_c, i_d, i_h, i_w;
@@ -1176,21 +1197,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     if (ddO) {
       if (ddX) {
         ddx = transformed_ddX.data<T>();
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_CUDA_SUCCESS(
-                    platform::dynload::miopenConvolutionForward(
-                        handle, &alpha, args1.idesc.desc(),
-                        ddx + i * group_offset_in, args1.wdesc.desc(),
-                        w + i * group_offset_filter, args1.cdesc.desc(),
-                        fwd_algo1, &beta, args1.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out,
-                        workspace_ptr, workspace_size));
-              },
-              workspace_size);
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::miopenConvolutionForward(
+                      handle, &alpha, args1.idesc.desc(), ddx,
+                      args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
+                      &beta, args1.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
           wkspace_handle.RunFunc(
               [&](void* workspace_ptr) {
                 PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1203,26 +1222,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                         transformed_ddy_channel + i * group_offset_out));
               },
               workspace_size);
-#endif
         }
+#endif
       }
       if (ddW) {
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-          // MIOPEN ONLY support beta to be 0.0f
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_CUDA_SUCCESS(
-                    platform::dynload::miopenConvolutionForward(
-                        handle, &alpha, args2.idesc.desc(),
-                        x + i * group_offset_in, args2.wdesc.desc(),
-                        ddw + i * group_offset_filter, args2.cdesc.desc(),
-                        fwd_algo2, &beta, args2.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out,
-                        workspace_ptr, workspace_size));
-              },
-              workspace_size);
+        // MIOPEN ONLY support beta to be 0.0f
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::miopenConvolutionForward(
+                      handle, &alpha, args2.idesc.desc(), x,
+                      args2.wdesc.desc(), ddw, args2.cdesc.desc(), fwd_algo2,
+                      &beta, args2.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
           wkspace_handle.RunFunc(
               [&](void* workspace_ptr) {
                 PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1235,8 +1252,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                         transformed_ddy_channel + i * group_offset_out));
               },
               workspace_size);
-#endif
         }
+#endif
       }
       if (channel_last) {
         TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
@@ -1246,21 +1263,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     T* transformed_dy_channel = transformed_dO_channel.data<T>();
     if (dW && ddX) {
       ddx = transformed_ddX.data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardWeights(
-                      handle, &alpha, args3.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args3.idesc.desc(), ddx + i * group_offset_in,
-                      args3.cdesc.desc(), filter_algo, &beta,
-                      args3.wdesc.desc(), dw + i * group_offset_filter,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardWeights(
+                    handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
+                    args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
+                    &beta, args3.wdesc.desc(), dw, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1273,27 +1288,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                       dw + i * group_offset_filter));
             },
             workspace_size);
-#endif
       }
+#endif
     }
     if (dX && ddW) {
       ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args4.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args4.wdesc.desc(), ddw + i * group_offset_filter,
-                      args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(),
-                      transformed_dx + i * group_offset_in, workspace_ptr,
-                      workspace_size));
-            },
-            workspace_size);
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardData(
+                    handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
+                    args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
+                    &beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1306,8 +1319,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                       transformed_dx + i * group_offset_in));
             },
             workspace_size);
-#endif
       }
+#endif
       if (!is_sys_pad) {
         // reverse padded input
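A second pattern in the kernels above: the HIP branches drop the per-group loop entirely. As the new comments note, MIOpen applies grouping through the convolution descriptor (cdesc.set(..., groups)), so a single miopenConvolution* call covers every group, while the cuDNN path keeps iterating with i * group_offset_* pointer offsets. A rough sketch of those offsets (all tensor shapes here are invented for illustration, not taken from the diff):

// Sketch of the per-group pointer offsets that the cuDNN path keeps and
// the MIOpen path drops (grouping moves into the descriptor instead).
#include <cstdio>

int main() {
  const int groups = 2;
  // Example NCHW shapes: input channels 16, output channels 32,
  // 32x32 input plane, 30x30 output plane, 3x3 filter.
  const int in_c = 16, out_c = 32, in_hw = 32 * 32, out_hw = 30 * 30;
  const int k = 3;
  // Per-group slices of each tensor, i.e. what i * group_offset_* indexes.
  const int group_offset_in = (in_c / groups) * in_hw;
  const int group_offset_out = (out_c / groups) * out_hw;
  const int group_offset_filter = (out_c / groups) * (in_c / groups) * k * k;
  for (int i = 0; i < groups; i++) {
    std::printf("group %d: in+%d, out+%d, filter+%d\n", i,
                i * group_offset_in, i * group_offset_out,
                i * group_offset_filter);
  }
}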
paddle/fluid/operators/conv_miopen_helper.h
@@ -127,57 +127,52 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
-    bool has_got_workspace_size = true;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
     algo_t algo;

     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();

-    auto& temp = ctx.cuda_device_context();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetForward());
-
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionForwardAlgorithm(
-                    args.handle, args.idesc.desc(), args.x->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                    kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-          VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.fwd_algo;
-          }
-          return perf_stat[0].fwd_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.x->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
+              kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.fwd_algo;
+    } else {
+      auto& temp = ctx.cuda_device_context();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetForward());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.fwd_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionForwardGetWorkSpaceSize(
@@ -194,58 +189,51 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
    algo_t algo;

     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();

-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardData());
-
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
-                    kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-          VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_data_algo;
-          }
-          return perf_stat[0].bwd_data_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
+              kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_data_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardData());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_data_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
@@ -262,56 +250,51 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
     auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
     algo_t algo;

     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();

-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardFilter());
-
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
-                    args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
-                    kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-
-          VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_weights_algo;
-          }
-          return perf_stat[0].bwd_weights_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
+              args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
+              kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_weights_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardFilter());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_weights_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
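Net effect of the helper rewrite above: when neither exhaustive_search nor deterministic is set, Find now runs miopenFindConvolution*Algorithm once against the caller-supplied workspace and reads the algorithm straight out of a single miopenConvAlgoPerf_t, instead of always routing through the perf_stat array and ConvSearchCache; the cache path survives only in the else branch. A condensed, compilable sketch of that branching (FindOnce, the cache map, and the key type are stand-ins, not Paddle APIs):

// Condensed control flow of the rewritten SearchAlgorithm<...>::Find.
#include <cstddef>
#include <cstdio>
#include <functional>
#include <map>
#include <tuple>

using Key = std::tuple<int, int>;  // stand-in for (x_dims, w_dims, s, p, d...)

int FindOnce(size_t workspace_size) {
  // One benchmarking pass; needs the workspace the caller already sized.
  return workspace_size >= (1 << 20) ? 3 : 1;
}

int GetAlgorithm(std::map<Key, int>& cache, Key key,
                 const std::function<int()>& search) {
  auto it = cache.find(key);
  if (it == cache.end()) it = cache.emplace(key, search()).first;
  return it->second;
}

int Find(bool exhaustive_search, bool deterministic, size_t workspace_size,
         std::map<Key, int>& cache, Key key) {
  int algo;
  if (!exhaustive_search && !deterministic) {
    algo = FindOnce(workspace_size);  // fresh search on every call
  } else {
    algo = GetAlgorithm(cache, key,   // memoized search
                        [&] { return FindOnce(workspace_size); });
  }
  return algo;
}

int main() {
  std::map<Key, int> cache;
  std::printf("uncached algo=%d\n", Find(false, false, 1 << 20, cache, {8, 3}));
  std::printf("cached   algo=%d\n", Find(true, false, 1 << 20, cache, {8, 3}));
}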
paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
+    algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
 #else
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, false, deterministic, ctx);
     workspace_size =
         std::max(workspace_size, search::GetWorkspaceSize(args, algo));
+#endif

     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset =
@@ -504,12 +505,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo = search1::Find<T>(args1, false, deterministic,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }

     if (filter_grad) {
@@ -522,12 +527,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo = search2::Find<T>(args2, false, deterministic,
+                                     workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }

     // ------------------- cudnn conv backward data ---------------------
@@ -942,11 +951,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
         args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
         using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+        workspace_size = search1::GetWorkspaceSize(args1);
+        bwd_algo1 = search1::Find<T>(args1, false, deterministic,
+                                     workspace_size, ctx);
 #else
         using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
         bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
         workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
+#endif
       }

       if (ddW) {
@@ -958,12 +970,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
         args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
         using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+        workspace_size =
+            std::max(workspace_size, search2::GetWorkspaceSize(args2));
+        bwd_algo2 = search2::Find<T>(args2, false, deterministic,
+                                     workspace_size, ctx);
 #else
         using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
         bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
         workspace_size = std::max(
             workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2));
+#endif
       }
     }
@@ -978,12 +994,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo = search3::Find<T>(args3, false, deterministic,
+                                     workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }

     if (ddW && dX) {
@@ -996,12 +1016,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo = search4::Find<T>(args4, false, deterministic,
+                                   workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search4::Find<T>(args4, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }

     int i_n, i_c, i_d, i_h, i_w;
paddle/fluid/platform/miopen_desc.h
@@ -199,19 +199,24 @@ class FilterDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    std::vector<int> transformed_dims;
     PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
-    transformed_dims = dims;
-    // if (groups > 1) {
-    //   transformed_dims[1] = transformed_dims[1] / groups;
-    // }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
-        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
-        transformed_dims[0], transformed_dims[1], transformed_dims[2],
-        transformed_dims[3]));
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        (miopenTensorDescriptor_t)(desc_.get()),
+        ToCudnnDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
   }

  private:
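The FilterDescriptor change above swaps the fixed-rank miopenSet4dTensorDescriptor for miopenSetTensorDescriptor, which takes explicit dims and strides, and it divides the channel dimension by groups as the old commented-out code intended. The strides are ordinary row-major NCHW strides; a standalone check of the same loop (the example filter shape is invented):

// Standalone check of the row-major stride loop from FilterDescriptor::set.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims = {64, 16, 7, 7};  // example NCHW filter, invented
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; i--) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  // With groups = 2, only the channel count passed to MIOpen changes;
  // the strides are still computed from the full dims, as in the diff.
  const int groups = 2;
  std::vector<int> dims_with_group(dims.begin(), dims.end());
  if (groups > 1) dims_with_group[1] /= groups;
  for (size_t i = 0; i < dims.size(); ++i)
    std::printf("dim %zu: %d (stride %d)\n", i, dims_with_group[i],
                strides[i]);
  // prints strides 784, 49, 7, 1 for dims 64x16x7x7
}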
python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -128,6 +128,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -185,6 +187,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NHWC"
@@ -264,6 +268,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -280,6 +286,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -299,8 +307,7 @@ class TestConv2DOp(OpTest):
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
         self.data_format = "AnyLayout"
-        self.dtype = np.float64
+        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
@@ -693,6 +700,7 @@ class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 class TestConv2DOpError(unittest.TestCase):
@@ -734,8 +742,7 @@ class TestConv2DOp_v2(OpTest):
         self.use_cuda = False
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
-        self.dtype = np.float64
+        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -135,6 +135,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -169,6 +171,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -185,6 +189,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -215,6 +221,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NDHWC"
@@ -410,6 +418,7 @@ class TestWithDilation(TestConv3DOp):
 class TestCUDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -431,6 +440,7 @@ class TestFP16CUDNN(TestConv3DOp):
 class TestWithGroup1CUDNN(TestWithGroup1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -452,6 +462,7 @@ class TestFP16WithGroup1CUDNN(TestWithGroup1):
 class TestWithGroup2CUDNN(TestWithGroup2):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -473,6 +484,7 @@ class TestFP16WithGroup2CUDNN(TestWithGroup2):
 class TestWith1x1CUDNN(TestWith1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -494,6 +506,7 @@ class TestFP16With1x1CUDNN(TestWith1x1):
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -514,6 +527,7 @@ class TestCUDNNExhaustiveSearch(TestCUDNN):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 # ---- test asymmetric padding ----
python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -50,7 +50,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
     def setUp(self):
         """Setup."""
         #self.dtype = np.float32
-        self.dtype = np.float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.N = 8
         self.C = 16
         self.H = 32
@@ -92,7 +92,10 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
                 moving_variance_name='bn_moving_variance',
                 data_layout=layout,
                 is_test=only_forward)
-            bn = fluid.layers.cast(bn, 'float64')
+            if core.is_compiled_with_rocm():
+                bn = fluid.layers.cast(bn, 'float32')
+            else:
+                bn = fluid.layers.cast(bn, 'float64')
             sigmoid = fluid.layers.sigmoid(bn)
             out = fluid.layers.reduce_sum(sigmoid)
             if not sync_bn: