Unverified commit 3d5aa9d1, authored Mar 12, 2021 by Qi Li, committed via GitHub on Mar 12, 2021:
[ROCM] fix conv2d and conv3d op, test=develop (#31553)
Parent: f302bb4f
Showing 7 changed files with 298 additions and 249 deletions (+298, -249):
- paddle/fluid/operators/conv_cudnn_op.cu (+114, -101)
- paddle/fluid/operators/conv_miopen_helper.h (+107, -124)
- paddle/fluid/operators/conv_transpose_cudnn_op.cu (+32, -8)
- paddle/fluid/platform/miopen_desc.h (+15, -10)
- python/paddle/fluid/tests/unittests/test_conv2d_op.py (+11, -4)
- python/paddle/fluid/tests/unittests/test_conv3d_op.py (+14, -0)
- python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py (+5, -2)
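The thread running through the C++ hunks below is the order of algorithm selection and workspace sizing. cuDNN reports the workspace requirement for an already-chosen algorithm, so the algorithm is found first; MIOpen's find API itself consumes a workspace, so the size is queried first and passed into Find. A minimal sketch of the resulting dispatch (not literal Paddle code; search, args, and ctx stand in for the templates and arguments used in the hunks):

    // Sketch of the selection order this commit standardizes.
    #ifdef PADDLE_WITH_HIP
      // MIOpen: the workspace requirement does not depend on the algorithm,
      // and the find call needs a buffer of that size up front.
      workspace_size = search::GetWorkspaceSize(args);
      algo = search::Find<T>(args, exhaustive_search, false, workspace_size, ctx);
    #else
      // cuDNN: choose the algorithm first, then ask how much workspace it needs.
      algo = search::Find<T>(args, exhaustive_search, false, ctx);
      workspace_size = search::GetWorkspaceSize(args, algo);
    #endif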
paddle/fluid/operators/conv_cudnn_op.cu

@@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     args.handle = handle;
 #ifdef PADDLE_WITH_HIP
+    // MIOPEN need to set groups in cdesc in miopen_desc.h
     args.cdesc.set(dtype, padding_common, strides, dilations,
                    platform::AllowTF32Cudnn(), groups);
 #else
@@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
                                                          groups));
     groups = 1;
 #endif
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN do not set groups in wdesc after set groups in cdesc
+    groups = 1;
+#endif
     args.idesc.set(transformed_input, layout_format);
     args.wdesc.set(transformed_filter_channel, layout_format, groups);
@@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     miopenConvFwdAlgorithm_t algo{};
     using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+    workspace_size = search::GetWorkspaceSize(args);
+    algo = search::Find<T>(args, exhaustive_search, false, workspace_size, ctx);
 #else
     cudnnConvolutionFwdAlgo_t algo{};
     using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, exhaustive_search, false, ctx);
     workspace_size = search::GetWorkspaceSize(args, algo);
+#endif
 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
     // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
@@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -673,13 +684,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -688,23 +703,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
     if (input_grad) {
-      for (int i = 0; i < groups; i++) {
-        // When beta is 0, it is unnecessary to reset input_grad.
-        // When beta is 1, the output cannot be reset since addt strategy used.
+      // When beta is 0, it is unnecessary to reset input_grad.
+      // When beta is 1, the output cannot be reset since addt strategy used.
 #ifdef PADDLE_WITH_HIP
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args1.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args1.wdesc.desc(),
-                      filter_data + i * group_offset_filter,
-                      args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(),
-                      transformed_input_grad_data + i * group_offset_in,
-                      cudnn_workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardData(
+                    handle, &alpha, args1.odesc.desc(), output_grad_data,
+                    args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
+                    data_algo, &beta, args1.idesc.desc(),
+                    transformed_input_grad_data, cudnn_workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -717,9 +731,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       transformed_input_grad_data + i * group_offset_in));
             },
             workspace_size);
-#endif
       }
+#endif
     if (!is_sys_pad) {
       std::vector<int> starts(transformed_input_channel.dims().size(), 0);
       std::vector<int> axes(transformed_input_channel.dims().size(), 0);
@@ -751,23 +764,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
-      for (int i = 0; i < groups; i++) {
-        // Because beta is zero, it is unnecessary to reset filter_grad.
+      // Because beta is zero, it is unnecessary to reset filter_grad.
 #ifdef PADDLE_WITH_HIP
-        workspace_handle.RunFunc(
-            [&](void* cudnn_workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardWeights(
-                      handle, &alpha, args2.odesc.desc(),
-                      output_grad_data + i * group_offset_out,
-                      args2.idesc.desc(), input_data + i * group_offset_in,
-                      args2.cdesc.desc(), filter_algo, &beta,
-                      args2.wdesc.desc(),
-                      filter_grad_data + i * group_offset_filter,
-                      cudnn_workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      workspace_handle.RunFunc(
+          [&](void* cudnn_workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardWeights(
+                    handle, &alpha, args2.odesc.desc(), output_grad_data,
+                    args2.idesc.desc(), input_data, args2.cdesc.desc(),
+                    filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
+                    cudnn_workspace_ptr, workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -780,8 +790,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       filter_grad_data + i * group_offset_filter));
             },
             workspace_size);
-#endif
       }
+#endif
     if (compute_format == DataLayout::kNHWC) {
       TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
@@ -1080,32 +1090,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
+#endif
     }
     if (ddW) {
       ddw = ddW->data<T>();
       args2.handle = handle;
       args2.idesc.set(transformed_X, iwo_group);
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);
       args2.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
       workspace_size = std::max(
           workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2));
+#endif
     }
   }
@@ -1114,21 +1129,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.handle = handle;
       args3.idesc.set(transformed_ddX, iwo_group);
       args3.wdesc.set(*dW, layout, iwo_group);
       args3.odesc.set(transformed_dO_channel, iwo_group);
       args3.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -1143,13 +1160,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(
          workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
@@ -1176,21 +1197,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     if (ddO) {
       if (ddX) {
         ddx = transformed_ddX.data<T>();
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_CUDA_SUCCESS(
-                    platform::dynload::miopenConvolutionForward(
-                        handle, &alpha, args1.idesc.desc(),
-                        ddx + i * group_offset_in, args1.wdesc.desc(),
-                        w + i * group_offset_filter, args1.cdesc.desc(),
-                        fwd_algo1, &beta, args1.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out,
-                        workspace_ptr, workspace_size));
-              },
-              workspace_size);
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::miopenConvolutionForward(
+                      handle, &alpha, args1.idesc.desc(), ddx,
+                      args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
+                      &beta, args1.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
           wkspace_handle.RunFunc(
               [&](void* workspace_ptr) {
                 PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1203,26 +1222,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                         transformed_ddy_channel + i * group_offset_out));
               },
               workspace_size);
-#endif
         }
+#endif
       }
       if (ddW) {
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-          // MIOPEN ONLY support beta to be 0.0f
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_CUDA_SUCCESS(
-                    platform::dynload::miopenConvolutionForward(
-                        handle, &alpha, args2.idesc.desc(),
-                        x + i * group_offset_in, args2.wdesc.desc(),
-                        ddw + i * group_offset_filter, args2.cdesc.desc(),
-                        fwd_algo2, &beta, args2.odesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out,
-                        workspace_ptr, workspace_size));
-              },
-              workspace_size);
+        // MIOPEN ONLY support beta to be 0.0f
+        wkspace_handle.RunFunc(
+            [&](void* workspace_ptr) {
+              PADDLE_ENFORCE_CUDA_SUCCESS(
+                  platform::dynload::miopenConvolutionForward(
+                      handle, &alpha, args2.idesc.desc(), x,
+                      args2.wdesc.desc(), ddw, args2.cdesc.desc(), fwd_algo2,
+                      &beta, args2.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
           wkspace_handle.RunFunc(
               [&](void* workspace_ptr) {
                 PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1235,8 +1252,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                         transformed_ddy_channel + i * group_offset_out));
               },
               workspace_size);
-#endif
         }
+#endif
       }
     if (channel_last) {
       TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
@@ -1246,21 +1263,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     T* transformed_dy_channel = transformed_dO_channel.data<T>();
     if (dW && ddX) {
       ddx = transformed_ddX.data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardWeights(
-                      handle, &alpha, args3.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args3.idesc.desc(), ddx + i * group_offset_in,
-                      args3.cdesc.desc(), filter_algo, &beta,
-                      args3.wdesc.desc(), dw + i * group_offset_filter,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardWeights(
+                    handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
+                    args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
+                    &beta, args3.wdesc.desc(), dw, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1273,27 +1288,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                       dw + i * group_offset_filter));
             },
             workspace_size);
-#endif
       }
+#endif
     }
     if (dX && ddW) {
       ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_CUDA_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardData(
-                      handle, &alpha, args4.odesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args4.wdesc.desc(), ddw + i * group_offset_filter,
-                      args4.cdesc.desc(), data_algo, &beta,
-                      args4.idesc.desc(), transformed_dx + i * group_offset_in,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_CUDA_SUCCESS(
+                platform::dynload::miopenConvolutionBackwardData(
+                    handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
+                    args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
+                    &beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
        wkspace_handle.RunFunc(
            [&](void* workspace_ptr) {
              PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1306,8 +1319,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                      transformed_dx + i * group_offset_in));
            },
            workspace_size);
-#endif
      }
+#endif
      if (!is_sys_pad) {
        // reverse padded input
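In the grouped-convolution paths above, the HIP branches drop the per-group loop: the group count is handed to MIOpen through the convolution descriptor (see miopen_desc.h below), so a single call covers all groups, while the cuDNN fallback keeps iterating groups with explicit pointer offsets. A schematic contrast of the backward-data case, assuming offset_in/offset_out/offset_filter are the per-group strides used in the hunks (not a self-contained program):

    #ifdef PADDLE_WITH_HIP
      // groups was already set on cdesc, so one call processes every group.
      miopenConvolutionBackwardData(handle, &alpha, odesc, dy, wdesc, w, cdesc,
                                    algo, &beta, idesc, dx, workspace, ws_size);
    #else
      for (int g = 0; g < groups; ++g) {  // per-group dispatch with offsets
        cudnnConvolutionBackwardData(handle, &alpha, wdesc,
                                     w + g * offset_filter, odesc,
                                     dy + g * offset_out, cdesc, algo,
                                     workspace, ws_size, &beta, idesc,
                                     dx + g * offset_in);
      }
    #endif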
paddle/fluid/operators/conv_miopen_helper.h

@@ -127,57 +127,52 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool has_got_workspace_size = true;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto& temp = ctx.cuda_device_context();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetForward());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionForwardAlgorithm(
-                    args.handle, args.idesc.desc(), args.x->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                    kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.fwd_algo;
-          }
-          return perf_stat[0].fwd_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.x->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
+              kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.fwd_algo;
+    } else {
+      auto& temp = ctx.cuda_device_context();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetForward());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.fwd_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionForwardGetWorkSpaceSize(
@@ -194,58 +189,51 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardData());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
-                    kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_data_algo;
-          }
-          return perf_stat[0].bwd_data_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
+              kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_data_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardData());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_data_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
@@ -262,56 +250,51 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardFilter());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
-                    args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
-                    kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr,
-                    workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_weights_algo;
-          }
-          return perf_stat[0].bwd_weights_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
+              args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
+              kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_weights_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardFilter());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_weights_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }
-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
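The rewritten Find above has two paths: when neither exhaustive_search nor deterministic is requested, it runs MIOpen's find once per call and takes the first result; otherwise it memoizes one search per problem signature in the process-wide AlgorithmsCache. A toy, self-contained model of that memoization (hypothetical names; not Paddle's actual AlgorithmsCache):

    #include <cstdint>
    #include <functional>
    #include <map>
    #include <tuple>
    #include <vector>

    // Toy model of the AlgorithmsCache pattern used above: run the expensive
    // find exactly once per unique problem signature, then reuse the result.
    using Key = std::tuple<std::vector<int>, std::vector<int>, int64_t>;

    int GetAlgorithm(std::map<Key, int>* cache, const Key& key,
                     const std::function<int()>& search) {
      auto it = cache->find(key);
      if (it != cache->end()) return it->second;  // cache hit: skip the search
      int algo = search();  // e.g. wraps a miopenFindConvolution*Algorithm call
      cache->emplace(key, algo);
      return algo;
    }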
paddle/fluid/operators/conv_transpose_cudnn_op.cu

@@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
+    algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
 #else
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, false, deterministic, ctx);
     workspace_size =
         std::max(workspace_size, search::GetWorkspaceSize(args, algo));
+#endif
     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset =
@@ -504,12 +505,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -522,12 +527,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -942,11 +951,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      bwd_algo1 =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
+#endif
     }
     if (ddW) {
@@ -958,12 +970,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      bwd_algo2 =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2));
+#endif
     }
   }
@@ -978,12 +994,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo =
+          search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -996,12 +1016,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo =
+          search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search4::Find<T>(args4, false, deterministic, ctx);
       workspace_size = std::max(
           workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
paddle/fluid/platform/miopen_desc.h

@@ -199,19 +199,24 @@ class FilterDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    std::vector<int> transformed_dims;
     PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
-    transformed_dims = dims;
-    // if (groups > 1) {
-    //   transformed_dims[1] = transformed_dims[1] / groups;
-    // }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
-        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
-        transformed_dims[0], transformed_dims[1], transformed_dims[2],
-        transformed_dims[3]));
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        (miopenTensorDescriptor_t)(desc_.get()),
+        ToCudnnDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
   }

  private:
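The key change above is that FilterDescriptor::set now describes the filter with explicit dimensions and packed strides through miopenSetTensorDescriptor, dividing dimension 1 by the group count so the group information actually reaches MIOpen (the earlier 4-d descriptor ignored groups, as the commented-out code shows). A self-contained sketch of the stride computation, assuming a packed (contiguous) NCHW layout:

    #include <cassert>
    #include <vector>

    // Packed NCHW strides as computed in FilterDescriptor::set above: the
    // innermost stride is 1, each outer stride is the product of all inner dims.
    std::vector<int> PackedStrides(const std::vector<int>& dims) {
      assert(!dims.empty());
      std::vector<int> strides(dims.size());
      strides[dims.size() - 1] = 1;
      for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
        strides[i] = dims[i + 1] * strides[i + 1];
      }
      return strides;  // e.g. dims {64, 3, 3, 3} -> strides {27, 9, 3, 1}
    }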
python/paddle/fluid/tests/unittests/test_conv2d_op.py

@@ -128,6 +128,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -185,6 +187,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NHWC"
@@ -264,6 +268,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -280,6 +286,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -299,8 +307,7 @@ class TestConv2DOp(OpTest):
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
         self.data_format = "AnyLayout"
-        self.dtype = np.float64
+        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
@@ -693,6 +700,7 @@ class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 class TestConv2DOpError(unittest.TestCase):
@@ -734,8 +742,7 @@ class TestConv2DOp_v2(OpTest):
         self.use_cuda = False
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
-        self.dtype = np.float64
+        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
python/paddle/fluid/tests/unittests/test_conv3d_op.py

@@ -135,6 +135,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -169,6 +171,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -185,6 +189,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -215,6 +221,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NDHWC"
@@ -410,6 +418,7 @@ class TestWithDilation(TestConv3DOp):
 class TestCUDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -431,6 +440,7 @@ class TestFP16CUDNN(TestConv3DOp):
 class TestWithGroup1CUDNN(TestWithGroup1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -452,6 +462,7 @@ class TestFP16WithGroup1CUDNN(TestWithGroup1):
 class TestWithGroup2CUDNN(TestWithGroup2):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -473,6 +484,7 @@ class TestFP16WithGroup2CUDNN(TestWithGroup2):
 class TestWith1x1CUDNN(TestWith1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -494,6 +506,7 @@ class TestFP16With1x1CUDNN(TestWith1x1):
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -514,6 +527,7 @@ class TestCUDNNExhaustiveSearch(TestCUDNN):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64


 # ---- test asymmetric padding ----
python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py

@@ -50,7 +50,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
     def setUp(self):
         """Setup."""
         #self.dtype = np.float32
-        self.dtype = np.float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.N = 8
         self.C = 16
         self.H = 32
@@ -92,7 +92,10 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
                 moving_variance_name='bn_moving_variance',
                 data_layout=layout,
                 is_test=only_forward)
-            bn = fluid.layers.cast(bn, 'float64')
+            if core.is_compiled_with_rocm():
+                bn = fluid.layers.cast(bn, 'float32')
+            else:
+                bn = fluid.layers.cast(bn, 'float64')
             sigmoid = fluid.layers.sigmoid(bn)
             out = fluid.layers.reduce_sum(sigmoid)
             if not sync_bn: