Unverified commit 3d5aa9d1, authored by Qi Li, committed by GitHub

[ROCM] fix conv2d and conv3d op, test=develop (#31553)

Parent f302bb4f
@@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     args.handle = handle;
 #ifdef PADDLE_WITH_HIP
+    // MIOPEN need to set groups in cdesc in miopen_desc.h
     args.cdesc.set(dtype, padding_common, strides, dilations,
                    platform::AllowTF32Cudnn(), groups);
 #else
@@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
                                                          groups));
     groups = 1;
+#endif
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN do not set groups in wdesc after set groups in cdesc
+    groups = 1;
 #endif
     args.idesc.set(transformed_input, layout_format);
     args.wdesc.set(transformed_filter_channel, layout_format, groups);
@@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     miopenConvFwdAlgorithm_t algo{};
     using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+    workspace_size = search::GetWorkspaceSize(args);
+    algo = search::Find<T>(args, exhaustive_search, false, workspace_size, ctx);
 #else
     cudnnConvolutionFwdAlgo_t algo{};
     using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, exhaustive_search, false, ctx);
     workspace_size = search::GetWorkspaceSize(args, algo);
+#endif
 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
     // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
@@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -673,13 +684,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -688,23 +703,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
     if (input_grad) {
       // When beta is 0, it is unnecessary to reset input_grad.
       // When beta is 1, the output cannot be reset since addt strategy used.
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardData(
-                    handle, &alpha, args1.odesc.desc(),
-                    output_grad_data + i * group_offset_out,
-                    args1.wdesc.desc(), filter_data + i * group_offset_filter,
-                    args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(),
-                    transformed_input_grad_data + i * group_offset_in,
-                    cudnn_workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args1.odesc.desc(), output_grad_data,
+                    args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
+                    data_algo, &beta, args1.idesc.desc(),
+                    transformed_input_grad_data, cudnn_workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -717,9 +731,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     transformed_input_grad_data + i * group_offset_in));
           },
           workspace_size);
-#endif
       }
+#endif
     if (!is_sys_pad) {
       std::vector<int> starts(transformed_input_channel.dims().size(), 0);
       std::vector<int> axes(transformed_input_channel.dims().size(), 0);
@@ -751,23 +764,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       // Because beta is zero, it is unnecessary to reset filter_grad.
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args2.odesc.desc(),
-                    output_grad_data + i * group_offset_out,
-                    args2.idesc.desc(), input_data + i * group_offset_in,
-                    args2.cdesc.desc(), filter_algo, &beta,
-                    args2.wdesc.desc(),
-                    filter_grad_data + i * group_offset_filter,
-                    cudnn_workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args2.odesc.desc(), output_grad_data,
+                    args2.idesc.desc(), input_data, args2.cdesc.desc(),
+                    filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
+                    cudnn_workspace_ptr, workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -780,8 +790,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     filter_grad_data + i * group_offset_filter));
           },
           workspace_size);
-#endif
       }
+#endif
     if (compute_format == DataLayout::kNHWC) {
       TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
@@ -1080,32 +1090,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
+#endif
     }
     if (ddW) {
       ddw = ddW->data<T>();
       args2.handle = handle;
       args2.idesc.set(transformed_X, iwo_group);
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);
       args2.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, fwd_algo2));
+#endif
     }
   }
@@ -1114,21 +1129,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.handle = handle;
       args3.idesc.set(transformed_ddX, iwo_group);
       args3.wdesc.set(*dW, layout, iwo_group);
       args3.odesc.set(transformed_dO_channel, iwo_group);
       args3.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -1143,13 +1160,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
@@ -1176,21 +1197,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     if (ddO) {
       if (ddX) {
         ddx = transformed_ddX.data<T>();
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
                   platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args1.idesc.desc(),
-                      ddx + i * group_offset_in, args1.wdesc.desc(),
-                      w + i * group_offset_filter, args1.cdesc.desc(),
-                      fwd_algo1, &beta, args1.odesc.desc(),
-                      transformed_ddy_channel + i * group_offset_out,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+                      handle, &alpha, args1.idesc.desc(), ddx,
+                      args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
+                      &beta, args1.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1203,26 +1222,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                       transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
-#endif
         }
+#endif
       }
       if (ddW) {
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
         // MIOPEN ONLY support beta to be 0.0f
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
                   platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args2.idesc.desc(),
-                      x + i * group_offset_in, args2.wdesc.desc(),
-                      ddw + i * group_offset_filter, args2.cdesc.desc(),
-                      fwd_algo2, &beta, args2.odesc.desc(),
-                      transformed_ddy_channel + i * group_offset_out,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+                      handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(),
+                      ddw, args2.cdesc.desc(), fwd_algo2, &beta,
+                      args2.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1235,8 +1252,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                      transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
-#endif
         }
+#endif
       }
     if (channel_last) {
       TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
@@ -1246,21 +1263,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     T* transformed_dy_channel = transformed_dO_channel.data<T>();
     if (dW && ddX) {
       ddx = transformed_ddX.data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args3.odesc.desc(),
-                    transformed_dy_channel + i * group_offset_out,
-                    args3.idesc.desc(), ddx + i * group_offset_in,
-                    args3.cdesc.desc(), filter_algo, &beta,
-                    args3.wdesc.desc(), dw + i * group_offset_filter,
-                    workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
+                    args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
+                    &beta, args3.wdesc.desc(), dw, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1273,27 +1288,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                     dw + i * group_offset_filter));
           },
           workspace_size);
-#endif
       }
+#endif
     }
     if (dX && ddW) {
       ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardData(
-                    handle, &alpha, args4.odesc.desc(),
-                    transformed_dy_channel + i * group_offset_out,
-                    args4.wdesc.desc(), ddw + i * group_offset_filter,
-                    args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(),
-                    transformed_dx + i * group_offset_in, workspace_ptr,
-                    workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
+                    args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
+                    &beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1306,8 +1319,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                     transformed_dx + i * group_offset_in));
           },
           workspace_size);
-#endif
       }
+#endif
     if (!is_sys_pad) {
       // reverse padded input
......
@@ -127,57 +127,52 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool has_got_workspace_size = true;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto& temp = ctx.cuda_device_context();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetForward());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionForwardAlgorithm(
-                    args.handle, args.idesc.desc(), args.x->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                    kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_stat.data(),
-                    cudnn_workspace_ptr, workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.fwd_algo;
-          }
-          return perf_stat[0].fwd_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.x->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
+              kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.fwd_algo;
+    } else {
+      auto& temp = ctx.cuda_device_context();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetForward());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.fwd_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionForwardGetWorkSpaceSize(
@@ -194,58 +189,51 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
    algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardData());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
-                    kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr, workspace_size_limit,
-                    false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_data_algo;
-          }
-          return perf_stat[0].bwd_data_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
+              kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_data_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardData());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_data_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
@@ -262,56 +250,51 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardFilter());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::
-                    miopenFindConvolutionBackwardWeightsAlgorithm(
-                        args.handle, args.odesc.desc(), args.o->data<T>(),
-                        args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
-                        args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
-                        kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                        perf_stat.data(), cudnn_workspace_ptr,
-                        workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_weights_algo;
-          }
-          return perf_stat[0].bwd_weights_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
+              args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
+              kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_weights_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardFilter());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_weights_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
......
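The three rewritten MIOpen Find() overloads above share one control-flow pattern: when neither exhaustive_search nor deterministic is requested, the result of the immediate miopenFind* call is used directly; otherwise the lookup is routed through the cached AlgorithmsCache path. The standalone toy program below sketches only that decision; every type and name in it (ToyAlgoCache, run_miopen_find, the shape key) is an illustrative stand-in, not Paddle or MIOpen code.

// algo_find_flow_demo.cc -- illustrative sketch only; not Paddle code.
#include <functional>
#include <iostream>
#include <map>
#include <string>

using Algo = int;

// Stand-in for the framework's AlgorithmsCache: memoizes by a shape key.
struct ToyAlgoCache {
  std::map<std::string, Algo> cache_;
  Algo GetAlgorithm(const std::string& key, const std::function<Algo()>& gen) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;
    return cache_[key] = gen();
  }
};

Algo Find(bool exhaustive_search, bool deterministic, ToyAlgoCache* cache) {
  // Stand-in for running the miopenFindConvolution*Algorithm call.
  auto run_miopen_find = [] { return Algo{3}; };
  if (!exhaustive_search && !deterministic) {
    return run_miopen_find();  // use the fresh find result directly
  }
  // exhaustive / deterministic requests go through the algorithm cache
  return cache->GetAlgorithm("x_dims/w_dims/s/p/d", run_miopen_find);
}

int main() {
  ToyAlgoCache cache;
  std::cout << Find(false, false, &cache) << "\n";  // immediate path
  std::cout << Find(true, false, &cache) << "\n";   // cached path
}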
@@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
+    algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
 #else
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, false, deterministic, ctx);
     workspace_size =
         std::max(workspace_size, search::GetWorkspaceSize(args, algo));
+#endif
     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset =
@@ -504,12 +505,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -522,12 +527,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -942,11 +951,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      bwd_algo1 =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
+#endif
     }
     if (ddW) {
@@ -958,12 +970,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      bwd_algo2 =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, bwd_algo2));
+#endif
     }
   }
@@ -978,12 +994,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo =
+          search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -996,12 +1016,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo =
+          search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search4::Find<T>(args4, false, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
......
@@ -199,19 +199,24 @@ class FilterDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    std::vector<int> transformed_dims;
     PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
-    transformed_dims = dims;
-    // if (groups > 1) {
-    //   transformed_dims[1] = transformed_dims[1] / groups;
-    // }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
-        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
-        transformed_dims[0], transformed_dims[1], transformed_dims[2],
-        transformed_dims[3]));
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
   }

  private:
......
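For reference, the stride and grouped-dims arithmetic that the reworked FilterDescriptor::set performs can be reproduced in isolation. The snippet below is a standalone sketch using a made-up 4-D filter shape {64, 6, 3, 3} and groups = 2; it simply mirrors the loop in the hunk above and is not Paddle code.

// filter_desc_dims_demo.cc -- illustrative sketch only; not Paddle code.
#include <iostream>
#include <vector>

int main() {
  std::vector<int> dims = {64, 6, 3, 3};  // hypothetical NCHW filter shape
  int groups = 2;

  // Packed (contiguous) strides, innermost dimension last.
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }

  // Dimension 1 is divided by groups before the descriptor is set,
  // as in the FilterDescriptor::set hunk above.
  std::vector<int> dims_with_group(dims.begin(), dims.end());
  if (groups > 1) {
    dims_with_group[1] = dims_with_group[1] / groups;
  }

  for (int d : dims_with_group) std::cout << d << ' ';  // prints: 64 3 3 3
  std::cout << '\n';
  for (int s : strides) std::cout << s << ' ';          // prints: 54 9 3 1
  std::cout << '\n';
}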
@@ -128,6 +128,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -185,6 +187,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NHWC"
@@ -264,6 +268,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -280,6 +286,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -299,8 +307,7 @@ class TestConv2DOp(OpTest):
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
         self.data_format = "AnyLayout"
-        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
-        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+        self.dtype = np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
@@ -693,6 +700,7 @@ class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 class TestConv2DOpError(unittest.TestCase):
@@ -734,8 +742,7 @@ class TestConv2DOp_v2(OpTest):
         self.use_cuda = False
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
-        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
-        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+        self.dtype = np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
......
@@ -135,6 +135,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -169,6 +171,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -185,6 +189,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -215,6 +221,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NDHWC"
@@ -410,6 +418,7 @@ class TestWithDilation(TestConv3DOp):
 class TestCUDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -431,6 +440,7 @@ class TestFP16CUDNN(TestConv3DOp):
 class TestWithGroup1CUDNN(TestWithGroup1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -452,6 +462,7 @@ class TestFP16WithGroup1CUDNN(TestWithGroup1):
 class TestWithGroup2CUDNN(TestWithGroup2):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -473,6 +484,7 @@ class TestFP16WithGroup2CUDNN(TestWithGroup2):
 class TestWith1x1CUDNN(TestWith1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -494,6 +506,7 @@ class TestFP16With1x1CUDNN(TestWith1x1):
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -514,6 +527,7 @@ class TestCUDNNExhaustiveSearch(TestCUDNN):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 # ---- test asymmetric padding ----
......
@@ -50,7 +50,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
     def setUp(self):
         """Setup."""
         #self.dtype = np.float32
-        self.dtype = np.float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.N = 8
         self.C = 16
         self.H = 32
@@ -92,7 +92,10 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
             moving_variance_name='bn_moving_variance',
             data_layout=layout,
             is_test=only_forward)
-        bn = fluid.layers.cast(bn, 'float64')
+        if core.is_compiled_with_rocm():
+            bn = fluid.layers.cast(bn, 'float32')
+        else:
+            bn = fluid.layers.cast(bn, 'float64')
         sigmoid = fluid.layers.sigmoid(bn)
         out = fluid.layers.reduce_sum(sigmoid)
         if not sync_bn:
......