Unverified commit 224bc511, authored by crystal, committed by GitHub

Modify the unsqueeze dimension of input data in conv1d NCL and NLC formats (#38425)

* optimize conv1d forward

* add conv opt

* Optimize memory copy

* delete share data with

* set num_filters=512

* add nlc optimize

* Optimize num_filter=512 data on A100 and V100

* Fix the workspace_size size setting of filter
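The core of the change: conv1d is lowered to conv2d by inserting a dummy size-1 spatial dimension, and this commit moves that dummy dimension from the width axis to the height axis for both NCL and NLC layouts. A minimal sketch of the resulting shapes (batch/channel/length sizes are made up for illustration):

```python
def unsqueeze_shape(shape, axis):
    """Insert a size-1 dimension at `axis`, as a tensor unsqueeze would."""
    shape = list(shape)
    if axis < 0:
        axis += len(shape) + 1
    return shape[:axis] + [1] + shape[axis:]

x_ncl = [8, 16, 32]  # NCL: [batch, channels, length]
x_nlc = [8, 32, 16]  # NLC: [batch, length, channels]

# Before this commit: dummy width appended after the length axis.
print(unsqueeze_shape(x_ncl, -1))  # [8, 16, 32, 1] -> NCHW with W == 1
print(unsqueeze_shape(x_nlc, -2))  # [8, 32, 1, 16] -> NHWC with W == 1

# After this commit: dummy height inserted before the length axis.
print(unsqueeze_shape(x_ncl, -2))  # [8, 16, 1, 32] -> NCHW with H == 1
print(unsqueeze_shape(x_nlc, -3))  # [8, 1, 32, 16] -> NHWC with H == 1
```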
上级 59c7aea5
@@ -638,7 +638,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo =
         static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
 #endif
-    size_t workspace_size = 0;
+    // input data workspace_size
+    size_t workspace_size_d = 0;
+    // weight workspace_size
+    size_t workspace_size_w = 0;
     int iwo_groups = groups;
     int c_groups = 1;
@@ -661,16 +664,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      workspace_size_d =
+          std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
       data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
-                                   workspace_size, ctx);
+                                   workspace_size_d, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+      workspace_size_d = std::max(workspace_size_d,
+                                  search1::GetWorkspaceSize(args1, data_algo));
 #endif
     }
@@ -686,16 +689,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
           platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      workspace_size_w =
+          std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
       filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
-                                     workspace_size, ctx);
+                                     workspace_size_w, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
+      workspace_size_w = std::max(
+          workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
 #endif
     }
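Splitting the single workspace_size into workspace_size_d (backward data) and workspace_size_w (backward filter) means each RunFunc call below requests only the workspace its own pass needs, rather than the maximum of the two requirements. A toy illustration with made-up byte counts:

```python
ws_data = 64 << 20   # hypothetical backward-data workspace: 64 MiB
ws_filter = 8 << 20  # hypothetical backward-filter workspace: 8 MiB

# Before: one shared size, so both passes request max(ws_data, ws_filter).
shared = max(ws_data, ws_filter)
print(shared >> 20, shared >> 20)      # 64 64 -> filter pass over-requests

# After: each pass requests exactly its own requirement.
print(ws_data >> 20, ws_filter >> 20)  # 64 8
```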
@@ -726,9 +729,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   handle, &alpha, args1.odesc.desc(), output_grad_data,
                   args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                   data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
-                  cudnn_workspace_ptr, workspace_size));
+                  cudnn_workspace_ptr, workspace_size_d));
             },
-            workspace_size);
+            workspace_size_d);
         PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
             handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
             transformed_input_grad_data, &alpha, args1.idesc.desc(),
@@ -743,9 +746,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                   data_algo, &beta, args1.idesc.desc(),
                   transformed_input_grad_data, cudnn_workspace_ptr,
-                  workspace_size));
+                  workspace_size_d));
             },
-            workspace_size);
+            workspace_size_d);
       }
 #else
@@ -758,10 +761,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   filter_data + i * group_offset_filter, args1.odesc.desc(),
                   output_grad_data + i * group_offset_out,
                   args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
-                  workspace_size, &beta, args1.idesc.desc(),
+                  workspace_size_d, &beta, args1.idesc.desc(),
                   transformed_input_grad_data + i * group_offset_in));
             },
-            workspace_size);
+            workspace_size_d);
       }
 #endif
     if (!is_sys_pad) {
@@ -804,9 +807,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 handle, &alpha, args2.odesc.desc(), output_grad_data,
                 args2.idesc.desc(), input_data, args2.cdesc.desc(),
                 filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
-                cudnn_workspace_ptr, workspace_size));
+                cudnn_workspace_ptr, workspace_size_w));
           },
-          workspace_size);
+          workspace_size_w);
 #else
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
@@ -817,10 +820,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   input_data + i * group_offset_in, args2.odesc.desc(),
                   output_grad_data + i * group_offset_out,
                   args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                  workspace_size, &beta_filter, args2.wdesc.desc(),
+                  workspace_size_w, &beta_filter, args2.wdesc.desc(),
                   filter_grad_data + i * group_offset_filter));
           },
-          workspace_size);
+          workspace_size_w);
       }
 #endif
...
@@ -165,11 +165,20 @@ def add_cases(suite):
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2]))
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', padding=[1, 2], data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
@@ -204,6 +213,12 @@ def add_error_cases(suite):
     suite.addTest(
         Conv1DErrorTestCase(
             methodName='runTest', padding=[1, 2, 3, 4, 5]))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5], data_format='NLC'))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2, 3, 4, 5]))
     suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10))
...
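The new cases with padding=[1, 2] exercise asymmetric padding along the length axis. As a sanity check on what they compute, the expected conv1d output length follows the standard formula; a small sketch with made-up sizes:

```python
# conv1d output length: L_out = (L + p0 + p1 - d*(k - 1) - 1) // s + 1
L, k, s, d = 32, 3, 1, 1   # length, kernel size, stride, dilation (assumed)
p0, p1 = 1, 2              # the asymmetric padding from the new cases
print((L + p0 + p1 - d * (k - 1) - 1) // s + 1)  # 33
```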
@@ -326,21 +326,24 @@ def conv1d(x,
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
     if len(padding) == 2:
-        padding = padding + [0] * 2
+        padding = [0] * 2 + padding
     elif len(padding) == 1:
-        padding = padding + [0]
+        padding = [0] + padding
     else:
         raise ValueError(
             "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
-    stride = convert_to_list(stride, 1, 'stride') + [1]
-    dilation = convert_to_list(dilation, 1, 'dilation') + [1]
+    stride = [1] + convert_to_list(stride, 1, 'stride')
+    dilation = [1] + convert_to_list(dilation, 1, 'dilation')
+    weight = unsqueeze(weight, axis=[-2])
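With the dummy dimension now on the height axis, every conv2d attribute gains its extra entry at the front rather than the back, and the weight's dummy axis moves accordingly. A sketch of the resulting attribute values (sizes are made up):

```python
stride_1d, padding_1d = 2, [1, 2]

# Before: extra entry appended (dummy width last).
old_stride = [stride_1d, 1]         # [s_length, s_dummy]
old_padding = padding_1d + [0] * 2  # [1, 2, 0, 0]

# After: extra entry prepended (dummy height first).
new_stride = [1, stride_1d]         # [s_dummy, s_length]
new_padding = [0] * 2 + padding_1d  # [0, 0, 1, 2]

# Weight: [out_c, in_c // groups, k] -unsqueeze(axis=-2)-> [out_c, in_c // groups, 1, k]
print(old_stride, new_stride, old_padding, new_padding)
```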
l_type = "conv2d" l_type = "conv2d"
if (num_channels == groups and num_channels != 1 and
num_filters % num_channels == 0 and not use_cudnn): # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance
if (core.is_compiled_with_cuda() and num_channels == groups and
num_channels != 1 and num_filters % num_channels == 0):
l_type = 'depthwise_conv2d' l_type = 'depthwise_conv2d'
use_cudnn = False use_cudnn = False
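The dispatch is also tightened: depthwise_conv2d is now selected only in CUDA builds (core.is_compiled_with_cuda()), and the old not use_cudnn guard is dropped, per the new comment that depthwise is faster when the shape condition holds. A small truth-check of the shape condition alone (hypothetical sizes):

```python
num_channels, groups, num_filters = 16, 16, 32
is_depthwise = (num_channels == groups and num_channels != 1
                and num_filters % num_channels == 0)
print(is_depthwise)  # True -> l_type would become 'depthwise_conv2d'
```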
@@ -351,9 +354,9 @@ def conv1d(x,
     else:
         l_type = 'conv2d'

-    squeeze_aixs = -2 if channel_last else -1
+    squeeze_aixs = -3 if channel_last else -2
     x = unsqueeze(x, axis=[squeeze_aixs])
-    weight = unsqueeze(weight, axis=[-1])

     if in_dygraph_mode():
         attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
...
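End to end, the public behavior is unchanged: both layouts still go through paddle.nn.functional.conv1d. An illustrative call with NLC input and the asymmetric padding the new tests cover (requires a Paddle install; all shapes are made up):

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([8, 32, 16])   # NLC: [batch, length, in_channels]
w = paddle.randn([512, 16, 3])  # [out_channels, in_channels // groups, kernel_size]

y = F.conv1d(x, w, padding=[1, 2], data_format='NLC')
print(y.shape)  # [8, 33, 512]: length 32 + 1 + 2 - (3 - 1) = 33
```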