Unverified commit 224bc511 authored by crystal, committed by GitHub

Modify the unsqueeze dimension of input data in conv1d NCL and NLC formats (#38425)

* optimize conv1d forward

* add conv opt

* Optimize memory copy

* delete share data with

* set num_filters=512

* add NLC optimization

* Optimize the num_filters=512 case on A100 and V100

* Fix the workspace_size setting of the filter
Parent 59c7aea5
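The gist of the change, before the diffs: conv1d is lowered onto conv2d by unsqueezing the input to 4-D, and this commit moves the dummy spatial dimension from last place (W) to the slot before the length (H), so conv2d sees (N, C, 1, L) instead of (N, C, L, 1). A minimal NumPy sketch of the two layouts (shapes are illustrative, not taken from the PR):

```python
import numpy as np

x_ncl = np.zeros((8, 3, 100), dtype=np.float32)  # NCL: (batch, channels, length)

# Before: dummy dim appended last -> NCHW with H = L, W = 1
x_old = np.expand_dims(x_ncl, axis=-1)           # (8, 3, 100, 1)

# After: dummy dim inserted before L -> NCHW with H = 1, W = L
x_new = np.expand_dims(x_ncl, axis=-2)           # (8, 3, 1, 100)

# NLC input gets the same treatment one axis earlier:
x_nlc = np.zeros((8, 100, 3), dtype=np.float32)  # NLC: (batch, length, channels)
x_nlc_new = np.expand_dims(x_nlc, axis=-3)       # (8, 1, 100, 3) = NHWC, H = 1
```

Stride, padding, dilation, and the weight's dummy kernel dimension move with it, e.g. stride `[s, 1]` becomes `[1, s]` and the weight is unsqueezed at axis `-2` instead of `-1` (see the conv.py hunk below).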
@@ -638,7 +638,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo =
         static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
 #endif
-    size_t workspace_size = 0;
+    // input data workspace_size
+    size_t workspace_size_d = 0;
+    // weight workspace_size
+    size_t workspace_size_w = 0;
     int iwo_groups = groups;
     int c_groups = 1;
@@ -661,16 +664,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      workspace_size_d =
+          std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
       data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
-                                   workspace_size, ctx);
+                                   workspace_size_d, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+      workspace_size_d = std::max(workspace_size_d,
+                                  search1::GetWorkspaceSize(args1, data_algo));
 #endif
     }
@@ -686,16 +689,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
           platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      workspace_size_w =
+          std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
       filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
-                                     workspace_size, ctx);
+                                     workspace_size_w, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
+      workspace_size_w = std::max(
+          workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
 #endif
     }
@@ -726,9 +729,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 handle, &alpha, args1.odesc.desc(), output_grad_data,
                 args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                 data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
-                cudnn_workspace_ptr, workspace_size));
+                cudnn_workspace_ptr, workspace_size_d));
           },
-          workspace_size);
+          workspace_size_d);
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
           handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
           transformed_input_grad_data, &alpha, args1.idesc.desc(),
@@ -743,9 +746,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                 data_algo, &beta, args1.idesc.desc(),
                 transformed_input_grad_data, cudnn_workspace_ptr,
-                workspace_size));
+                workspace_size_d));
           },
-          workspace_size);
+          workspace_size_d);
     }
 #else
@@ -758,10 +761,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   filter_data + i * group_offset_filter, args1.odesc.desc(),
                   output_grad_data + i * group_offset_out,
                   args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
-                  workspace_size, &beta, args1.idesc.desc(),
+                  workspace_size_d, &beta, args1.idesc.desc(),
                   transformed_input_grad_data + i * group_offset_in));
             },
-            workspace_size);
+            workspace_size_d);
       }
 #endif
     if (!is_sys_pad) {
@@ -804,9 +807,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 handle, &alpha, args2.odesc.desc(), output_grad_data,
                 args2.idesc.desc(), input_data, args2.cdesc.desc(),
                 filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
-                cudnn_workspace_ptr, workspace_size));
+                cudnn_workspace_ptr, workspace_size_w));
           },
-          workspace_size);
+          workspace_size_w);
 #else
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
@@ -817,10 +820,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                   input_data + i * group_offset_in, args2.odesc.desc(),
                   output_grad_data + i * group_offset_out,
                   args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                  workspace_size, &beta_filter, args2.wdesc.desc(),
+                  workspace_size_w, &beta_filter, args2.wdesc.desc(),
                   filter_grad_data + i * group_offset_filter));
             },
-            workspace_size);
+            workspace_size_w);
       }
 #endif
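The hunks above split the single shared workspace_size into workspace_size_d (backward data) and workspace_size_w (backward filter), so each cuDNN/MIOpen call gets a workspace sized for its own algorithm; per the commit message, the shared variable had mis-sized the filter workspace. A toy Python sketch of the sizing logic, with made-up byte counts:

```python
# Toy illustration (numbers are invented) of the workspace sizing change.
data_ws = 1 << 20    # bytes the chosen backward-data algorithm needs
filter_ws = 1 << 24  # bytes the chosen backward-filter algorithm needs

# Before: one size grown across both searches, so each call was handed
# a buffer sized for the larger of the two paths.
workspace_size = 0
workspace_size = max(workspace_size, data_ws)
workspace_size = max(workspace_size, filter_ws)
assert workspace_size == filter_ws  # the data path over-allocates here

# After: independent sizes, each passed only to its own call.
workspace_size_d = max(0, data_ws)    # used by cudnnConvolutionBackwardData
workspace_size_w = max(0, filter_ws)  # used by cudnnConvolutionBackwardFilter
```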
......
@@ -165,11 +165,20 @@ def add_cases(suite):
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2]))
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', padding=[1, 2], data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
@@ -204,6 +213,12 @@ def add_error_cases(suite):
     suite.addTest(
         Conv1DErrorTestCase(
             methodName='runTest', padding=[1, 2, 3, 4, 5]))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5], data_format='NLC'))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2, 3, 4, 5]))
     suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10))
......
@@ -326,21 +326,24 @@ def conv1d(x,
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
     if len(padding) == 2:
-        padding = padding + [0] * 2
+        padding = [0] * 2 + padding
     elif len(padding) == 1:
-        padding = padding + [0]
+        padding = [0] + padding
     else:
         raise ValueError(
             "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
-    stride = convert_to_list(stride, 1, 'stride') + [1]
-    dilation = convert_to_list(dilation, 1, 'dilation') + [1]
+    stride = [1] + convert_to_list(stride, 1, 'stride')
+    dilation = [1] + convert_to_list(dilation, 1, 'dilation')
+    weight = unsqueeze(weight, axis=[-2])
     l_type = "conv2d"
-    if (num_channels == groups and num_channels != 1 and
-            num_filters % num_channels == 0 and not use_cudnn):
+    # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance
+    if (core.is_compiled_with_cuda() and num_channels == groups and
+            num_channels != 1 and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
@@ -351,9 +354,9 @@ def conv1d(x,
     else:
         l_type = 'conv2d'
-    squeeze_aixs = -2 if channel_last else -1
+    squeeze_aixs = -3 if channel_last else -2
     x = unsqueeze(x, axis=[squeeze_aixs])
-    weight = unsqueeze(weight, axis=[-1])
     if in_dygraph_mode():
         attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
......
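For reference, a hypothetical end-to-end call exercising the paths the new tests cover (NLC layout with asymmetric list padding and many filters); shapes and values here are illustrative only:

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([4, 32, 16])    # NLC: (batch, length, in_channels)
w = paddle.randn([512, 16, 3])   # (out_channels, in_channels, kernel_size)

# Asymmetric list padding in NLC format, matching one of the added test cases.
y = F.conv1d(x, w, padding=[1, 2], data_format='NLC')
print(y.shape)                   # [4, 33, 512]: L_out = 32 + 1 + 2 - 3 + 1
```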