From 224bc5113b9ef7f2dcc2ab07526a72533f2586fb Mon Sep 17 00:00:00 2001
From: crystal <62974595+Zjq9409@users.noreply.github.com>
Date: Thu, 10 Feb 2022 13:53:01 +0800
Subject: [PATCH] Modify the unsqueeze dimension of input data in conv1d NCL
 and NLC formats (#38425)

* optimize conv1d forward
* add conv optimization
* Optimize memory copy
* delete ShareDataWith
* set num_filters=512
* add NLC optimization
* Optimize num_filters=512 case on A100 and V100
* Fix the workspace_size setting of the filter
---
 paddle/fluid/operators/conv_cudnn_op.cu      | 45 ++++++++++---------
 .../tests/unittests/test_conv1d_layer.py     | 15 +++++++
 python/paddle/nn/functional/conv.py          | 21 +++++----
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index cbe78d9a25..20b1afb42f 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -638,7 +638,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo =
         static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
 #endif
-    size_t workspace_size = 0;
+    // input data workspace_size
+    size_t workspace_size_d = 0;
+    // weight workspace_size
+    size_t workspace_size_w = 0;
     int iwo_groups = groups;
     int c_groups = 1;
@@ -661,16 +664,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      workspace_size_d =
+          std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
       data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
-                                   workspace_size, ctx);
+                                   workspace_size_d, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+      workspace_size_d = std::max(workspace_size_d,
+                                  search1::GetWorkspaceSize(args1, data_algo));
 #endif
     }
@@ -686,16 +689,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                           platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      workspace_size_w =
+          std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
       filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
-                                     workspace_size, ctx);
+                                     workspace_size_w, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
+      workspace_size_w = std::max(
+          workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
 #endif
     }
@@ -726,9 +729,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                         handle, &alpha, args1.odesc.desc(), output_grad_data,
                         args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                         data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
-                        cudnn_workspace_ptr, workspace_size));
+                        cudnn_workspace_ptr, workspace_size_d));
               },
-              workspace_size);
+              workspace_size_d);
           PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
               handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
               transformed_input_grad_data, &alpha, args1.idesc.desc(),
@@ -743,9 +746,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                         args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                         data_algo, &beta, args1.idesc.desc(),
                         transformed_input_grad_data, cudnn_workspace_ptr,
-                        workspace_size));
+                        workspace_size_d));
               },
-              workspace_size);
+              workspace_size_d);
     }
 #else
@@ -758,10 +761,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     filter_data + i * group_offset_filter, args1.odesc.desc(),
                     output_grad_data + i * group_offset_out,
                     args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
-                    workspace_size, &beta, args1.idesc.desc(),
+                    workspace_size_d, &beta, args1.idesc.desc(),
                     transformed_input_grad_data + i * group_offset_in));
               },
-              workspace_size);
+              workspace_size_d);
       }
 #endif
     if (!is_sys_pad) {
@@ -804,9 +807,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     handle, &alpha, args2.odesc.desc(), output_grad_data,
                     args2.idesc.desc(), input_data, args2.cdesc.desc(),
                     filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
-                    cudnn_workspace_ptr, workspace_size));
+                    cudnn_workspace_ptr, workspace_size_w));
               },
-              workspace_size);
+              workspace_size_w);
 #else
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
@@ -817,10 +820,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     input_data + i * group_offset_in, args2.odesc.desc(),
                     output_grad_data + i * group_offset_out,
                     args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                    workspace_size, &beta_filter, args2.wdesc.desc(),
+                    workspace_size_w, &beta_filter, args2.wdesc.desc(),
                     filter_grad_data + i * group_offset_filter));
               },
-              workspace_size);
+              workspace_size_w);
       }
 #endif
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index 715579c332..dc460cb16f 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -165,11 +165,20 @@ def add_cases(suite):
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2]))
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', padding=[1, 2], data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
@@ -204,6 +213,12 @@ def add_error_cases(suite):
     suite.addTest(
         Conv1DErrorTestCase(
             methodName='runTest', padding=[1, 2, 3, 4, 5]))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5], data_format='NLC'))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2, 3, 4, 5]))
     suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10))
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index c124ed003d..31cb91bc93 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -326,21 +326,24 @@ def conv1d(x,
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+
     if len(padding) == 2:
-        padding = padding + [0] * 2
+        padding = [0] * 2 + padding
     elif len(padding) == 1:
-        padding = padding + [0]
+        padding = [0] + padding
     else:
         raise ValueError(
             "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
-
-    stride = convert_to_list(stride, 1, 'stride') + [1]
-    dilation = convert_to_list(dilation, 1, 'dilation') + [1]
+    stride = [1] + convert_to_list(stride, 1, 'stride')
+    dilation = [1] + convert_to_list(dilation, 1, 'dilation')
+    weight = unsqueeze(weight, axis=[-2])
 
     l_type = "conv2d"
-    if (num_channels == groups and num_channels != 1 and
-            num_filters % num_channels == 0 and not use_cudnn):
+
+    # When groups == num_channels and num_filters % num_channels == 0, using depthwise_conv2d has better performance
+    if (core.is_compiled_with_cuda() and num_channels == groups and
+            num_channels != 1 and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
@@ -351,9 +354,9 @@ def conv1d(x,
         else:
             l_type = 'conv2d'
 
-    squeeze_aixs = -2 if channel_last else -1
+    squeeze_aixs = -3 if channel_last else -2
     x = unsqueeze(x, axis=[squeeze_aixs])
-    weight = unsqueeze(weight, axis=[-1])
+
     if in_dygraph_mode():
         attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
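
Editor's note (not part of the patch): the conv.py hunks are the heart of this change. conv1d is lowered onto conv2d by unsqueezing a dummy spatial axis, and the patch moves that dummy axis from the trailing W position to the leading H position: the identity values for padding, stride, and dilation are now prepended rather than appended, the weight gains its dummy axis at -2, and the input is unsqueezed at -3 (NLC) or -2 (NCL), presumably so the real length dimension stays in the W slot that the A100/V100 measurements in the commit message target. On the C++ side, the gradient kernel now tracks separate workspace sizes for the data path (workspace_size_d) and the filter path (workspace_size_w) instead of one shared maximum. The self-contained Python sketch below mirrors the attribute mapping; the helper name conv1d_to_conv2d_attrs is invented for illustration and is not a Paddle API.

def conv1d_to_conv2d_attrs(padding, stride, dilation, channel_last):
    """Map conv1d attrs to conv2d attrs the way the patched conv.py does:
    the dummy H axis comes first, so identity entries are prepended."""
    # Zero padding on the dummy H axis, mirroring the two branches in
    # conv.py: [0] + padding for len 1, [0, 0] + padding for len 2.
    padding2d = [0] * len(padding) + list(padding)
    stride2d = [1] + [stride]        # unit stride on the dummy H axis
    dilation2d = [1] + [dilation]    # unit dilation on the dummy H axis
    # Input: NCL -> NC1L (unsqueeze at -2); NLC -> N1LC (unsqueeze at -3).
    x_axis = -3 if channel_last else -2
    # Filter: [Cout, Cin // groups, K] -> [Cout, Cin // groups, 1, K].
    w_axis = -2
    return padding2d, stride2d, dilation2d, x_axis, w_axis

# Example: an NLC conv1d with padding=[1, 2], stride=2, dilation=1 becomes
# a conv2d with paddings [0, 0, 1, 2], strides [1, 2], dilations [1, 1],
# the input unsqueezed at -3 and the weight at -2.
print(conv1d_to_conv2d_attrs([1, 2], 2, 1, channel_last=True))
# -> ([0, 0, 1, 2], [1, 2], [1, 1], -3, -2)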