From 224bc5113b9ef7f2dcc2ab07526a72533f2586fb Mon Sep 17 00:00:00 2001
From: crystal <62974595+Zjq9409@users.noreply.github.com>
Date: Thu, 10 Feb 2022 13:53:01 +0800
Subject: [PATCH] Modify the unsqueeze dimension of input data in conv1d NCL
 and NLC formats (#38425)

* optimize conv1d forward
* add conv optimization
* Optimize memory copy
* delete ShareDataWith
* set num_filters=512
* add NLC optimization
* Optimize num_filters=512 case on A100 and V100
* Fix the workspace_size setting of the filter
---
 paddle/fluid/operators/conv_cudnn_op.cu      | 45 ++++++++++---------
 .../tests/unittests/test_conv1d_layer.py     | 15 +++++++
 python/paddle/nn/functional/conv.py          | 21 +++++----
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index cbe78d9a25..20b1afb42f 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -638,7 +638,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo =
         static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
 #endif
-    size_t workspace_size = 0;
+    // input data workspace_size
+    size_t workspace_size_d = 0;
+    // weight workspace_size
+    size_t workspace_size_w = 0;
     int iwo_groups = groups;
     int c_groups = 1;
@@ -661,16 +664,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      workspace_size_d =
+          std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
       data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
-                                   workspace_size, ctx);
+                                   workspace_size_d, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+      workspace_size_d = std::max(workspace_size_d,
+                                  search1::GetWorkspaceSize(args1, data_algo));
 #endif
     }
@@ -686,16 +689,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                           platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      workspace_size_w =
+          std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
       filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
-                                     workspace_size, ctx);
+                                     workspace_size_w, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
+      workspace_size_w = std::max(
+          workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
 #endif
     }
@@ -726,9 +729,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                         handle, &alpha, args1.odesc.desc(), output_grad_data,
                         args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                         data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
-                        cudnn_workspace_ptr, workspace_size));
+                        cudnn_workspace_ptr, workspace_size_d));
               },
-              workspace_size);
+              workspace_size_d);
           PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
               handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
               transformed_input_grad_data, &alpha, args1.idesc.desc(),
@@ -743,9 +746,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                         args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
                         data_algo, &beta, args1.idesc.desc(),
                         transformed_input_grad_data, cudnn_workspace_ptr,
-                        workspace_size));
+                        workspace_size_d));
               },
-              workspace_size);
+              workspace_size_d);
     }
 #else
@@ -758,10 +761,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     filter_data + i * group_offset_filter, args1.odesc.desc(),
                     output_grad_data + i * group_offset_out,
                     args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
-                    workspace_size, &beta, args1.idesc.desc(),
+                    workspace_size_d, &beta, args1.idesc.desc(),
                     transformed_input_grad_data + i * group_offset_in));
               },
-              workspace_size);
+              workspace_size_d);
       }
 #endif
     if (!is_sys_pad) {
@@ -804,9 +807,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     handle, &alpha, args2.odesc.desc(), output_grad_data,
                     args2.idesc.desc(), input_data, args2.cdesc.desc(),
                     filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
-                    cudnn_workspace_ptr, workspace_size));
+                    cudnn_workspace_ptr, workspace_size_w));
               },
-              workspace_size);
+              workspace_size_w);
 #else
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
@@ -817,10 +820,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     input_data + i * group_offset_in, args2.odesc.desc(),
                     output_grad_data + i * group_offset_out,
                     args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                    workspace_size, &beta_filter, args2.wdesc.desc(),
+                    workspace_size_w, &beta_filter, args2.wdesc.desc(),
                     filter_grad_data + i * group_offset_filter));
               },
-              workspace_size);
+              workspace_size_w);
       }
 #endif
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index 715579c332..dc460cb16f 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -165,11 +165,20 @@ def add_cases(suite):
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', filter_size=3, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding='valid'))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2]))
     suite.addTest(
         Conv1DTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(
+        Conv1DTestCase(
+            methodName='runTest', padding=[1, 2], data_format='NLC'))
     suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
@@ -204,6 +213,12 @@ def add_error_cases(suite):
     suite.addTest(
         Conv1DErrorTestCase(
             methodName='runTest', padding=[1, 2, 3, 4, 5]))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', padding=[1, 2, 3, 4, 5], data_format='NLC'))
+    suite.addTest(
+        Conv1DErrorTestCase(
+            methodName='runTest', num_filters=512, padding=[1, 2, 3, 4, 5]))
     suite.addTest(Conv1DErrorTestCase(methodName='runTest', dilation=-10))
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index c124ed003d..31cb91bc93 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -326,21 +326,24 @@ def conv1d(x,
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
+
     if len(padding) == 2:
-        padding = padding + [0] * 2
+        padding = [0] * 2 + padding
     elif len(padding) == 1:
-        padding = padding + [0]
+        padding = [0] + padding
     else:
         raise ValueError(
             "The size of padding's dimension should be 1 or 2. But got padding={}".
             format(padding))
-
-    stride = convert_to_list(stride, 1, 'stride') + [1]
-    dilation = convert_to_list(dilation, 1, 'dilation') + [1]
+    stride = [1] + convert_to_list(stride, 1, 'stride')
+    dilation = [1] + convert_to_list(dilation, 1, 'dilation')
+    weight = unsqueeze(weight, axis=[-2])
 
     l_type = "conv2d"
-    if (num_channels == groups and num_channels != 1 and
-            num_filters % num_channels == 0 and not use_cudnn):
+
+    # When groups == num_channels and num_filters % num_channels == 0, using depthwise_conv2d has better performance
+    if (core.is_compiled_with_cuda() and num_channels == groups and
+            num_channels != 1 and num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
@@ -351,9 +354,9 @@ def conv1d(x,
         else:
             l_type = 'conv2d'
 
-    squeeze_aixs = -2 if channel_last else -1
+    squeeze_aixs = -3 if channel_last else -2
     x = unsqueeze(x, axis=[squeeze_aixs])
-    weight = unsqueeze(weight, axis=[-1])
+
     if in_dygraph_mode():
         attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
                  'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
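
Editor's note (not part of the patch): the conv.py hunks are the heart of this change. conv1d is lowered onto conv2d by unsqueezing a dummy spatial axis, and the patch moves that dummy axis from the trailing W position to the leading H position: the identity values for padding, stride, and dilation are now prepended rather than appended, the weight gains its dummy axis at -2, and the input is unsqueezed at -3 (NLC) or -2 (NCL), presumably so the real length dimension stays in the W slot that the A100/V100 measurements in the commit message target. On the C++ side, the gradient kernel now tracks separate workspace sizes for the data path (workspace_size_d) and the filter path (workspace_size_w) instead of one shared maximum. The self-contained Python sketch below mirrors the attribute mapping; the helper name conv1d_to_conv2d_attrs is invented for illustration and is not a Paddle API.

def conv1d_to_conv2d_attrs(padding, stride, dilation, channel_last):
    """Map conv1d attrs to conv2d attrs the way the patched conv.py does:
    the dummy H axis comes first, so identity entries are prepended."""
    # Zero padding on the dummy H axis, mirroring the two branches in
    # conv.py: [0] + padding for len 1, [0, 0] + padding for len 2.
    padding2d = [0] * len(padding) + list(padding)
    stride2d = [1] + [stride]        # unit stride on the dummy H axis
    dilation2d = [1] + [dilation]    # unit dilation on the dummy H axis
    # Input: NCL -> NC1L (unsqueeze at -2); NLC -> N1LC (unsqueeze at -3).
    x_axis = -3 if channel_last else -2
    # Filter: [Cout, Cin // groups, K] -> [Cout, Cin // groups, 1, K].
    w_axis = -2
    return padding2d, stride2d, dilation2d, x_axis, w_axis

# Example: an NLC conv1d with padding=[1, 2], stride=2, dilation=1 becomes
# a conv2d with paddings [0, 0, 1, 2], strides [1, 2], dilations [1, 1],
# the input unsqueezed at -3 and the weight at -2.
print(conv1d_to_conv2d_attrs([1, 2], 2, 1, channel_last=True))
# -> ([0, 0, 1, 2], [1, 2], [1, 1], -3, -2)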