Cherry pick deconv & jetson single arch (#33387)

* fix conv2d_transpose trt bugs (#33242)
* fix jetson arch when compiling with single arch (#33269)

Cherry pick deconv & jetson single arch (#33387)
* fix conv2d_transpose trt bugs (#33242)
* fix jetson arch when compiling with single arch (#33269)
0549d4af · Pei Yang · GitHub · ccabafa6 · 0549d4af · 0549d4af
3 changed files
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable)
  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "53")
+    else()
      set(cuda_arch_bin "50")
+    endif()
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "62")
+    else()
      set(cuda_arch_bin "60 61")
+    endif()
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "72")
+    else()
      set(cuda_arch_bin "70")
+    endif()
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(cuda_arch_bin "75")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")

--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
  TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                              static_cast<void*>(bias_data), bias_size};
-  auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
+  // In conv2d_transpose and depthwise_conv2d_transpose,
+  // output channels = filter_dims[1] * groups
+  auto* layer = (op_desc.Type() == "conv2d_transpose" ||
+                 op_desc.Type() == "depthwise_conv2d_transpose")
+                    ? fadd_layer(const_cast<nvinfer1::ITensor*>(X),
+                                 n_input * groups, nv_ksize, weight, bias)
+                    : fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output,
                                 nv_ksize, weight, bias);
-  PADDLE_ENFORCE_NOT_NULL(layer,
-                          platform::errors::Fatal("TensorRT create conv2d"
+  PADDLE_ENFORCE_NOT_NULL(
-                                                  " layer error."));
+      layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
+                                     " layer failed."));
  layer->setStride(nv_strides);
  layer->setPadding(nv_paddings);
  layer->setNbGroups(groups);
@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter {
    ConvertConv2d(
        engine_, op, scope, test_mode,
        [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
-            int n_input,                             /* Conv input maps */
            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
            TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
          auto* layer =
@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter {
    ConvertConv2d(
        engine_, op, scope, test_mode,
        [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
-            int n_input,                             /* Deconv output maps */
            nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
            TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
          auto* layer =

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
@@ -36,6 +36,7 @@ class TensorRTSubgraphPassConvTest(InferencePassTest):
                groups=self.conv_groups,
                padding=self.conv_padding,
                bias_attr=False,
+                use_cudnn=self.use_cudnn,
                act=None)
        self.feeds = {
            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
@@ -50,6 +51,7 @@ class TensorRTSubgraphPassConvTest(InferencePassTest):
        self.conv_filter_size = 6
        self.conv_groups = 3
        self.conv_padding = [1, 1]
+        self.use_cudnn = True
    def test_check_output(self):
        if core.is_compiled_with_cuda():
@@ -65,6 +67,7 @@ class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest):
        self.conv_filter_size = 6
        self.conv_groups = 3
        self.conv_padding = 'VALID'
+        self.use_cudnn = True
 class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
@@ -73,6 +76,7 @@ class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
        self.conv_filter_size = 6
        self.conv_groups = 3
        self.conv_padding = 'SAME'
+        self.use_cudnn = True
 class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest):
@@ -81,6 +85,16 @@ class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest):
        self.conv_filter_size = 6
        self.conv_groups = 6
        self.conv_padding = [1, 1]
+        self.use_cudnn = False
+class TensorRTSubgraphPassDepthwiseConv2Test(TensorRTSubgraphPassConvTest):
+    def set_params(self):
+        self.conv_num_filters = 12
+        self.conv_filter_size = 6
+        self.conv_groups = 6
+        self.conv_padding = [1, 1]
+        self.use_cudnn = False
 class TensorRTSubgraphPassConvTransposeTest(InferencePassTest):
@@ -151,6 +165,16 @@ class TensorRTSubgraphPassConvTransposeMultiGroupTest(
        self.use_cudnn = True
+class TensorRTSubgraphPassConvTranspose2Test(
+        TensorRTSubgraphPassConvTransposeTest):
+    def set_params(self):
+        self.conv_num_filters = 12
+        self.conv_filter_size = 4
+        self.conv_groups = 6
+        self.conv_padding = [1, 1]
+        self.use_cudnn = False
 class TensorRTSubgraphPassDepthwiseConvTransposeTest(
        TensorRTSubgraphPassConvTransposeTest):
    def set_params(self):