[ROCM] fix depth conv2d in rocm, test=develop (#32170)

693c7629 · Qi Li · GitHub · fdf63b4e · 693c7629 · 693c7629
6 changed files
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL(
    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
-
+// ROCM has a thread limit in depthwise_conv.cu that results in accuracy issues.
+// Use depthwise_conv2d in MIOPEN to resolve this issue.
+REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<float>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNConvGradOpKernel<float>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    depthwise_conv2d_grad_grad,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,

--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -919,11 +919,10 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
        batch_size * output_channels * output_height * output_width;
 #ifdef __HIPCC__
    int block_size = 256;
-    int grid_size = std::min((nums_output + block_size - 1) / block_size, 256);
 #else
    int block_size = 512;
-    int grid_size = (nums_output + block_size - 1) / block_size;
 #endif
+    int grid_size = (nums_output + block_size - 1) / block_size;

 #define check_case(c_filter_multiplier, c_stride, c_filter)                    \
  if (c_filter_multiplier == 0 ||                                              \

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1524,6 +1524,10 @@ def conv2d(input,
            not use_cudnn):
        l_type = 'depthwise_conv2d'

+    if (num_channels == groups and num_filters % num_channels == 0 and
+            core.is_compiled_with_rocm()):
+        l_type = 'depthwise_conv2d'
+
    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()


--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -1248,6 +1248,17 @@ create_test_cudnn_channel_last_class(TestWithStride_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)

+# ------------ depthwise conv2d in MIOPEN ---------
+if core.is_compiled_with_rocm():
+    create_test_cudnn_padding_SAME_class(TestDepthwiseConv_AsyPadding)
+    create_test_cudnn_padding_SAME_class(
+        TestDepthwiseConvWithDilation_AsyPadding)
+    create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding)
+    create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding)
+    create_test_cudnn_channel_last_class(TestDepthwiseConv_AsyPadding)
+    create_test_cudnn_channel_last_class(
+        TestDepthwiseConvWithDilation2_AsyPadding)
+
 create_test_cudnn_channel_last_fp16_class(
    TestConv2DOp_AsyPadding, grad_check=False)
 create_test_cudnn_channel_last_fp16_class(

--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -25,7 +25,7 @@ __all__ = [
 import numpy as np
 from ...device import get_cudnn_version
 from ...fluid.framework import Variable, in_dygraph_mode
-from ...fluid import core, dygraph_utils
+from ...fluid import core, dygraph_utils, get_flags
 from ...fluid.layers import nn, utils
 from ...fluid.data_feeder import check_variable_and_dtype
 from ...fluid.param_attr import ParamAttr
@@ -551,6 +551,13 @@ def conv2d(x,
    if (num_channels == groups and num_channels != 1 and
            num_filters % num_channels == 0):
        l_type = 'depthwise_conv2d'
+        if core.is_compiled_with_rocm():
+            use_cudnn = True
+        else:
+            use_cudnn = False
+
+    if (core.is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
+        ["FLAGS_conv2d_disable_cudnn"]):
        use_cudnn = False

    return _conv_nd(x, weight, bias, stride, padding, padding_algorithm,

--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -153,6 +153,13 @@ class _ConvNd(layers.Layer):
                                          in_channels != 1 and
                                          out_channels % in_channels == 0):
            self._op_type = 'depthwise_conv2d'
+            if core.is_compiled_with_rocm():
+                self._use_cudnn = True
+            else:
+                self._use_cudnn = False
+
+        if (core.is_compiled_with_cuda() and get_flags(
+                "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
            self._use_cudnn = False

    def extra_repr(self):
@@ -645,10 +652,6 @@ class Conv2D(_ConvNd):
            bias_attr=bias_attr,
            data_format=data_format)

-        if (core.is_compiled_with_cuda() and get_flags(
-                "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
-            self._use_cudnn = False
-
    def forward(self, x):
        if self._padding_mode != 'zeros':
            x = F.pad(x,