Merge pull request #23528 from WanliZhong:issue23278

DNN/CUDA: make 'abcd op 1b11' broadcast eltwise operator support cuda

Merge pull request #23528 from WanliZhong:issue23278
DNN/CUDA: make 'abcd op 1b11' broadcast eltwise operator support cuda
e3e1f704 · Alexander Smorkalov · GitHub · aa57833a · e4360294 · e3e1f704
隐藏空白更改
内联并排

Showing 2 changed files with 20 additions and 5 deletions

modules/dnn/perf/perf_layer.cpp modules/dnn/perf/perf_layer.cpp +12 -2

modules/dnn/src/layers/nary_eltwise_layers.cpp modules/dnn/src/layers/nary_eltwise_layers.cpp +8 -3

未找到文件。
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -66,8 +66,13 @@ struct Layer_NaryEltwise : public TestBaseWithParam<tuple<Backend, Target> >

        if (!isRef && backendId == DNN_BACKEND_CUDA)
        {
-            if (a_shape != b_shape)
-                throw SkipTestException("The test is skipped because inputs with different shapes are not supported.");
+            if (a_shape.size() != b_shape.size())
+                throw SkipTestException("The test is skipped because inputs with different shape size are not supported.");
+
+            for(int i = 0; i < a_shape.size(); i++)
+                if (a_shape[i] != b_shape[i] && a_shape[i] != 1 && b_shape[i] != 1)
+                    throw SkipTestException("The test is skipped because inputs are not supported.");
+
            if (nary_eltwise_cuda_deny_ops.find(op) != nary_eltwise_cuda_deny_ops.end())
                throw SkipTestException("The operator '" + op + "' is skipped because is not support with cuda currently.");
        }
@@ -215,6 +220,11 @@ PERF_TEST_P_(Layer_NaryEltwise, NHWC_C)
    test_layer({N, H, W, C}, {1, C}, "sum");
 }

+PERF_TEST_P_(Layer_NaryEltwise, NHWC_H)
+{
+    test_layer({N, H, W, C}, {1, H, 1, 1}, "sum");
+}
+
 PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
 {
    const int inputShape[4] = {1, 64, 104, 104};

--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -673,12 +673,17 @@ public:
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

-        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
+        auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
        for (int i = 1; i < inputs.size(); i++)
        {
-            auto from_wrapper = inputs[i].dynamicCast<CUDABackendWrapper>();
-            if (input_wrapper->getShape() != from_wrapper->getShape())
+            auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
+            if (input_0_shape.size() != input_i_shape.size())
                return Ptr<BackendNode>();
+            // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
+            for (int j = 0; j < input_0_shape.size(); j++)
+                if (input_0_shape[j] != input_i_shape[j] &&
+                    input_0_shape[j] != 1 && input_i_shape[j] != 1)
+                    return Ptr<BackendNode>();
        }

        cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;