enable dynamic dispatching for merge4

aa535412 · Aleksei Trutnev · 5a86592e · aa535412 · aa535412 · aa535412
4 changed files
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -2686,16 +2686,8 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)

        int w = 0; // cycle counter

-    #if CV_SIMD128
-        for (; w <= width-16; w+=16)
-        {
-            v_uint8x16 a, b, c, d;
-            a = v_load(&in1[w]);
-            b = v_load(&in2[w]);
-            c = v_load(&in3[w]);
-            d = v_load(&in4[w]);
-            v_store_interleave(&out[4*w], a, b, c, d);
-        }
+    #if CV_SIMD
+        w = merge4_simd(in1, in2, in3, in4, out, width);
    #endif

        for (; w < width; w++)

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -256,6 +256,13 @@ int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                    CV_CPU_DISPATCH_MODES_ALL);
 }

+int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
+                const uchar in4[], uchar out[], const int width)
+{   // Dispatch entry point: passes every argument through unchanged to merge4_simd via CV_CPU_DISPATCH.
+    CV_CPU_DISPATCH(merge4_simd, (in1, in2, in3, in4, out, width),  // NOTE(review): macro is defined outside this patch; presumably picks a CPU-specific build at run time and returns its result - confirm
+                    CV_CPU_DISPATCH_MODES_ALL);                     // candidate mode list supplied by the build - TODO confirm
+}
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -196,6 +196,9 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
 int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
               uchar out[], const int width);

+int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
+                const uchar in4[], uchar out[], const int width);
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv

--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -217,6 +217,9 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
 int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                uchar out[], const int width);

+int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
+                const uchar in4[], uchar out[], const int width);
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 struct scale_tag {};
@@ -2076,6 +2079,41 @@ int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
    return x;
 }

+//-------------------------
+//
+// Fluid kernels: Merge4
+//
+//-------------------------
+// Interleaves in1..in4 into out (4 output bytes per position x); returns 0 if width < nlanes, otherwise width.
+int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
+                const uchar in4[], uchar out[], const int width)
+{
+    constexpr int nlanes = v_uint8::nlanes;  // elements handled per vector step
+    if (width < nlanes)  // not even one full vector fits: do nothing and report 0 positions done
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= width - nlanes; x += nlanes)  // every start position where a full vector still fits
+        {
+            v_uint8 a, b, c, d;
+            a = vx_load(&in1[x]);
+            b = vx_load(&in2[x]);
+            c = vx_load(&in3[x]);
+            d = vx_load(&in4[x]);
+            v_store_interleave(&out[4 * x], a, b, c, d);  // output index is 4 * x because each position yields one element from each of the four inputs
+        }
+        if (x < width)  // a partial tail (fewer than nlanes positions) is left over
+        {
+            x = width - nlanes;  // step back so the final vector ends exactly at width
+            continue;            // one more pass of the inner loop; it overlaps positions already processed
+        }
+        break;
+    }
+    return x;  // equals width here: either the vectors tiled the row exactly or the tail pass ended at width
+}
+
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 CV_CPU_OPTIMIZATION_NAMESPACE_END