Commit dff8e29f authored by: Alexander Alekhin

Merge pull request #16139 from alalek:core_flip_avoid_unaligned

@@ -514,6 +514,43 @@ static inline size_t roundUp(size_t a, unsigned int b)
     return a + b - 1 - (a + b - 1) % b;
 }
 
+/** @brief Alignment check of passed values
+
+Usage: `isAligned<sizeof(int)>(...)`
+
+@note Alignment(N) must be a power of 2 (2**k, 2^k)
+*/
+template<int N, typename T> static inline
+bool isAligned(const T& data)
+{
+    CV_StaticAssert((N & (N - 1)) == 0, "");  // power of 2
+    return (((size_t)data) & (N - 1)) == 0;
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1)
+{
+    return isAligned<N>((size_t)p1);
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2));
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2)|((size_t)p3));
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4)
+{
+    return isAligned<N>(((size_t)p1)|((size_t)p2)|((size_t)p3)|((size_t)p4));
+}
+
 /** @brief Enables or disables the optimized code.
 
 The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2,
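For readers unfamiliar with the new helper, here is a minimal, self-contained usage sketch (assuming only the header above; this example is illustrative and not part of the patch):

    #include <opencv2/core/utility.hpp>
    #include <cstdio>

    int main()
    {
        int buf[4] = {0, 1, 2, 3};
        const void* p0 = buf;                   // naturally int-aligned
        const void* p1 = (const char*)buf + 1;  // misaligned by one byte

        // Single-pointer overload: true iff the low log2(N) address bits are zero.
        std::printf("%d\n", (int)cv::isAligned<sizeof(int)>(p0));     // 1
        std::printf("%d\n", (int)cv::isAligned<sizeof(int)>(p1));     // 0

        // Multi-pointer overloads OR the addresses first, so the check
        // passes only when every pointer is aligned.
        std::printf("%d\n", (int)cv::isAligned<sizeof(int)>(p0, p1)); // 0
        return 0;
    }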
@@ -563,6 +563,12 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
     return *this;
 }
 
+#if CV_NEON && !defined(__aarch64__)
+#define CV_CHECK_ALIGNMENT 1
+#else
+#define CV_CHECK_ALIGNMENT 0
+#endif
+
 #if CV_SIMD128
 template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
@@ -572,6 +578,10 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
     int width_1 = width & -v_uint8x16::nlanes;
     int i, j;
 
+#if CV_CHECK_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T)>(src, dst));
+#endif
+
     for( ; size.height--; src += sstep, dst += dstep )
     {
         for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
@@ -585,7 +595,7 @@ template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
             v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
             v_store((T*)(dst + i), t1);
         }
-        if (((size_t)src|(size_t)dst) % sizeof(T) == 0)
+        if (isAligned<sizeof(T)>(src, dst))
         {
             for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
             {
@@ -620,6 +630,11 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
     int end = (int)(size.width*esz);
     int width = (end + 1)/2;
 
+#if CV_CHECK_ALIGNMENT
+    CV_Assert(isAligned<sizeof(T1)>(src, dst));
+    CV_Assert(isAligned<sizeof(T2)>(src, dst));
+#endif
+
     for( ; size.height--; src += sstep, dst += dstep )
     {
         for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
@@ -644,6 +659,9 @@ static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
 #if CV_SIMD
+#if CV_CHECK_ALIGNMENT
+    size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
+#endif
     if (esz == 2 * v_uint8x16::nlanes)
     {
         int end = (int)(size.width*esz);
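The alignmentMark above folds src, dst and both row steps into one word. Row k of an image lives at base + k*step, so if the base pointer and the step are both multiples of N, every row pointer is too; OR-ing the four values preserves any low bit set in any of them, which makes a single isAligned test sufficient for the whole image. A small OpenCV-independent sketch of the idea (names here are illustrative):

    #include <cassert>
    #include <cstddef>

    template<int N> bool isAlignedWord(size_t v) { return (v & (N - 1)) == 0; }

    int main()
    {
        size_t src = 0x1000, dst = 0x2000, sstep = 64, dstep = 64;
        size_t mark = src | dst | sstep | dstep;
        assert(isAlignedWord<8>(mark));        // all four are multiples of 8
        assert(!isAlignedWord<8>(mark | 4));   // one misaligned value poisons the mark
        return 0;
    }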
@@ -693,15 +711,27 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
             }
         }
     }
-    else if (esz == 8)
+    else if (esz == 8
+#if CV_CHECK_ALIGNMENT
+            && isAligned<sizeof(uint64)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 4)
+    else if (esz == 4
+#if CV_CHECK_ALIGNMENT
+            && isAligned<sizeof(unsigned)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 2)
+    else if (esz == 2
+#if CV_CHECK_ALIGNMENT
+            && isAligned<sizeof(ushort)>(alignmentMark)
+#endif
+    )
     {
         flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
     }
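esz in this dispatch is the pixel size in bytes (Mat::elemSize()), so the branches cover the common single-type layouts: 8 for CV_64FC1, 4 for CV_32SC1/CV_32FC1, 2 for CV_16UC1, and so on. A quick way to see which values reach flipHoriz (an illustrative snippet, not from the PR):

    #include <opencv2/core.hpp>
    #include <cstdio>

    int main()
    {
        const int types[] = { CV_8UC1, CV_16UC1, CV_32FC1, CV_64FC1,
                              CV_8UC3, CV_16UC3, CV_32SC3, CV_64FC3 };
        for (int t : types)
            std::printf("elemSize = %zu\n", cv::Mat(1, 1, t).elemSize());
        // prints 1 2 4 8 3 6 12 24 -- the esz values tested above and below
        return 0;
    }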
@@ -709,7 +739,11 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
     {
         flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
     }
-    else if (esz == 24)
+    else if (esz == 24
+#if CV_CHECK_ALIGNMENT
+            && isAligned<sizeof(uint64_t)>(alignmentMark)
+#endif
+    )
     {
         int end = (int)(size.width*esz);
         int width = (end + 1)/2;
@@ -732,6 +766,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
             }
         }
     }
+#if !CV_CHECK_ALIGNMENT
     else if (esz == 12)
     {
         flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
@@ -744,8 +779,9 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
     {
         flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
     }
-    else
 #endif
+    else
+#endif // CV_SIMD
     {
         int i, j, limit = (int)(((size.width + 1)/2)*esz);
         AutoBuffer<int> _tab(size.width*esz);
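The scalar fallback entered here precomputes a byte-index table (_tab) mapping each destination byte of a row to its mirrored source byte, then replays that table for every row. A minimal sketch of the same technique under assumed semantics (not the literal OpenCV loop):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int width = 4, esz = 3;          // 4 pixels, 3 bytes per pixel
        std::vector<int> tab(width * esz);
        for (int i = 0; i < width * esz; i++)
        {
            int pixel = i / esz, byte = i % esz;
            tab[i] = (width - 1 - pixel) * esz + byte;  // mirrored pixel, same byte
        }
        unsigned char row[12], out[12];
        for (int i = 0; i < 12; i++) row[i] = (unsigned char)i;
        for (int i = 0; i < 12; i++) out[i] = row[tab[i]];  // one row flipped
        std::printf("%d %d %d\n", out[0], out[1], out[2]);  // 9 10 11
        return 0;
    }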
@@ -779,16 +815,33 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz )
     {
         int i = 0;
 #if CV_SIMD
-        for( ; i <= size.width - (v_int32::nlanes * 4); i += v_int32::nlanes * 4 )
+#if CV_CHECK_ALIGNMENT
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
+#endif
         {
-            v_int32 t0 = vx_load((int*)(src0 + i));
-            v_int32 t1 = vx_load((int*)(src1 + i));
-            vx_store((int*)(dst0 + i), t1);
-            vx_store((int*)(dst1 + i), t0);
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_int32 t0 = vx_load((int*)(src0 + i));
+                v_int32 t1 = vx_load((int*)(src1 + i));
+                vx_store((int*)(dst0 + i), t1);
+                vx_store((int*)(dst1 + i), t0);
+            }
         }
+#if CV_CHECK_ALIGNMENT
+        else
+        {
+            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            {
+                v_uint8 t0 = vx_load(src0 + i);
+                v_uint8 t1 = vx_load(src1 + i);
+                vx_store(dst0 + i, t1);
+                vx_store(dst1 + i, t0);
+            }
+        }
+#endif
 #endif
-        if( ((size_t)src0|(size_t)dst0|(size_t)src1|(size_t)dst1) % sizeof(int) == 0 )
+        if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
        {
            for( ; i <= size.width - 16; i += 16 )
            {
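Taken together, the patch keeps cv::flip safe on 32-bit ARM NEON when the data pointer or step is not SIMD-friendly: the unaligned flipVert branch reloads through byte lanes, which carry no alignment requirement. The guarded situation can be reproduced with a ROI whose data pointer is deliberately off-alignment (a hedged sketch, not the PR's test):

    #include <opencv2/core.hpp>

    int main()
    {
        // 3-channel 8-bit image; shifting the ROI one column to the right
        // yields row pointers misaligned for 4/8-byte vector loads.
        cv::Mat big(64, 65, CV_8UC3, cv::Scalar::all(7));
        cv::Mat roi = big(cv::Rect(1, 0, 64, 64));
        cv::Mat flipped;
        cv::flip(roi, flipped, 0);   // vertical flip -> flipVert
        cv::flip(roi, flipped, 1);   // horizontal flip -> flipHoriz
        return 0;
    }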