From a2a8ba17fcba6d33fec1f78d53d3e7d2f3531181 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 12 Jan 2015 10:59:28 +0300
Subject: [PATCH] compare

---
 modules/core/src/arithm.cpp | 138 ++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 68c8979a8d..a9bf3d7e78 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -3268,6 +3268,144 @@ struct Cmp_SIMD
     uint8x8_t v_mask;
 };
 
+#elif CV_SSE2
+
+// SSE2 kernel: element-wise comparison of two signed 8-bit rows.
+// Stores 0xff in dst where the predicate selected by code holds and 0x00
+// elsewhere, 16 elements per iteration; operator() returns the number of
+// leading elements it has written (0 when SSE2 is not available at run time).
+template <>
+struct Cmp_SIMD<schar>
+{
+    explicit Cmp_SIMD(int code_) :
+        code(code_)
+    {
+        CV_Assert(code == CMP_GT || code == CMP_LE ||
+                  code == CMP_EQ || code == CMP_NE);
+
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
+        // All-ones mask used to invert a result (GT -> LE, EQ -> NE).
+        // -1 instead of 0xff: the argument is a char, which 255 may not fit.
+        v_mask = _mm_set1_epi8(-1);
+    }
+
+    int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        if (code == CMP_GT)
+            for ( ; x <= width - 16; x += 16)
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
+        else if (code == CMP_LE)
+            for ( ; x <= width - 16; x += 16)
+            {
+                __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
+            }
+        else if (code == CMP_EQ)
+            for ( ; x <= width - 16; x += 16)
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                                      _mm_loadu_si128((const __m128i *)(src2 + x))));
+        else if (code == CMP_NE)
+            for ( ; x <= width - 16; x += 16)
+            {
+                __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                              _mm_loadu_si128((const __m128i *)(src2 + x)));
+                _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
+            }
+
+        return x;
+    }
+
+    int code;
+    __m128i v_mask;
+    bool haveSSE;
+};
+
+// SSE2 kernel: element-wise comparison of two 32-bit integer rows.
+// Stores 0xff in dst where the predicate selected by code holds and 0x00
+// elsewhere, 8 elements per iteration; operator() returns the number of
+// leading elements it has written (0 when SSE2 is not available at run time).
+template <>
+struct Cmp_SIMD<int>
+{
+    explicit Cmp_SIMD(int code_) :
+        code(code_)
+    {
+        CV_Assert(code == CMP_GT || code == CMP_LE ||
+                  code == CMP_EQ || code == CMP_NE);
+
+        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
+
+        // All-ones mask: inverts a result (GT -> LE, EQ -> NE) and fills the
+        // unused upper half of the final 16 -> 8 bit pack.
+        v_mask = _mm_set1_epi32(-1);
+    }
+
+    int operator () (const int * src1, const int * src2, uchar * dst, int width) const
+    {
+        int x = 0;
+
+        if (!haveSSE)
+            return x;
+
+        if (code == CMP_GT)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                // Narrow the 0 / -1 lanes 32 -> 16 -> 8 bits with signed saturation
+                // (which preserves both values) and store the low 8 bytes.
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+            }
+        else if (code == CMP_LE)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
+            }
+        else if (code == CMP_EQ)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
+            }
+        else if (code == CMP_NE)
+            for ( ; x <= width - 8; x += 8)
+            {
+                __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x)));
+                __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
+                                                 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
+
+                _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
+            }
+
+        return x;
+    }
+
+    int code;
+    __m128i v_mask;
+    bool haveSSE;
+};
+
 #endif
 
 template static void
-- 
GitLab