提交 a2a8ba17 编写于 作者: I Ilya Lavrenov

compare

上级 8d48632e
......@@ -3268,6 +3268,130 @@ struct Cmp_SIMD<float>
uint8x8_t v_mask;
};
#elif CV_SSE2
template <>
struct Cmp_SIMD<schar>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
CV_Assert(code == CMP_GT || code == CMP_LE ||
code == CMP_EQ || code == CMP_NE);
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
v_mask = _mm_set1_epi8(0xff);
}
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
{
int x = 0;
if (!haveSSE)
return x;
if (code == CMP_GT)
for ( ; x <= width - 16; x += 16)
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x))));
else if (code == CMP_LE)
for ( ; x <= width - 16; x += 16)
{
__m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
}
else if (code == CMP_EQ)
for ( ; x <= width - 16; x += 16)
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x))));
else if (code == CMP_NE)
for ( ; x <= width - 16; x += 16)
{
__m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
}
return x;
}
int code;
__m128i v_mask;
bool haveSSE;
};
template <>
struct Cmp_SIMD<int>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
CV_Assert(code == CMP_GT || code == CMP_LE ||
code == CMP_EQ || code == CMP_NE);
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
v_mask = _mm_set1_epi32(0xffffffff);
}
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
{
int x = 0;
if (!haveSSE)
return x;
if (code == CMP_GT)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
}
else if (code == CMP_LE)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
}
else if (code == CMP_EQ)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
}
else if (code == CMP_NE)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
}
return x;
}
int code;
__m128i v_mask;
bool haveSSE;
};
#endif
template<typename T> static void
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册