提交 b9d7c712 编写于 作者: V Vadim Pisarevsky

made countNonZero SSE code SSE2-compliant and portable

上级 72a4f192
......@@ -221,39 +221,42 @@ static int countNonZero_(const T* src, int len )
return nz;
}
template <>
int countNonZero_ <uchar> (const uchar* src, int len)
static int countNonZero8u( const uchar* src, int len )
{
int i=0, nz = 0;
#if (defined CV_SSE4_2 && CV_SSE4_2 && (_WIN64 || __amd64__))
if(USE_SSE4_2)//5x-6x
{
__m128i pattern = _mm_setzero_si128 ();
__m128i inv = _mm_set1_epi8((char)1);
__int64 CV_DECL_ALIGNED(16) buf[2];
for (; i<=len-16; i+=16)
{
__m128i r0 = _mm_lddqu_si128((const __m128i*)(src+i));
__m128i res = _mm_cmpeq_epi8(r0, pattern);
res = _mm_add_epi8(res, inv);//11111111+1=00000000, 00000000+1=00000001
_mm_store_si128 ((__m128i*)buf, res);
__int64 countLow = _mm_popcnt_u64(buf[0]);
nz += countLow;
__int64 countHigh = _mm_popcnt_u64(buf[1]);
nz +=countHigh;
}
}
#endif
for( ; i < len; i++ )
nz += src[i] != 0;
int i=0, nz = 0;
#if CV_SSE2
if(USE_SSE2)//5x-6x
{
__m128i pattern = _mm_setzero_si128 ();
static uchar tab[256];
static volatile bool initialized = false;
if( !initialized )
{
// we compute inverse popcount table,
// since we pass (img[x] == 0) mask as index in the table.
for( int j = 0; j < 256; j++ )
{
int val = 0;
for( int mask = 1; mask < 256; mask += mask )
val += (j & mask) == 0;
tab[j] = (uchar)val;
}
initialized = true;
}
for (; i<=len-16; i+=16)
{
__m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
int val = _mm_movemask_epi8(_mm_cmpeq_epi8(r0, pattern));
nz += tab[val & 255] + tab[val >> 8];
}
}
#endif
for( ; i < len; i++ )
nz += src[i] != 0;
return nz;
}
static int countNonZero8u( const uchar* src, int len )
{ return countNonZero_(src, len); }
static int countNonZero16u( const ushort* src, int len )
{ return countNonZero_(src, len); }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册