// Counts the non-zero bytes among the first `len` elements of `src`.
//
// src - pointer to the bytes to inspect; read-only.
// len - number of bytes to inspect; nothing is read when len <= 0.
// Returns the number of elements with src[i] != 0.
//
// When the build defines CV_SSE2 and USE_SSE2 evaluates to true (NOTE(review):
// presumably a run-time CPU feature check defined elsewhere - confirm), the
// input is consumed 16 bytes at a time; the remaining tail (or the whole
// buffer when SSE2 is not used) goes through the scalar loop.
//
// The vector path keeps all of its state in locals. It deliberately does not
// use a lazily filled static lookup table guarded by a `volatile` flag:
// `volatile` gives no inter-thread ordering, so concurrent first calls could
// observe the flag as set before the table contents are visible.
static int countNonZero8u( const uchar* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if( USE_SSE2 )
    {
        const __m128i zero = _mm_setzero_si128();
        const __m128i one = _mm_set1_epi8((char)1);
        // Two running sums, one in the low 32 bits of each 64-bit half.
        // Each grows by at most 8 per iteration, i.e. stays <= len/2,
        // so it cannot overflow for any int `len`.
        __m128i acc = _mm_setzero_si128();

        for( ; i <= len - 16; i += 16 )
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            // 0xFF in every byte lane whose source byte equals zero
            __m128i isZero = _mm_cmpeq_epi8(r0, zero);
            // (~isZero) & 1  ->  1 for non-zero bytes, 0 for zero bytes
            __m128i flags = _mm_andnot_si128(isZero, one);
            // sum of absolute differences against zero adds up the 8 low and
            // the 8 high byte flags into the two 64-bit halves
            acc = _mm_add_epi32(acc, _mm_sad_epu8(flags, zero));
        }

        // fold the two partial sums into the scalar counter
        nz += _mm_cvtsi128_si32(acc) + _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
    }
#endif
    // scalar tail; also the complete implementation when SSE2 is unavailable
    for( ; i < len; i++ )
        nz += src[i] != 0;
    return nz;
}