diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index 3d865408ebb3d75db06513556619129ab8ff8637..43200218dcb7bcd8f9681e231c97a3f08ac3f666 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -213,7 +213,7 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs
     }

     // adjust bufstep so that the used part of the ring buffer stays compact in memory
-    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);
+    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);

     dx1 = std::max(anchor.x - roi.x, 0);
     dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index c724e3eb361a1588b6b11bc653156526aa8008d4..cb25a50c7bc4649538d89b41fea5d0dbad044808 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -45,6 +45,7 @@
 #include "opencl_kernels_imgproc.hpp"
 #include <limits.h>
 #include "hal_replacement.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #include <iostream>

 /****************************************************************************************\
@@ -97,73 +98,65 @@ struct MorphNoVec
     int operator()(uchar**, int, uchar*, int) const { return 0; }
 };

-#if CV_SSE2
+#if CV_SIMD

-template<class VecUpdate> struct MorphRowIVec
+template<class VecUpdate> struct MorphRowVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphRowIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
     int operator()(const uchar* src, uchar* dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
-        cn *= ESZ;
         int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
+        width *= cn;
         VecUpdate updateOp;

-        for( i = 0; i <= width - 16; i += 16 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
         {
-            __m128i s = _mm_loadu_si128((const __m128i*)(src + i));
-            for( k = cn; k < _ksize; k += cn )
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
+            vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes);
+            vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes);
+            for (k = cn; k < _ksize; k += cn)
             {
-                __m128i x = _mm_loadu_si128((const __m128i*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
+                s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes));
             }
-            _mm_storeu_si128((__m128i*)(dst + i), s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            v_store((stype*)dst + i + 2*vtype::nlanes, s2);
+            v_store((stype*)dst + i + 3*vtype::nlanes, s3);
         }
-
-        for( ; i < width; i += 4 )
+        if( i <= width - 2*vtype::nlanes )
         {
-            __m128i s = _mm_cvtsi32_si128(*(const int*)(src + i));
+            vtype s0 = vx_load((const stype*)src + i);
+            vtype s1 = vx_load((const stype*)src + i + vtype::nlanes);
             for( k = cn; k < _ksize; k += cn )
             {
-                __m128i x = _mm_cvtsi32_si128(*(const int*)(src + i + k));
-                s = updateOp(s, x);
+                s0 = updateOp(s0, vx_load((const stype*)src + i + k));
+                s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes));
             }
-            *(int*)(dst + i) = _mm_cvtsi128_si32(s);
+            v_store((stype*)dst + i, s0);
+            v_store((stype*)dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
         }
-
-        return i/ESZ;
-    }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphRowFVec
-{
-    MorphRowFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar* src, uchar* dst, int width, int cn) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i, k, _ksize = ksize*cn;
-        width = (width & -4)*cn;
-        VecUpdate updateOp;
-
-        for( i = 0; i < width; i += 4 )
+        if( i <= width - vtype::nlanes )
         {
-            __m128 s = _mm_loadu_ps((const float*)src + i);
+            vtype s = vx_load((const stype*)src + i);
             for( k = cn; k < _ksize; k += cn )
-            {
-                __m128 x = _mm_loadu_ps((const float*)src + i + k);
-                s = updateOp(s, x);
-            }
-            _mm_storeu_ps((float*)dst + i, s);
+                s = updateOp(s, vx_load((const stype*)src + i + k));
+            v_store((stype*)dst + i, s);
+            i += vtype::nlanes;
+        }
+        if( i <= width - vtype::nlanes/2 )
+        {
+            vtype s = vx_load_low((const stype*)src + i);
+            for( k = cn; k < _ksize; k += cn )
+                s = updateOp(s, vx_load_low((const stype*)src + i + k));
+            v_store_low((stype*)dst + i, s);
+            i += vtype::nlanes/2;
         }

         return i;
@@ -173,230 +166,156 @@ template<class VecUpdate> struct MorphRowFVec
 };


-template<class VecUpdate> struct MorphColumnIVec
+template<class VecUpdate> struct MorphColumnVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    MorphColumnIVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** src, uchar* dst, int dststep, int count, int width) const
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
+    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int i = 0, k, _ksize = ksize;
-        width *= ESZ;
         VecUpdate updateOp;

         for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)src[i] & 15) == 0 );
+            CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 );
+
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
+        dststep /= sizeof(dst[0]);

         for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
         {
-            for( i = 0; i <= width - 32; i += 32 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
             {
-                const uchar* sptr = src[1] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                 for( k = 2; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                 }

                 sptr = src[0] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + i + 16), updateOp(s1, x1));
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));

                 sptr = src[k] + i;
-                x0 = _mm_load_si128((const __m128i*)sptr);
-                x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i), updateOp(s0, x0));
-                _mm_storeu_si128((__m128i*)(dst + dststep + i + 16), updateOp(s1, x1));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)));
+                v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)));
             }
-
-            for( ; i <= width - 8; i += 8 )
+            if( i <= width - 2*vtype::nlanes )
             {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[1] + i)), x0;
+                const stype* sptr = src[1] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);

                 for( k = 2; k < _ksize; k++ )
-                {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
-                }
-
-                x0 = _mm_loadl_epi64((const __m128i*)(src[0] + i));
-                _mm_storel_epi64((__m128i*)(dst + i), updateOp(s0, x0));
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                _mm_storel_epi64((__m128i*)(dst + dststep + i), updateOp(s0, x0));
-            }
-        }
-
-        for( ; count > 0; count--, dst += dststep, src++ )
-        {
-            for( i = 0; i <= width - 32; i += 32 )
-            {
-                const uchar* sptr = src[0] + i;
-                __m128i s0 = _mm_load_si128((const __m128i*)sptr);
-                __m128i s1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                __m128i x0, x1;
-
-                for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
-                    x0 = _mm_load_si128((const __m128i*)sptr);
-                    x1 = _mm_load_si128((const __m128i*)(sptr + 16));
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                 }
-                _mm_storeu_si128((__m128i*)(dst + i), s0);
-                _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
-            }
-
-            for( ; i <= width - 8; i += 8 )
-            {
-                __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
+
+                sptr = src[0] + i;
+                v_store(dst + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));

-                for( k = 1; k < _ksize; k++ )
-                {
-                    x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                    s0 = updateOp(s0, x0);
-                }
-                _mm_storel_epi64((__m128i*)(dst + i), s0);
+                sptr = src[k] + i;
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr)));
+                v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)));
+                i += 2*vtype::nlanes;
             }
-        }
-
-        return i/ESZ;
-    }
-
-    int ksize, anchor;
-};
-
-
-template<class VecUpdate> struct MorphColumnFVec
-{
-    MorphColumnFVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {}
-    int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i = 0, k, _ksize = ksize;
-        VecUpdate updateOp;
-
-        for( i = 0; i < count + ksize - 1; i++ )
-            CV_Assert( ((size_t)_src[i] & 15) == 0 );
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
-        dststep /= sizeof(dst[0]);
-
-        for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 )
-        {
-            for( i = 0; i <= width - 16; i += 16 )
+            if( i <= width - vtype::nlanes )
             {
-                const float* sptr = src[1] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
+                vtype s0 = vx_load_aligned(src[1] + i);

                 for( k = 2; k < _ksize; k++ )
-                {
-                    sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
-                }
-
-                sptr = src[0] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + i + 12, updateOp(s3, x3));
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));

-                sptr = src[k] + i;
-                x0 = _mm_load_ps(sptr);
-                x1 = _mm_load_ps(sptr + 4);
-                x2 = _mm_load_ps(sptr + 8);
-                x3 = _mm_load_ps(sptr + 12);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
-                _mm_storeu_ps(dst + dststep + i + 4, updateOp(s1, x1));
-                _mm_storeu_ps(dst + dststep + i + 8, updateOp(s2, x2));
-                _mm_storeu_ps(dst + dststep + i + 12, updateOp(s3, x3));
+                v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i)));
+                v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i)));
+                i += vtype::nlanes;
             }
-
-            for( ; i <= width - 4; i += 4 )
+            if( i <= width - vtype::nlanes/2 )
             {
-                __m128 s0 = _mm_load_ps(src[1] + i), x0;
+                vtype s0 = vx_load_low(src[1] + i);

                 for( k = 2; k < _ksize; k++ )
-                {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
-                }
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));

-                x0 = _mm_load_ps(src[0] + i);
-                _mm_storeu_ps(dst + i, updateOp(s0, x0));
-                x0 = _mm_load_ps(src[k] + i);
-                _mm_storeu_ps(dst + dststep + i, updateOp(s0, x0));
+                v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i)));
+                v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i)));
+                i += vtype::nlanes/2;
             }
         }

         for( ; count > 0; count--, dst += dststep, src++ )
         {
-            for( i = 0; i <= width - 16; i += 16 )
+            for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes)
             {
-                const float* sptr = src[0] + i;
-                __m128 s0 = _mm_load_ps(sptr);
-                __m128 s1 = _mm_load_ps(sptr + 4);
-                __m128 s2 = _mm_load_ps(sptr + 8);
-                __m128 s3 = _mm_load_ps(sptr + 12);
-                __m128 x0, x1, x2, x3;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+                vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes);
+                vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes);

                 for( k = 1; k < _ksize; k++ )
                 {
                     sptr = src[k] + i;
-                    x0 = _mm_load_ps(sptr);
-                    x1 = _mm_load_ps(sptr + 4);
-                    s0 = updateOp(s0, x0);
-                    s1 = updateOp(s1, x1);
-                    x2 = _mm_load_ps(sptr + 8);
-                    x3 = _mm_load_ps(sptr + 12);
-                    s2 = updateOp(s2, x2);
-                    s3 = updateOp(s3, x3);
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
+                    s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes));
+                    s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes));
                 }
-                _mm_storeu_ps(dst + i, s0);
-                _mm_storeu_ps(dst + i + 4, s1);
-                _mm_storeu_ps(dst + i + 8, s2);
-                _mm_storeu_ps(dst + i + 12, s3);
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                v_store(dst + i + 2*vtype::nlanes, s2);
+                v_store(dst + i + 3*vtype::nlanes, s3);
             }
-
-            for( i = 0; i <= width - 4; i += 4 )
+            if( i <= width - 2*vtype::nlanes )
             {
-                __m128 s0 = _mm_load_ps(src[0] + i), x0;
+                const stype* sptr = src[0] + i;
+                vtype s0 = vx_load_aligned(sptr);
+                vtype s1 = vx_load_aligned(sptr + vtype::nlanes);
+
                 for( k = 1; k < _ksize; k++ )
                 {
-                    x0 = _mm_load_ps(src[k] + i);
-                    s0 = updateOp(s0, x0);
+                    sptr = src[k] + i;
+                    s0 = updateOp(s0, vx_load_aligned(sptr));
+                    s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes));
                 }
-                _mm_storeu_ps(dst + i, s0);
+                v_store(dst + i, s0);
+                v_store(dst + i + vtype::nlanes, s1);
+                i += 2*vtype::nlanes;
+            }
+            if( i <= width - vtype::nlanes )
+            {
+                vtype s0 = vx_load_aligned(src[0] + i);
+
+                for( k = 1; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_aligned(src[k] + i));
+                v_store(dst + i, s0);
+                i += vtype::nlanes;
+            }
+            if( i <= width - vtype::nlanes/2 )
+            {
+                vtype s0 = vx_load_low(src[0] + i);
+
+                for( k = 1; k < _ksize; k++ )
+                    s0 = updateOp(s0, vx_load_low(src[k] + i));
+                v_store_low(dst + i, s0);
+                i += vtype::nlanes/2;
             }
         }

@@ -407,185 +326,109 @@ template<class VecUpdate> struct MorphColumnFVec
 };


-template<class VecUpdate> struct MorphIVec
+template<class VecUpdate> struct MorphVec
 {
-    enum { ESZ = VecUpdate::ESZ };
-
-    int operator()(uchar** src, int nz, uchar* dst, int width) const
+    typedef typename VecUpdate::vtype vtype;
+    typedef typename vtype::lane_type stype;
+    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
+        const stype** src = (const stype**)_src;
+        stype* dst = (stype*)_dst;
         int i, k;
-        width *= ESZ;
         VecUpdate updateOp;

-        for( i = 0; i <= width - 32; i += 32 )
+        for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes )
         {
-            const uchar* sptr = src[0] + i;
-            __m128i s0 = _mm_loadu_si128((const __m128i*)sptr);
-            __m128i s1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-            __m128i x0, x1;
-
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
+            vtype s2 = vx_load(sptr + 2*vtype::nlanes);
+            vtype s3 = vx_load(sptr + 3*vtype::nlanes);
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
-                x0 = _mm_loadu_si128((const __m128i*)sptr);
-                x1 = _mm_loadu_si128((const __m128i*)(sptr + 16));
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
-            }
-            _mm_storeu_si128((__m128i*)(dst + i), s0);
-            _mm_storeu_si128((__m128i*)(dst + i + 16), s1);
-        }
-
-        for( ; i <= width - 8; i += 8 )
-        {
-            __m128i s0 = _mm_loadl_epi64((const __m128i*)(src[0] + i)), x0;
-
-            for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_loadl_epi64((const __m128i*)(src[k] + i));
-                s0 = updateOp(s0, x0);
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
+                s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes));
+                s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes));
             }
-            _mm_storel_epi64((__m128i*)(dst + i), s0);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            v_store(dst + i + 2*vtype::nlanes, s2);
+            v_store(dst + i + 3*vtype::nlanes, s3);
         }
-
-        return i/ESZ;
-    }
-};
-
-
-template<class VecUpdate> struct MorphFVec
-{
-    int operator()(uchar** _src, int nz, uchar* _dst, int width) const
-    {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        const float** src = (const float**)_src;
-        float* dst = (float*)_dst;
-        int i, k;
-        VecUpdate updateOp;
-
-        for( i = 0; i <= width - 16; i += 16 )
+        if( i <= width - 2*vtype::nlanes )
         {
-            const float* sptr = src[0] + i;
-            __m128 s0 = _mm_loadu_ps(sptr);
-            __m128 s1 = _mm_loadu_ps(sptr + 4);
-            __m128 s2 = _mm_loadu_ps(sptr + 8);
-            __m128 s3 = _mm_loadu_ps(sptr + 12);
-            __m128 x0, x1, x2, x3;
-
+            const stype* sptr = src[0] + i;
+            vtype s0 = vx_load(sptr);
+            vtype s1 = vx_load(sptr + vtype::nlanes);
             for( k = 1; k < nz; k++ )
             {
                 sptr = src[k] + i;
-                x0 = _mm_loadu_ps(sptr);
-                x1 = _mm_loadu_ps(sptr + 4);
-                x2 = _mm_loadu_ps(sptr + 8);
-                x3 = _mm_loadu_ps(sptr + 12);
-                s0 = updateOp(s0, x0);
-                s1 = updateOp(s1, x1);
-                s2 = updateOp(s2, x2);
-                s3 = updateOp(s3, x3);
+                s0 = updateOp(s0, vx_load(sptr));
+                s1 = updateOp(s1, vx_load(sptr + vtype::nlanes));
             }
-            _mm_storeu_ps(dst + i, s0);
-            _mm_storeu_ps(dst + i + 4, s1);
-            _mm_storeu_ps(dst + i + 8, s2);
-            _mm_storeu_ps(dst + i + 12, s3);
+            v_store(dst + i, s0);
+            v_store(dst + i + vtype::nlanes, s1);
+            i += 2*vtype::nlanes;
         }
-
-        for( ; i <= width - 4; i += 4 )
+        if( i <= width - vtype::nlanes )
         {
-            __m128 s0 = _mm_loadu_ps(src[0] + i), x0;
-
+            vtype s0 = vx_load(src[0] + i);
             for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_loadu_ps(src[k] + i);
-                s0 = updateOp(s0, x0);
-            }
-            _mm_storeu_ps(dst + i, s0);
+                s0 = updateOp(s0, vx_load(src[k] + i));
+            v_store(dst + i, s0);
+            i += vtype::nlanes;
         }
-
-        for( ; i < width; i++ )
+        if( i <= width - vtype::nlanes/2 )
         {
-            __m128 s0 = _mm_load_ss(src[0] + i), x0;
-
+            vtype s0 = vx_load_low(src[0] + i);
             for( k = 1; k < nz; k++ )
-            {
-                x0 = _mm_load_ss(src[k] + i);
-                s0 = updateOp(s0, x0);
-            }
-            _mm_store_ss(dst + i, s0);
+                s0 = updateOp(s0, vx_load_low(src[k] + i));
+            v_store_low(dst + i, s0);
+            i += vtype::nlanes/2;
         }
-
         return i;
     }
 };

-struct VMin8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_min_epu8(a,b); }
-};
-struct VMax8u
-{
-    enum { ESZ = 1 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const { return _mm_max_epu8(a,b); }
-};
-struct VMin16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_subs_epu16(a,_mm_subs_epu16(a,b)); }
-};
-struct VMax16u
-{
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_adds_epu16(_mm_subs_epu16(a,b), b); }
-};
-struct VMin16s
+template<class T> struct VMin
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_min_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); }
 };
-struct VMax16s
+template<class T> struct VMax
 {
-    enum { ESZ = 2 };
-    __m128i operator()(const __m128i& a, const __m128i& b) const
-    { return _mm_max_epi16(a, b); }
+    typedef T vtype;
+    vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); }
 };
-struct VMin32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_min_ps(a,b); }};
-struct VMax32f { __m128 operator()(const __m128& a, const __m128& b) const { return _mm_max_ps(a,b); }};
-
-typedef MorphRowIVec<VMin8u> ErodeRowVec8u;
-typedef MorphRowIVec<VMax8u> DilateRowVec8u;
-typedef MorphRowIVec<VMin16u> ErodeRowVec16u;
-typedef MorphRowIVec<VMax16u> DilateRowVec16u;
-typedef MorphRowIVec<VMin16s> ErodeRowVec16s;
-typedef MorphRowIVec<VMax16s> DilateRowVec16s;
-typedef MorphRowFVec<VMin32f> ErodeRowVec32f;
-typedef MorphRowFVec<VMax32f> DilateRowVec32f;
-
-typedef MorphColumnIVec<VMin8u> ErodeColumnVec8u;
-typedef MorphColumnIVec<VMax8u> DilateColumnVec8u;
-typedef MorphColumnIVec<VMin16u> ErodeColumnVec16u;
-typedef MorphColumnIVec<VMax16u> DilateColumnVec16u;
-typedef MorphColumnIVec<VMin16s> ErodeColumnVec16s;
-typedef MorphColumnIVec<VMax16s> DilateColumnVec16s;
-typedef MorphColumnFVec<VMin32f> ErodeColumnVec32f;
-typedef MorphColumnFVec<VMax32f> DilateColumnVec32f;
-
-typedef MorphIVec<VMin8u> ErodeVec8u;
-typedef MorphIVec<VMax8u> DilateVec8u;
-typedef MorphIVec<VMin16u> ErodeVec16u;
-typedef MorphIVec<VMax16u> DilateVec16u;
-typedef MorphIVec<VMin16s> ErodeVec16s;
-typedef MorphIVec<VMax16s> DilateVec16s;
-typedef MorphFVec<VMin32f> ErodeVec32f;
-typedef MorphFVec<VMax32f> DilateVec32f;
+
+typedef MorphRowVec<VMin<v_uint8> > ErodeRowVec8u;
+typedef MorphRowVec<VMax<v_uint8> > DilateRowVec8u;
+typedef MorphRowVec<VMin<v_uint16> > ErodeRowVec16u;
+typedef MorphRowVec<VMax<v_uint16> > DilateRowVec16u;
+typedef MorphRowVec<VMin<v_int16> > ErodeRowVec16s;
+typedef MorphRowVec<VMax<v_int16> > DilateRowVec16s;
+typedef MorphRowVec<VMin<v_float32> > ErodeRowVec32f;
+typedef MorphRowVec<VMax<v_float32> > DilateRowVec32f;
+
+typedef MorphColumnVec<VMin<v_uint8> > ErodeColumnVec8u;
+typedef MorphColumnVec<VMax<v_uint8> > DilateColumnVec8u;
+typedef MorphColumnVec<VMin<v_uint16> > ErodeColumnVec16u;
+typedef MorphColumnVec<VMax<v_uint16> > DilateColumnVec16u;
+typedef MorphColumnVec<VMin<v_int16> > ErodeColumnVec16s;
+typedef MorphColumnVec<VMax<v_int16> > DilateColumnVec16s;
+typedef MorphColumnVec<VMin<v_float32> > ErodeColumnVec32f;
+typedef MorphColumnVec<VMax<v_float32> > DilateColumnVec32f;
+
+typedef MorphVec<VMin<v_uint8> > ErodeVec8u;
+typedef MorphVec<VMax<v_uint8> > DilateVec8u;
+typedef MorphVec<VMin<v_uint16> > ErodeVec16u;
+typedef MorphVec<VMax<v_uint16> > DilateVec16u;
+typedef MorphVec<VMin<v_int16> > ErodeVec16s;
+typedef MorphVec<VMax<v_int16> > DilateVec16s;
+typedef MorphVec<VMin<v_float32> > ErodeVec32f;
+typedef MorphVec<VMax<v_float32> > DilateVec32f;

 #else
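
Note (not part of the patch): for reviewers less familiar with the universal-intrinsics API adopted above, the standalone sketch below shows the erode-row pattern written directly against v_uint8/v_min, i.e. roughly what MorphRowVec<VMin<v_uint8> > expands to for the single-channel 8-bit case. The function name erodeRowU8 is illustrative only, and the caller is assumed to supply a source row padded with ksize-1 extra pixels, as the row-filter contract in this file requires.

// Illustrative sketch only, assuming an OpenCV build with CV_SIMD enabled.
#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

// Running minimum over a horizontal window of ksize pixels (8-bit, 1 channel):
// the operation ErodeRowVec8u performs, written out explicitly.
// src must contain width + ksize - 1 valid pixels.
static void erodeRowU8(const uchar* src, uchar* dst, int width, int ksize)
{
    int i = 0;
#if CV_SIMD
    // v_uint8::nlanes is 16, 32 or 64 depending on the SIMD width selected at
    // build time (SSE2/NEON, AVX2, AVX-512) -- the same quantity the patch
    // uses as vtype::nlanes.
    for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
    {
        v_uint8 s = vx_load(src + i);
        for( int k = 1; k < ksize; k++ )
            s = v_min(s, vx_load(src + i + k));   // the op VMin<v_uint8> wraps
        v_store(dst + i, s);
    }
#endif
    for( ; i < width; i++ )                       // scalar tail
    {
        uchar m = src[i];
        for( int k = 1; k < ksize; k++ )
            m = std::min(m, src[i + k]);
        dst[i] = m;
    }
}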