diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05696527d59e32406c705f8785db749368a165f0..deed0a64851bd605156dbc427cf7b6dd39aa91cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -587,6 +587,11 @@
 include(cmake/OpenCVFindMatlab.cmake)
 include(cmake/OpenCVDetectVTK.cmake)
+if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
+  get_filename_component(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" ABSOLUTE)
+  get_filename_component(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" ABSOLUTE)
+endif()
+
 
 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
 # ----------------------------------------------------------------------------
diff --git a/cmake/templates/custom_hal.hpp.in b/cmake/templates/custom_hal.hpp.in
new file mode 100644
index 0000000000000000000000000000000000000000..b298a033ec0e8bc109e43ecd85bcbc8f88d11de2
--- /dev/null
+++ b/cmake/templates/custom_hal.hpp.in
@@ -0,0 +1,6 @@
+#ifndef _CUSTOM_HAL_INCLUDED_
+#define _CUSTOM_HAL_INCLUDED_
+
+@OPENCV_HAL_HEADERS_INCLUDES@
+
+#endif
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index a3c40f56011b8a46af7b9071d46e3741ee07cc27..a8a0b23e1225b648dd0b695efcd271836bbdfb6a 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -679,89 +679,8 @@ CV_EXPORTS void setUseIPP(bool flag);
 
 //! @} core_utils
 
-//! @addtogroup core_utils_neon
-//! @{
-
-#if CV_NEON
-
-inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
-{
-    static int32x2_t v_sign = vdup_n_s32(1 << 31),
-        v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));
-
-    int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
-    return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
-}
-
-inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
-{
-    static int32x4_t v_sign = vdupq_n_s32(1 << 31),
-        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-
-    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
-    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
-}
-
-inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v)
-{
-    static float32x2_t v_05 = vdup_n_f32(0.5f);
-    return vcvt_u32_f32(vadd_f32(v, v_05));
-}
-
-inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
-{
-    static float32x4_t v_05 = vdupq_n_f32(0.5f);
-    return vcvtq_u32_f32(vaddq_f32(v, v_05));
-}
-
-inline float32x4_t cv_vrecpq_f32(float32x4_t val)
-{
-    float32x4_t reciprocal = vrecpeq_f32(val);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    return reciprocal;
-}
-
-inline float32x2_t cv_vrecp_f32(float32x2_t val)
-{
-    float32x2_t reciprocal = vrecpe_f32(val);
-    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
-    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
-    return reciprocal;
-}
-
-inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
-{
-    float32x4_t e = vrsqrteq_f32(val);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
-    return e;
-}
-
-inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
-{
-    float32x2_t e = vrsqrte_f32(val);
-    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
-    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
-    return e;
-}
-
-inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
-{
-    return cv_vrecpq_f32(cv_vrsqrtq_f32(val));
-}
-
-inline float32x2_t cv_vsqrt_f32(float32x2_t val)
-{
-    return 
cv_vrecp_f32(cv_vrsqrt_f32(val)); -} - -#endif - -//! @} core_utils_neon - } // cv -#include "sse_utils.hpp" +#include "opencv2/hal/neon_utils.hpp" #endif //__OPENCV_CORE_BASE_HPP__ diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 4d7d7df6682965e963e1650bccee658bd77420d6..b66ade5c1724ebfa7c732272da1cd518eed3d074 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -277,37 +277,6 @@ execution time. */ CV_EXPORTS_W int64 getCPUTickCount(); -/** @brief Available CPU features. - -remember to keep this list identical to the one in cvdef.h -*/ -enum CpuFeatures { - CPU_MMX = 1, - CPU_SSE = 2, - CPU_SSE2 = 3, - CPU_SSE3 = 4, - CPU_SSSE3 = 5, - CPU_SSE4_1 = 6, - CPU_SSE4_2 = 7, - CPU_POPCNT = 8, - - CPU_AVX = 10, - CPU_AVX2 = 11, - CPU_FMA3 = 12, - - CPU_AVX_512F = 13, - CPU_AVX_512BW = 14, - CPU_AVX_512CD = 15, - CPU_AVX_512DQ = 16, - CPU_AVX_512ER = 17, - CPU_AVX_512IFMA512 = 18, - CPU_AVX_512PF = 19, - CPU_AVX_512VBMI = 20, - CPU_AVX_512VL = 21, - - CPU_NEON = 100 -}; - /** @brief Returns true if the specified feature is supported by the host hardware. The function returns true if the host hardware supports the specified feature. When user calls diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 6ad72461db7e2f9be277731dbe9d5f5ebe87520e..06cd7916e202c8fa4da4201331089fee6d4bfe59 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -53,1415 +53,462 @@ namespace cv { -struct NOP {}; +/****************************************************************************************\ +* logical operations * +\****************************************************************************************/ + +void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) +{ + int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); + size_t esz = CV_ELEM_SIZE(buftype); + getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); + // unroll the scalar + if( scn < cn ) + { + CV_Assert( scn == 1 ); + size_t esz1 = CV_ELEM_SIZE1(buftype); + for( size_t i = esz1; i < esz; i++ ) + scbuf[i] = scbuf[i - esz1]; + } + for( size_t i = esz; i < blocksize*esz; i++ ) + scbuf[i] = scbuf[i - esz]; +} -#if CV_SSE2 || CV_NEON -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} +enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, + OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, + OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, + OCL_OP_RDIV_SCALE=15 }; -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif +#ifdef HAVE_OPENCL -#endif +static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", + "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", + "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz) +static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, bool bitwise, int oclop, bool haveScalar ) { -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; + bool haveMask = !_mask.empty(); + int srctype = _src1.type(); 
+ int srcdepth = CV_MAT_DEPTH(srctype); + int cn = CV_MAT_CN(srctype); - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || + (!doubleSupport && srcdepth == CV_64F && !bitwise)) + return false; -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif + char opts[1024]; + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); + int scalarcn = kercn == 3 ? 4 : kercn; + int rowsPerWI = d.isIntel() ? 4 : 1; -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif + sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", + haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), + kercn, rowsPerWI); - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); + if (k.empty()) + return false; -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op op; + UMat src1 = _src1.getUMat(), src2; + UMat dst = _dst.getUMat(), mask = _mask.getUMat(); + + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); + ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : + ocl::KernelArg::WriteOnly(dst, cn, kercn); + ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) + if( haveScalar ) { - int x = 0; + size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; + double buf[4] = {0,0,0,0}; -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) + if( oclop != OCL_OP_NOT ) { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } + Mat src2sc = _src2.getMat(); + convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); } -#endif // CV_AVX2 -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif + if( !haveMask ) + k.args(src1arg, dstarg, scalararg); + else + k.args(src1arg, maskarg, dstarg, scalararg); + } + else + { + src2 = _src2.getUMat(); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); + if( !haveMask ) + k.args(src1arg, src2arg, dstarg); + else + k.args(src1arg, src2arg, maskarg, dstarg); } + + size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, 0, false); } +#endif -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) +static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, const BinaryFuncC* tab, + bool bitwise, int oclop ) { -#if CV_SSE2 - Op64 op64; + const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; + int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + int type1 = psrc1->type(), depth1 = 
CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); + int dims1 = psrc1->dims(), dims2 = psrc2->dims(); + Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); + Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); +#ifdef HAVE_OPENCL + bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && + dims1 <= 2 && dims2 <= 2; #endif - Op op; + bool haveMask = !_mask.empty(), haveScalar = false; + BinaryFuncC func; - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) + if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) { - int x = 0; + _dst.create(sz1, type1); + CV_OCL_RUN(use_opencl, + ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) + if( bitwise ) { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } + func = *tab; + cn = (int)CV_ELEM_SIZE(type1); } -#endif + else + func = tab[depth1]; - for( ; x <= sz.width - 4; x += 4 ) + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); + Size sz = getContinuousSize(src1, src2, dst); + size_t len = sz.width*(size_t)cn; + if( len == (size_t)(int)len ) { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; + sz.width = (int)len; + func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, 0); + return; } - - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ } -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const 
VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ + if( oclop == OCL_OP_NOT ) + haveScalar = true; + else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || + !psrc1->sameSize(*psrc2) || type1 != type2 ) + { + if( checkScalar(*psrc1, type2, kind1, kind2) ) + { + // src1 is a scalar; swap it with src2 + swap(psrc1, psrc2); + swap(type1, type2); + swap(depth1, depth2); + swap(cn, cn2); + swap(sz1, sz2); + } + else if( !checkScalar(*psrc2, type1, kind2, kind1) ) + CV_Error( CV_StsUnmatchedSizes, + "The operation is neither 'array op array' (where arrays have the same size and type), " + "nor 'array op scalar', nor 'scalar op array'" ); + haveScalar = true; } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); 
-FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + else + { + CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); } -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } + size_t esz = CV_ELEM_SIZE(type1); + size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; + BinaryFunc copymask = 0; + bool reallocate = false; -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ + if( haveMask ) + { + int mtype = _mask.type(); + CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); + copymask = getCopyMaskFunc(esz); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; } -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } + AutoBuffer _buf; + uchar *scbuf = 0, *maskbuf = 0; -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, 
_mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff 
}; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif + _dst.createSameSize(*psrc1, type1); + // if this is mask operation and dst has been reallocated, + // we have to clear the destination + if( haveMask && reallocate ) + _dst.setTo(0.); -#if CV_NEON + CV_OCL_RUN(use_opencl, + ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); + Mat dst = _dst.getMat(), mask = _mask.getMat(); -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ + if( bitwise ) + { + func = *tab; + cn = (int)esz; } + else + func = tab[depth1]; -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); -FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); -FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VSub); 
-FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); -#endif + if( !haveScalar ) + { + const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; + uchar* ptrs[4]; -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif + NAryMatIterator it(arrays, ptrs); + size_t total = it.size, blocksize = total; -template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a + b); } -template<> inline uchar OpSub::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a - b); } + if( blocksize*cn > INT_MAX ) + blocksize = INT_MAX/cn; -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return (T)std::abs(a - b); } -}; - -template<> inline short OpAbsDiff::operator ()(short a, short b) const -{ return saturate_cast(std::abs(a - b)); } - -template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const -{ return saturate_cast(std::abs(a - b)); } - -template struct OpAbsDiffS -{ - typedef T type1; - typedef WT type2; - typedef T rtype; - T operator()(T a, WT b) const { return saturate_cast(std::abs(a - b)); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( sz.height == 1 ) - step1 = step2 = step = 
sz.width*elemSize; -} -#endif - -static void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) + if( haveMask ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + blocksize = std::min(blocksize, blocksize0); + _buf.allocate(blocksize*esz); + maskbuf = _buf; } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} -static void add8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} - -static void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) + for( size_t i = 0; i < it.nplanes; i++, ++it ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + for( size_t j = 0; j < total; j += blocksize ) + { + int bsz = (int)MIN(total - j, blocksize); -static void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, bsz*cn, 1, 0 ); + if( haveMask ) + { + copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); + ptrs[3] += bsz; + } + + bsz *= (int)esz; + ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; + } } - setIppErrorStatus(); } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + else + { + const Mat* arrays[] = { &src1, &dst, &mask, 0 }; + uchar* ptrs[3]; -static void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} + NAryMatIterator it(arrays, ptrs); + size_t total = it.size, blocksize = std::min(total, blocksize0); -static void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + _buf.allocate(blocksize*(haveMask ? 
2 : 1)*esz + 32); + scbuf = _buf; + maskbuf = alignPtr(scbuf + blocksize*esz, 16); -static void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} + convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); -static void sub8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) + for( size_t i = 0; i < it.nplanes; i++, ++it ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} + for( size_t j = 0; j < total; j += blocksize ) + { + int bsz = (int)MIN(total - j, blocksize); -static void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); -} + func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, bsz*cn, 1, 0 ); + if( haveMask ) + { + copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); + ptrs[2] += bsz; + } -static void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + bsz *= (int)esz; + ptrs[0] += bsz; ptrs[1] += bsz; + } } - setIppErrorStatus(); } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); } -static void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +static BinaryFuncC* getMaxTab() { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + static BinaryFuncC maxTab[] = { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32f), (BinaryFuncC)cv::hal::max64f, + 0 + }; -static void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); + return maxTab; } -static void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) +static BinaryFuncC* getMinTab() { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + static BinaryFuncC minTab[] = { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, 
sz)); -} + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32f), (BinaryFuncC)cv::hal::min64f, + 0 + }; -static void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); + return minTab; } -template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } - -static void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); } -static void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) +void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) { - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::and8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_AND); } -static void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::or8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_OR); } -static void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) { - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::xor8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); } -static void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) +void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) { - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::not8u); + binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); } -static void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, 
size_t step, Size sz, void* ) +void cv::max( InputArray src1, InputArray src2, OutputArray dst ) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) +void cv::min( InputArray src1, InputArray src2, OutputArray dst ) { -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) +void cv::max(const Mat& src1, const Mat& src2, Mat& dst) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) +void cv::min(const Mat& src1, const Mat& src2, Mat& dst) { - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +void cv::max(const UMat& src1, const UMat& src2, UMat& dst) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, 
noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void min16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +void cv::min(const UMat& src1, const UMat& src2, UMat& dst) { - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} -static void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} +/****************************************************************************************\ +* add/subtract * +\****************************************************************************************/ -static void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) +namespace cv { -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} -static void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) +static int actualScalarDepth(const double* data, int len) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + int i = 0, minval = INT_MAX, maxval = INT_MIN; + for(; i < len; ++i) { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); + int ival = cvRound(data[i]); + if( ival != data[i] ) + break; + minval = MIN(minval, ival); + maxval = MAX(maxval, ival); } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); + return i < len ? CV_64F : + minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : + minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : + minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : + minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? 
CV_16S : + CV_32S; } -static void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} +#ifdef HAVE_OPENCL -static void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int wtype, + void* usrdata, int oclop, + bool haveScalar ) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + bool haveMask = !_mask.empty(); -static void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, sz)); -} + if ( (haveMask || haveScalar) && cn > 4 ) + return false; -static void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, sz)); -} + int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); + if (!doubleSupport) + wdepth = std::min(wdepth, CV_32F); -static void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* 
dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, sz)); -} + wtype = CV_MAKETYPE(wdepth, cn); + int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); + if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) + return false; -static void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2; - if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, sz)); -} + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); + int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; -/****************************************************************************************\ -* logical operations * -\****************************************************************************************/ + char cvtstr[4][32], opts[1024]; + sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " + "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", + (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), + oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), + ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), + ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), + ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), + ocl::typeToStr(wdepth), wdepth, + ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), + ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), + ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, + oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? + ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); -void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) -{ - int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); - size_t esz = CV_ELEM_SIZE(buftype); - getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); - // unroll the scalar - if( scn < cn ) + size_t usrdata_esz = CV_ELEM_SIZE(wdepth); + const uchar* usrdata_p = (const uchar*)usrdata; + const double* usrdata_d = (const double*)usrdata; + float usrdata_f[3]; + int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || + oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 
3 : 0; + if( n > 0 && wdepth == CV_32F ) { - CV_Assert( scn == 1 ); - size_t esz1 = CV_ELEM_SIZE1(buftype); - for( size_t i = esz1; i < esz; i++ ) - scbuf[i] = scbuf[i - esz1]; + for( i = 0; i < n; i++ ) + usrdata_f[i] = (float)usrdata_d[i]; + usrdata_p = (const uchar*)usrdata_f; } - for( size_t i = esz; i < blocksize*esz; i++ ) - scbuf[i] = scbuf[i - esz]; -} - - -enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, - OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, - OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, - OCL_OP_RDIV_SCALE=15 }; - -#ifdef HAVE_OPENCL - -static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", - "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", - "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; - -static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, bool bitwise, int oclop, bool haveScalar ) -{ - bool haveMask = !_mask.empty(); - int srctype = _src1.type(); - int srcdepth = CV_MAT_DEPTH(srctype); - int cn = CV_MAT_CN(srctype); - - const ocl::Device d = ocl::Device::getDefault(); - bool doubleSupport = d.doubleFPConfig() > 0; - if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || - (!doubleSupport && srcdepth == CV_64F && !bitwise)) - return false; - - char opts[1024]; - int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - int scalarcn = kercn == 3 ? 4 : kercn; - int rowsPerWI = d.isIntel() ? 4 : 1; - - sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", - haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], - bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", - bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), - bitwise ? 
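// Side note on the usrdata handling just above: the scale factor (for the
// *_SCALE ops) or the three addWeighted coefficients arrive as doubles, and
// when the kernel's work depth is CV_32F they are narrowed to float before
// being bound as raw ocl::KernelArg buffers, so a device without double
// support never sees a 64-bit value. The narrowing itself is nothing more than
// the hypothetical helper below:

static void narrow_usrdata(const double* src, float* dst, int n)
{
    // n is 1 for the *_SCALE operations and 3 for addWeighted (alpha, beta, gamma)
    for (int i = 0; i < n; i++)
        dst[i] = (float)src[i];
}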
ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), - kercn, rowsPerWI); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if (k.empty()) @@ -1477,19 +524,24 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; - double buf[4] = {0,0,0,0}; - - if( oclop != OCL_OP_NOT ) - { - Mat src2sc = _src2.getMat(); - convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); - } + size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; + double buf[4]={0,0,0,0}; + Mat src2sc = _src2.getMat(); + if( !src2sc.empty() ) + convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) - k.args(src1arg, dstarg, scalararg); + { + if(n == 0) + k.args(src1arg, dstarg, scalararg); + else if(n == 1) + k.args(src1arg, dstarg, scalararg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); + else + CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); + } else k.args(src1arg, maskarg, dstarg, scalararg); } @@ -1499,121 +551,176 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) - k.args(src1arg, src2arg, dstarg); + { + if (n == 0) + k.args(src1arg, src2arg, dstarg); + else if (n == 1) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); + else if (n == 3) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); + else + CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); + } else k.args(src1arg, src2arg, maskarg, dstarg); } size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, 0, false); + return k.run(2, globalsize, NULL, false); } #endif -static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, const BinaryFunc* tab, - bool bitwise, int oclop ) +static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false, + void* usrdata=0, int oclop=-1 ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + bool haveMask = !_mask.empty(); + bool reallocate = false; int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); - int dims1 = psrc1->dims(), dims2 = psrc2->dims(); + int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); Size sz2 = dims2 <= 2 ? 
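// The same-size/same-type fast path above collapses the matrices with
// getContinuousSize() and hands the whole (possibly single-row) block to the
// row-based function from the table in one call. A compact stand-alone sketch
// of that idea, assuming the OpenCV headers; add8u_rows is a hypothetical row
// kernel, not the real HAL entry point:

#include <opencv2/core.hpp>

static void add8u_rows(const unsigned char* a, const unsigned char* b,
                       unsigned char* c, int len)
{
    for (int i = 0; i < len; i++)
        c[i] = cv::saturate_cast<unsigned char>(a[i] + b[i]);
}

static void add8u_fast(const cv::Mat& a, const cv::Mat& b, cv::Mat& c)
{
    CV_Assert(a.type() == CV_8UC1 && a.type() == b.type() &&
              a.rows == b.rows && a.cols == b.cols);
    c.create(a.rows, a.cols, a.type());

    // If all matrices are continuous, treat them as one long row.
    int rows = a.rows, len = a.cols;
    if (a.isContinuous() && b.isContinuous() && c.isContinuous())
    {
        len *= rows;
        rows = 1;
    }
    for (int y = 0; y < rows; y++)
        add8u_rows(a.ptr<unsigned char>(y), b.ptr<unsigned char>(y),
                   c.ptr<unsigned char>(y), len);
}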
psrc2->size() : Size(); #ifdef HAVE_OPENCL - bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && - dims1 <= 2 && dims2 <= 2; + bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; #endif - bool haveMask = !_mask.empty(), haveScalar = false; - BinaryFunc func; + bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); + bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); - if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) + if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && + !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || + (_dst.fixedType() && _dst.type() == type1)) && + ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) { - _dst.create(sz1, type1); + _dst.createSameSize(*psrc1, type1); CV_OCL_RUN(use_opencl, - ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) - - if( bitwise ) - { - func = *tab; - cn = (int)CV_ELEM_SIZE(type1); - } - else - func = tab[depth1]; + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, + (!usrdata ? type1 : std::max(depth1, CV_32F)), + usrdata, oclop, false)) Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst); - size_t len = sz.width*(size_t)cn; - if( len == (size_t)(int)len ) - { - sz.width = (int)len; - func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0); - return; - } + Size sz = getContinuousSize(src1, src2, dst, src1.channels()); + tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); + return; } - if( oclop == OCL_OP_NOT ) - haveScalar = true; - else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || - !psrc1->sameSize(*psrc2) || type1 != type2 ) + bool haveScalar = false, swapped12 = false; + + if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || + (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || + (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) { if( checkScalar(*psrc1, type2, kind1, kind2) ) { // src1 is a scalar; swap it with src2 swap(psrc1, psrc2); + swap(sz1, sz2); swap(type1, type2); swap(depth1, depth2); swap(cn, cn2); - swap(sz1, sz2); + swap(dims1, dims2); + swapped12 = true; + if( oclop == OCL_OP_SUB ) + oclop = OCL_OP_RSUB; + if ( oclop == OCL_OP_DIV_SCALE ) + oclop = OCL_OP_RDIV_SCALE; } else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, - "The operation is neither 'array op array' (where arrays have the same size and type), " - "nor 'array op scalar', nor 'scalar op array'" ); + "The operation is neither 'array op array' " + "(where arrays have the same size and the same number of channels), " + "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; + CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); + + if (!muldiv) + { + Mat sc = psrc2->getMat(); + depth2 = actualScalarDepth(sc.ptr(), cn); + if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) + depth2 = CV_32F; + } + else + depth2 = CV_64F; + } + + if( dtype < 0 ) + { + if( _dst.fixedType() ) + dtype = _dst.type(); + else + { + if( !haveScalar && type1 != type2 ) + CV_Error(CV_StsBadArg, + "When the input arrays in add/subtract/multiply/divide functions have different types, " + "the output array type must be explicitly specified"); + dtype = type1; + } + } + dtype = 
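// Why the SUB -> RSUB / DIV_SCALE -> RDIV_SCALE switch above is needed: when
// the *first* operand is the scalar, the operands are swapped so that src1 is
// always the array, and the op code is reversed so the kernel still computes
// "scalar - array" (or "scalar / array") rather than the other way round.
// A tiny scalar-only illustration (SketchOp/apply are hypothetical names):

enum SketchOp { SKETCH_OP_SUB, SKETCH_OP_RSUB };

static int apply(SketchOp op, int array_elem, int scalar)
{
    // SKETCH_OP_SUB computes array - scalar, SKETCH_OP_RSUB computes scalar - array.
    return op == SKETCH_OP_SUB ? array_elem - scalar : scalar - array_elem;
}

// apply(SKETCH_OP_SUB, 12, 5)  == 7    ("array - 5" after the operand swap)
// apply(SKETCH_OP_RSUB, 12, 5) == -7   ("5 - array" after the operand swap)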
CV_MAT_DEPTH(dtype); + + if( depth1 == depth2 && dtype == depth1 ) + wtype = dtype; + else if( !muldiv ) + { + wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : + depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); + wtype = std::max(wtype, dtype); + + // when the result of addition should be converted to an integer type, + // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, + // instead of converting the other input to floating-point and then converting the operation result back to integers. + if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) ) + wtype = CV_32S; } else { - CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); + wtype = std::max(depth1, std::max(depth2, CV_32F)); + wtype = std::max(wtype, dtype); } - size_t esz = CV_ELEM_SIZE(type1); - size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; - BinaryFunc copymask = 0; - bool reallocate = false; + dtype = CV_MAKETYPE(dtype, cn); + wtype = CV_MAKETYPE(wtype, cn); if( haveMask ) { int mtype = _mask.type(); - CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); - copymask = getCopyMaskFunc(esz); - reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; + CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; } - AutoBuffer _buf; - uchar *scbuf = 0, *maskbuf = 0; - - _dst.createSameSize(*psrc1, type1); - // if this is mask operation and dst has been reallocated, - // we have to clear the destination - if( haveMask && reallocate ) + _dst.createSameSize(*psrc1, dtype); + if( reallocate ) _dst.setTo(0.); CV_OCL_RUN(use_opencl, - ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, + usrdata, oclop, haveScalar)) + BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); + BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); + BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); - Mat dst = _dst.getMat(), mask = _mask.getMat(); + size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); + size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); + size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; + BinaryFunc copymask = getCopyMaskFunc(dsz); + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); - if( bitwise ) - { - func = *tab; - cn = (int)esz; - } - else - func = tab[depth1]; + AutoBuffer _buf; + uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; + size_t bufesz = (cvtsrc1 ? wsz : 0) + + (cvtsrc2 || haveScalar ? wsz : 0) + + (cvtdst ? wsz : 0) + + (haveMask ? 
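// The wtype selection above picks an intermediate "work" depth wide enough to
// hold the unsaturated intermediate result: two 8-bit inputs accumulate in
// 16-bit signed, anything up to CV_32S accumulates in CV_32S, and
// multiply/divide always work in at least CV_32F. The helper below merely
// paraphrases those rules with OpenCV's depth constants as a reading aid; it
// is not the exact code path.

#include <opencv2/core.hpp>
#include <algorithm>

static int pick_work_depth(int depth1, int depth2, int ddepth, bool muldiv)
{
    if (depth1 == depth2 && ddepth == depth1)
        return ddepth;                          // nothing to widen

    if (muldiv)
        return std::max(std::max(depth1, depth2), std::max(ddepth, (int)CV_32F));

    int w = (depth1 <= CV_8S && depth2 <= CV_8S) ? CV_16S :
            (depth1 <= CV_32S && depth2 <= CV_32S) ? CV_32S :
            std::max(depth1, depth2);
    w = std::max(w, ddepth);

    // Integer destination with at least one integer input: stay integral
    // instead of bouncing through floating point.
    if (ddepth < CV_32F && (depth1 < CV_32F || depth2 < CV_32F))
        w = CV_32S;
    return w;
}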
dsz : 0); + BinaryFuncC func = tab[CV_MAT_DEPTH(wtype)]; if( !haveScalar ) { @@ -1623,31 +730,62 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; - if( blocksize*cn > INT_MAX ) - blocksize = INT_MAX/cn; + if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) + blocksize = std::min(blocksize, blocksize0); + _buf.allocate(bufesz*blocksize + 64); + buf = _buf; + if( cvtsrc1 ) + buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + if( cvtsrc2 ) + buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + wbuf = maskbuf = buf; + if( cvtdst ) + buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) - { - blocksize = std::min(blocksize, blocksize0); - _buf.allocate(blocksize*esz); - maskbuf = _buf; - } + maskbuf = buf; for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); - - func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); - if( haveMask ) + Size bszn(bsz*cn, 1); + const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; + uchar* dptr = ptrs[2]; + if( cvtsrc1 ) { - copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); - ptrs[3] += bsz; + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; + } + if( ptrs[0] == ptrs[1] ) + sptr2 = sptr1; + else if( cvtsrc2 ) + { + cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); + sptr2 = buf2; } - bsz *= (int)esz; - ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; + if( !haveMask && !cvtdst ) + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); + else + { + func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata ); + if( !haveMask ) + cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); + else if( !cvtdst ) + { + copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[3] += bsz; + } + else + { + cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); + copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[3] += bsz; + } + } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; } } } @@ -1659,3213 +797,285 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32); - scbuf = _buf; - maskbuf = alignPtr(scbuf + blocksize*esz, 16); - - convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - - func( ptrs[0], 0, scbuf, 0, haveMask ? 
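// The plane/block loop above keeps temporary memory bounded: the AutoBuffer is
// sized from BLOCK_SIZE and the element sizes, each block of source data is
// converted to the work type (when needed), the kernel runs on the block, and
// the result is converted back and optionally merged through the mask. With
// the conversion and mask plumbing stripped away, the blocking itself looks
// like this (process_block is a hypothetical stand-in for the table function):

#include <algorithm>
#include <cstddef>
#include <vector>

static void process_block(const float* a, const float* b, float* c, size_t n)
{
    for (size_t i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}

static void process_in_blocks(const float* a, const float* b, float* c,
                              size_t total, size_t block_size)
{
    std::vector<float> work(block_size);        // plays the role of the AutoBuffer

    for (size_t j = 0; j < total; j += block_size)
    {
        size_t bsz = std::min(block_size, total - j);
        process_block(a + j, b + j, work.data(), bsz);     // op into the work buffer
        std::copy(work.data(), work.data() + bsz, c + j);  // copy / "convert back"
    }
}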
maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); - if( haveMask ) - { - copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); - ptrs[2] += bsz; - } - - bsz *= (int)esz; - ptrs[0] += bsz; ptrs[1] += bsz; - } - } - } -} - -static BinaryFunc* getMaxTab() -{ - static BinaryFunc maxTab[] = - { - (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s), - (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s), - (BinaryFunc)GET_OPTIMIZED(max32s), - (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, - 0 - }; - - return maxTab; -} - -static BinaryFunc* getMinTab() -{ - static BinaryFunc minTab[] = - { - (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s), - (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s), - (BinaryFunc)GET_OPTIMIZED(min32s), - (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, - 0 - }; - - return minTab; -} - -} - -void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_AND); -} - -void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_OR); -} - -void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); -} - -void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); - binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); -} - -void cv::max( InputArray src1, InputArray src2, OutputArray dst ) -{ - binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min( InputArray src1, InputArray src2, OutputArray dst ) -{ - binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - -void cv::max(const Mat& src1, const Mat& src2, Mat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min(const Mat& src1, const Mat& src2, Mat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - -void cv::max(const UMat& src1, const UMat& src2, UMat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min(const UMat& src1, const UMat& src2, UMat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - - -/****************************************************************************************\ -* add/subtract * -\****************************************************************************************/ - -namespace cv -{ - -static int actualScalarDepth(const double* data, int len) -{ - int i = 0, minval = INT_MAX, maxval = INT_MIN; - for(; i < len; ++i) - { - int ival = cvRound(data[i]); - if( ival != data[i] ) - break; - minval = MIN(minval, ival); - maxval = MAX(maxval, ival); - } - return i < len ? CV_64F : - minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : - minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : - minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : - minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? 
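// actualScalarDepth() above inspects the scalar's channel values and returns
// the narrowest depth that represents all of them exactly (CV_64F as soon as
// any value is non-integral), so that "array op scalar" can reuse the same
// narrow kernels a small array would use. Expected results for a 4-channel
// scalar, as a usage note:
//
//   { 1, 2, 3, 0 }      ->  CV_8U   (all values fit in [0, 255])
//   { -1, 2, 3, 0 }     ->  CV_8S   (fits in [-128, 127])
//   { 70000, 0, 0, 0 }  ->  CV_32S  (exceeds both 16-bit ranges)
//   { 0.5, 0, 0, 0 }    ->  CV_64F  (non-integral value)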
CV_16S : - CV_32S; -} - -#ifdef HAVE_OPENCL - -static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int wtype, - void* usrdata, int oclop, - bool haveScalar ) -{ - const ocl::Device d = ocl::Device::getDefault(); - bool doubleSupport = d.doubleFPConfig() > 0; - int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); - bool haveMask = !_mask.empty(); - - if ( (haveMask || haveScalar) && cn > 4 ) - return false; - - int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); - if (!doubleSupport) - wdepth = std::min(wdepth, CV_32F); - - wtype = CV_MAKETYPE(wdepth, cn); - int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); - if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) - return false; - - int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; - - char cvtstr[4][32], opts[1024]; - sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " - "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", - (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), - oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), - ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), - ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), - ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), - ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), - ocl::typeToStr(wdepth), wdepth, - ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), - ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), - ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, - oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? - ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); - - size_t usrdata_esz = CV_ELEM_SIZE(wdepth); - const uchar* usrdata_p = (const uchar*)usrdata; - const double* usrdata_d = (const double*)usrdata; - float usrdata_f[3]; - int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || - oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0; - if( n > 0 && wdepth == CV_32F ) - { - for( i = 0; i < n; i++ ) - usrdata_f[i] = (float)usrdata_d[i]; - usrdata_p = (const uchar*)usrdata_f; - } - - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); - if (k.empty()) - return false; - - UMat src1 = _src1.getUMat(), src2; - UMat dst = _dst.getUMat(), mask = _mask.getUMat(); - - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); - ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : - ocl::KernelArg::WriteOnly(dst, cn, kercn); - ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); - - if( haveScalar ) - { - size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; - double buf[4]={0,0,0,0}; - Mat src2sc = _src2.getMat(); - - if( !src2sc.empty() ) - convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); - ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); - - if( !haveMask ) - { - if(n == 0) - k.args(src1arg, dstarg, scalararg); - else if(n == 1) - k.args(src1arg, dstarg, scalararg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); - else - CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); - } - else - k.args(src1arg, maskarg, dstarg, scalararg); - } - else - { - src2 = _src2.getUMat(); - ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); - - if( !haveMask ) - { - if (n == 0) - k.args(src1arg, src2arg, dstarg); - else if (n == 1) - k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); - else if (n == 3) - k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), - ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), - ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); - else - CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); - } - else - k.args(src1arg, src2arg, maskarg, dstarg); - } - - size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, - void* usrdata=0, int oclop=-1 ) -{ - const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; - int kind1 = psrc1->kind(), kind2 = psrc2->kind(); - bool haveMask = !_mask.empty(); - bool reallocate = false; - int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); - int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); - int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); - Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); - Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); -#ifdef HAVE_OPENCL - bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; -#endif - bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); - bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); - - if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && - !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || - (_dst.fixedType() && _dst.type() == type1)) && - ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) - { - _dst.createSameSize(*psrc1, type1); - CV_OCL_RUN(use_opencl, - ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, - (!usrdata ? 
type1 : std::max(depth1, CV_32F)), - usrdata, oclop, false)) - - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata); - return; - } - - bool haveScalar = false, swapped12 = false; - - if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || - (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || - (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) - { - if( checkScalar(*psrc1, type2, kind1, kind2) ) - { - // src1 is a scalar; swap it with src2 - swap(psrc1, psrc2); - swap(sz1, sz2); - swap(type1, type2); - swap(depth1, depth2); - swap(cn, cn2); - swap(dims1, dims2); - swapped12 = true; - if( oclop == OCL_OP_SUB ) - oclop = OCL_OP_RSUB; - if ( oclop == OCL_OP_DIV_SCALE ) - oclop = OCL_OP_RDIV_SCALE; - } - else if( !checkScalar(*psrc2, type1, kind2, kind1) ) - CV_Error( CV_StsUnmatchedSizes, - "The operation is neither 'array op array' " - "(where arrays have the same size and the same number of channels), " - "nor 'array op scalar', nor 'scalar op array'" ); - haveScalar = true; - CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); - - if (!muldiv) - { - Mat sc = psrc2->getMat(); - depth2 = actualScalarDepth(sc.ptr(), cn); - if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) - depth2 = CV_32F; - } - else - depth2 = CV_64F; - } - - if( dtype < 0 ) - { - if( _dst.fixedType() ) - dtype = _dst.type(); - else - { - if( !haveScalar && type1 != type2 ) - CV_Error(CV_StsBadArg, - "When the input arrays in add/subtract/multiply/divide functions have different types, " - "the output array type must be explicitly specified"); - dtype = type1; - } - } - dtype = CV_MAT_DEPTH(dtype); - - if( depth1 == depth2 && dtype == depth1 ) - wtype = dtype; - else if( !muldiv ) - { - wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : - depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); - wtype = std::max(wtype, dtype); - - // when the result of addition should be converted to an integer type, - // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, - // instead of converting the other input to floating-point and then converting the operation result back to integers. - if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) ) - wtype = CV_32S; - } - else - { - wtype = std::max(depth1, std::max(depth2, CV_32F)); - wtype = std::max(wtype, dtype); - } - - dtype = CV_MAKETYPE(dtype, cn); - wtype = CV_MAKETYPE(wtype, cn); - - if( haveMask ) - { - int mtype = _mask.type(); - CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); - reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; - } - - _dst.createSameSize(*psrc1, dtype); - if( reallocate ) - _dst.setTo(0.); - - CV_OCL_RUN(use_opencl, - ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, - usrdata, oclop, haveScalar)) - - BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); - BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); - BinaryFunc cvtdst = dtype == wtype ? 
0 : getConvertFunc(wtype, dtype); - - size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); - size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); - size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; - BinaryFunc copymask = getCopyMaskFunc(dsz); - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); - - AutoBuffer _buf; - uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; - size_t bufesz = (cvtsrc1 ? wsz : 0) + - (cvtsrc2 || haveScalar ? wsz : 0) + - (cvtdst ? wsz : 0) + - (haveMask ? dsz : 0); - BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; - - if( !haveScalar ) - { - const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; - uchar* ptrs[4]; - - NAryMatIterator it(arrays, ptrs); - size_t total = it.size, blocksize = total; - - if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) - blocksize = std::min(blocksize, blocksize0); - - _buf.allocate(bufesz*blocksize + 64); - buf = _buf; - if( cvtsrc1 ) - buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - if( cvtsrc2 ) - buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - wbuf = maskbuf = buf; - if( cvtdst ) - buf = alignPtr(buf + blocksize*wsz, 16); - if( haveMask ) - maskbuf = buf; - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - Size bszn(bsz*cn, 1); - const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; - uchar* dptr = ptrs[2]; - if( cvtsrc1 ) - { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - if( ptrs[0] == ptrs[1] ) - sptr2 = sptr1; - else if( cvtsrc2 ) - { - cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); - sptr2 = buf2; - } - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); - else - { - func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) - { - copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; - } - else - { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); - copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; - } - } - ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; - } - } - } - else - { - const Mat* arrays[] = { &src1, &dst, &mask, 0 }; - uchar* ptrs[3]; - - NAryMatIterator it(arrays, ptrs); - size_t total = it.size, blocksize = std::min(total, blocksize0); - - _buf.allocate(bufesz*blocksize + 64); - buf = _buf; - if( cvtsrc1 ) - buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); - wbuf = maskbuf = buf; - if( cvtdst ) - buf = alignPtr(buf + blocksize*wsz, 16); - if( haveMask ) - maskbuf = buf; - - convertAndUnrollScalar( src2, wtype, buf2, blocksize); - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - Size bszn(bsz*cn, 1); - const uchar *sptr1 = ptrs[0]; - const uchar* sptr2 = buf2; - uchar* dptr = ptrs[1]; - - if( cvtsrc1 ) - { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - - if( swapped12 ) - std::swap(sptr1, sptr2); - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); - else - { - func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) - { - copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; - } - else - { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, 
bszn, 0 ); - copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; - } - } - ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; - } - } - } -} - -static BinaryFunc* getAddTab() -{ - static BinaryFunc addTab[] = - { - (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s), - (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s), - (BinaryFunc)GET_OPTIMIZED(add32s), - (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, - 0 - }; - - return addTab; -} - -static BinaryFunc* getSubTab() -{ - static BinaryFunc subTab[] = - { - (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s), - (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s), - (BinaryFunc)GET_OPTIMIZED(sub32s), - (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f, - 0 - }; - - return subTab; -} - -static BinaryFunc* getAbsDiffTab() -{ - static BinaryFunc absDiffTab[] = - { - (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s), - (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s), - (BinaryFunc)GET_OPTIMIZED(absdiff32s), - (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f, - 0 - }; - - return absDiffTab; -} - -} - -void cv::add( InputArray src1, InputArray src2, OutputArray dst, - InputArray mask, int dtype ) -{ - arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); -} - -void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray mask, int dtype ) -{ -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra()) - { - int kind1 = _src1.kind(), kind2 = _src2.kind(); - Mat src1 = _src1.getMat(), src2 = _src2.getMat(); - bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2); - bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1); - - if (!src1Scalar && !src2Scalar && - src1.depth() == CV_8U && src2.type() == src1.type() && - src1.dims == 2 && src2.size() == src1.size() && - mask.empty()) - { - if (dtype < 0) - { - if (_dst.fixedType()) - { - dtype = _dst.depth(); - } - else - { - dtype = src1.depth(); - } - } - - dtype = CV_MAT_DEPTH(dtype); - - if (!_dst.fixedType() || dtype == _dst.depth()) - { - _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels())); - - if (dtype == CV_16S) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u16s(src1, src2, dst)) - return; - } - else if (dtype == CV_32F) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u32f(src1, src2, dst)) - return; - } - else if (dtype == CV_8S) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u8s(src1, src2, dst)) - return; - } - } - } - } -#endif - arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB ); -} - -void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) -{ - arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); -} - -/****************************************************************************************\ -* multiply/divide * -\****************************************************************************************/ - -namespace cv -{ - -template -struct Mul_SIMD -{ - int operator() (const T *, const T *, T *, int, WT) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Mul_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + 
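// Every Mul_SIMD specialisation in this block follows one pattern: widen the
// 8/16-bit lanes, multiply (and optionally scale) in float, round to nearest,
// then saturate back down to the destination type. Per element, and without
// any intrinsics, the same computation is simply:

#include <opencv2/core.hpp>   // cv::saturate_cast, cvRound

static unsigned char mul_scaled_u8(unsigned char a, unsigned char b, float scale)
{
    // widen -> multiply in float -> scale -> round -> saturate to [0, 255]
    return cv::saturate_cast<unsigned char>(cvRound((float)a * (float)b * scale));
}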
x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = 
vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - v_dst1 = vmulq_f32(v_dst1, v_scale); - - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - } - - return x; - } -}; - -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = 
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - else - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - } - - return x; - } - - bool haveSSE; -}; - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), 
_mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. ) - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - -template -struct Div_SIMD -{ - int operator() (const T *, const T *, T *, int, double) const - { - return 0; - } -}; - -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; - } -}; - - -#if CV_SIMD128 - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load_expand(src1 + x); - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load_expand(src1 + x); - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = 
v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load(src1 + x); - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load(src1 + x); - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src1 + x); - v_int32x4 t1 = v_load(src1 + x + 4); - v_int32x4 t2 = v_load(src2 + x); - v_int32x4 t3 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t2 == v_zero, v_zero, res0); - res1 = v_select(t3 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int 
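// All of the Div_SIMD / Recip_SIMD specialisations enforce the cv::divide()
// convention that a zero denominator yields 0: the vector lanes compute the
// quotient unconditionally and then select zero wherever the denominator lane
// was zero, exactly as the scalar tails do with a ternary:

static float div_scaled(float num, float denom, float scale)
{
    return denom != 0.f ? num * scale / denom : 0.f;
}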
operator() (const float * src1, const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src1 + x); - v_float32x4 f1 = v_load(src1 + x + 4); - v_float32x4 f2 = v_load(src2 + x); - v_float32x4 f3 = v_load(src2 + x + 4); - - v_float32x4 res0 = f0 * v_scale / f2; - v_float32x4 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -///////////////////////// RECIPROCAL ////////////////////// - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src2, short * dst, int width, double scale) 
const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src2 + x); - v_int32x4 t1 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t0 == v_zero, v_zero, res0); - res1 = v_select(t1 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src2 + x); - v_float32x4 f1 = v_load(src2 + x + 4); - - v_float32x4 res0 = v_scale / f0; - v_float32x4 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - -#if CV_SIMD128_64F - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src1 + x); - v_float64x2 f1 = v_load(src1 + x + 2); - v_float64x2 f2 = v_load(src2 + x); - v_float64x2 f3 = v_load(src2 + x + 2); - - v_float64x2 res0 = f0 * v_scale / f2; - v_float64x2 res1 = f1 * v_scale / f3; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src2 + x); - v_float64x2 f1 = v_load(src2 + x + 2); - - v_float64x2 res0 
= v_scale / f0; - v_float64x2 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -#endif - -#endif - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -recip_i( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -recip_f( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? 
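// Illustrative sketch, not from the patch: div_i/div_f/recip_i/recip_f above pair a SIMD
// main loop (Div_SIMD / Recip_SIMD) with a scalar tail, and both halves honour the same
// per-element contract -- a zero denominator yields 0, everything else is scaled and
// saturated to the destination type.  A scalar reference of that contract, using only
// cv::saturate_cast<> from the public core headers:
#include <opencv2/core.hpp>

template <typename T>
static void div_scale_reference(const T* num, const T* den, T* dst, int len, double scale)
{
    for (int i = 0; i < len; i++)
        dst[i] = den[i] != 0 ? cv::saturate_cast<T>(num[i] * scale / den[i]) : T(0);
}
// e.g. with T = uchar: num = 200, den = 1, scale = 2.0  ->  saturate_cast<uchar>(400.0) == 255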
saturate_cast(scale_f/denom) : (T)0; - } - } -} - - -static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale); -} - -static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); - else - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void 
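// Illustrative sketch, not from the patch: each of the mul* wrappers above tries the IPP
// primitive only when the scale is effectively 1 (tested as |scale - 1| <= FLT_EPSILON,
// since the scale arrives as a double) and otherwise falls through to the generic mul_
// template.  The control shape, with a hypothetical fast_mul_row() standing in for the
// accelerated call:
#include <cfloat>
#include <cmath>

static bool fast_mul_row(const unsigned char*, const unsigned char*, unsigned char*, int)
{
    return false;   // placeholder: a real accelerated path reports success or failure
}

static void mul_row_like(const unsigned char* a, const unsigned char* b,
                         unsigned char* dst, int len, double scale)
{
    if (std::fabs(scale - 1.0) <= FLT_EPSILON && fast_mul_row(a, b, dst, len))
        return;                                    // accelerated path handled the row
    for (int i = 0; i < len; i++)                  // portable fallback with saturation
    {
        int v = (int)(a[i] * b[i] * scale + 0.5);
        dst[i] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}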
div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - - -static BinaryFunc* getMulTab() -{ - static BinaryFunc mulTab[] = - { - (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, - (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f, - (BinaryFunc)mul64f, 0 - }; - - return mulTab; -} - -static BinaryFunc* getDivTab() -{ - static BinaryFunc divTab[] = - { - (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u, - (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f, - (BinaryFunc)div64f, 0 - }; - - return divTab; -} - -static BinaryFunc* getRecipTab() -{ - static BinaryFunc recipTab[] = - { - (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u, - (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f, - (BinaryFunc)recip64f, 0 - }; - - return recipTab; -} - -} - -void cv::multiply(InputArray src1, InputArray src2, - OutputArray dst, double scale, int dtype) -{ - arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), - true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? 
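// Illustrative sketch, not from the patch: getMulTab()/getDivTab()/getRecipTab() above are
// plain per-depth dispatch tables; callers index them with the Mat depth code
// (CV_8U == 0 ... CV_64F == 6, seven real entries plus a terminating 0).  The same idiom
// with ordinary function pointers:
#include <cstdio>

typedef void (*RowKernel)(const void* a, const void* b, void* dst, int len);

static void kernel_8u (const void*, const void*, void*, int) { std::puts("8u kernel");  }
static void kernel_32f(const void*, const void*, void*, int) { std::puts("32f kernel"); }

static RowKernel depthTab[] = { kernel_8u, 0, 0, 0, 0, kernel_32f, 0, 0 };

int main()
{
    const int depth_32f = 5;               // matches OpenCV's depth enumeration
    if (depthTab[depth_32f])
        depthTab[depth_32f](0, 0, 0, 0);   // prints "32f kernel"
}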
OCL_OP_MUL : OCL_OP_MUL_SCALE); -} - -void cv::divide(InputArray src1, InputArray src2, - OutputArray dst, double scale, int dtype) -{ - arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); -} - -void cv::divide(double scale, InputArray src2, - OutputArray dst, int dtype) -{ - arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); -} - -/****************************************************************************************\ -* addWeighted * -\****************************************************************************************/ - -namespace cv -{ - -template -struct AddWeighted_SIMD -{ - int operator() (const T *, const T *, T *, int, WT, WT, WT) const - { - return 0; - } -}; - -#if CV_SSE2 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - - __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); - } - - return x; - } - - bool haveSSE2; -}; - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - 
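// Illustrative sketch, not from the patch: seen from the public API, the plumbing above
// implements the documented element-wise operations.  A short usage example against the
// stable cv:: interface:
#include <opencv2/core.hpp>

static void arithmetic_demo(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
{
    cv::multiply(a, b, out, 1.0 / 255.0);        // out = a .* b * scale
    cv::divide(a, b, out);                       // out = a ./ b, with 0 where b == 0
    cv::divide(2.0, b, out);                     // reciprocal form: out = 2 / b
    cv::addWeighted(a, 0.7, b, 0.3, 10.0, out);  // out = 0.7*a + 0.3*b + 10
}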
return x; - } - - bool haveSSE2; -}; - -#if CV_SSE4_1 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE4_1) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE4_1; -}; - -#endif - -#elif CV_NEON - -template <> -struct AddWeighted_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= width - 8; x += 8 ) - { - int8x8_t in1 = vld1_s8(src1 + x); - int16x8_t in1_16 = vmovl_s8(in1); - float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); - - int8x8_t in2 = vld1_s8(src2+x); - int16x8_t in2_16 = vmovl_s8(in2); - float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); - - int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); - int8x8_t out = vqmovn_s16(out_16); - - vst1_s8(dst + x, out); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const short * src1, 
const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); - } - - return x; - } -}; - -#endif - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - - -static void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, - void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); - __m128i z = _mm_setzero_si128(); - - for( ; x <= size.width - 8; x += 8 ) - { - __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); - __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); - - __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); - __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); - __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); - __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); - - u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); - u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); - u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); - - u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); - u = _mm_packus_epi16(u, u); - - _mm_storel_epi64((__m128i*)(dst + x), u); - } - } -#elif CV_NEON - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= size.width - 8; x += 8 ) - { - uint8x8_t in1 = vld1_u8(src1+x); - uint16x8_t in1_16 = vmovl_u8(in1); - float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); - - uint8x8_t in2 = vld1_u8(src2+x); - uint16x8_t 
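// Illustrative sketch, not from the patch: every addWeighted specialization above (SSE2,
// SSE4.1, NEON and the unrolled scalar tail) evaluates the same per-element expression in
// the wider working type WT and saturates back to T.  A scalar reference:
#include <opencv2/core.hpp>

template <typename T, typename WT>
static void addWeighted_reference(const T* s1, const T* s2, T* dst, int len,
                                  WT alpha, WT beta, WT gamma)
{
    for (int i = 0; i < len; i++)
        dst[i] = cv::saturate_cast<T>(s1[i] * alpha + s2[i] * beta + gamma);
}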
in2_16 = vmovl_u8(in2); - float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); - uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); - - uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); - uint8x8_t out = vqmovn_u16(out_16); - - vst1_u8(dst+x, out); - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < size.width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static BinaryFunc* getAddWeightedTab() -{ - static BinaryFunc addWeightedTab[] = - { - (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u), - (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f, - (BinaryFunc)addWeighted64f, 0 - }; - - return addWeightedTab; -} - -} - -void cv::addWeighted( InputArray src1, double alpha, InputArray src2, - double beta, double gamma, OutputArray dst, int dtype ) -{ - double scalars[] = {alpha, beta, gamma}; - arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); -} - - -/****************************************************************************************\ -* compare * 
-\****************************************************************************************/ - -namespace cv -{ - -template -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int) - { - } - - int operator () (const T *, const T *, uchar *, int) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdupq_n_u8(255); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); - - return x; - } - - int code; - uint8x16_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if 
(code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const float * src1, const float * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -#elif CV_SSE2 - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi8(-1); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int 
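// Illustrative sketch, not from the patch: all of the Cmp_SIMD specializations above
// produce the 0/255 mask that cv::compare() documents.  At the public-API level:
#include <opencv2/core.hpp>

static void compare_demo(const cv::Mat& a, const cv::Mat& b)
{
    cv::Mat mask;
    cv::compare(a, b, mask, cv::CMP_GT);    // CV_8U mask: 255 where a > b, 0 elsewhere
    cv::compare(a, 5.0, mask, cv::CMP_EQ);  // a scalar second operand is also accepted
}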
code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi32(0xffffffff); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; + _buf.allocate(bufesz*blocksize + 64); + buf = _buf; + if( cvtsrc1 ) + buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); + wbuf = maskbuf = buf; + if( cvtdst ) + buf = alignPtr(buf + blocksize*wsz, 16); + if( haveMask ) + maskbuf = buf; - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + convertAndUnrollScalar( src2, wtype, buf2, blocksize); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) + for( size_t i = 0; i < it.nplanes; i++, ++it ) + { + for( size_t j = 0; j < total; j += blocksize ) { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + int bsz = (int)MIN(total - j, blocksize); + Size bszn(bsz*cn, 1); + const uchar *sptr1 = ptrs[0]; + const uchar* sptr2 = buf2; + uchar* dptr = ptrs[1]; - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + if( cvtsrc1 ) + { + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; + } - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + if( swapped12 ) + std::swap(sptr1, sptr2); - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + if( !haveMask && !cvtdst ) + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); + else + { + func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata ); + if( !haveMask ) + cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); + else if( !cvtdst ) + { + copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[2] += bsz; + } + else + { + cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); + copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[2] += bsz; + } + } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; } - - return x; + } } +} - int code; - __m128i v_mask; - bool haveSSE; -}; +static BinaryFuncC* getAddTab() +{ + static BinaryFuncC 
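// Illustrative sketch, not from the patch: the plane/block loop above follows the usual
// arithmetic-core shape -- the scalar operand is expanded once into buf2, then each plane
// is cut into blocks of at most `blocksize` elements so the temporaries stay cache-sized.
// The block walk on its own:
#include <algorithm>
#include <cstddef>

static void walk_in_blocks(size_t total, size_t blocksize)
{
    for (size_t j = 0; j < total; j += blocksize)
    {
        int bsz = (int)std::min(total - j, blocksize);   // the last block may be short
        (void)bsz;  // a real kernel call would process bsz elements starting at offset j
    }
}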
addTab[] = + { + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32f), (BinaryFuncC)cv::hal::add64f, + 0 + }; -#endif + return addTab; +} -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, Size size, int code) +static BinaryFuncC* getSubTab() { - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) + static BinaryFuncC subTab[] = { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32f), (BinaryFuncC)cv::hal::sub64f, + 0 + }; - Cmp_SIMD vop(code); + return subTab; +} - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) +static BinaryFuncC* getAbsDiffTab() +{ + static BinaryFuncC absDiffTab[] = { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32f), (BinaryFuncC)cv::hal::absdiff64f, + 0 + }; + + return absDiffTab; +} + } -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) +void cv::add( InputArray src1, InputArray src2, OutputArray dst, + InputArray mask, int dtype ) { - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? 
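// Illustrative sketch, not from the patch: the new tables dispatch to the HAL kernels
// (cv::hal::add8u and friends), which take raw row pointers, byte steps and an explicit
// width/height pair instead of a cv::Size.  The exact BinaryFuncC typedef lives in the
// module's private headers, so the signature below only shows that calling shape:
#include <cstddef>

static void add8u_like(const unsigned char* src1, size_t step1,
                       const unsigned char* src2, size_t step2,
                       unsigned char* dst, size_t step,
                       int width, int height)
{
    for (; height--; src1 += step1, src2 += step2, dst += step)
        for (int x = 0; x < width; x++)
        {
            int v = src1[x] + src2[x];
            dst[x] = (unsigned char)(v > 255 ? 255 : v);   // saturating 8-bit add
        }
}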
ippCmpLessEq : - (IppCmpOp)-1; + arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); } -#endif -static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray mask, int dtype ) { -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) +#ifdef HAVE_TEGRA_OPTIMIZATION + if (tegra::useTegra()) { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + int kind1 = _src1.kind(), kind2 = _src2.kind(); + Mat src1 = _src1.getMat(), src2 = _src2.getMat(); + bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2); + bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1); - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + if (!src1Scalar && !src2Scalar && + src1.depth() == CV_8U && src2.type() == src1.type() && + src1.dims == 2 && src2.size() == src1.size() && + mask.empty()) { - int x =0; - #if CV_SSE2 - if( USE_SSE2 ) + if (dtype < 0) { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); - __m128i c128 = _mm_set1_epi8 (-128); - for( ; x <= size.width - 16; x += 16 ) + if (_dst.fixedType()) { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - // no simd for 8u comparison, that's why we need the trick - r00 = _mm_sub_epi8(r00,c128); - r10 = _mm_sub_epi8(r10,c128); - - r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x),r00); - + dtype = _dst.depth(); + } + else + { + dtype = src1.depth(); } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); } - #endif + dtype = CV_MAT_DEPTH(dtype); - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) + if (!_dst.fixedType() || dtype == _dst.depth()) { - __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi8 (-1); - for( ; x <= size.width - 16; x += 16 ) + _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels())); + + if (dtype == CV_16S) { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x), r00); + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u16s(src1, src2, dst)) + return; + } + else if (dtype == CV_32F) + { + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u32f(src1, src2, dst)) + return; + } + else if (dtype == CV_8S) + { + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u8s(src1, src2, dst)) + return; } } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } +#endif + arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB ); } -static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) { - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); } -static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +/****************************************************************************************\ +* multiply/divide * +\****************************************************************************************/ + +namespace cv +{ + +static BinaryFuncC* getMulTab() { -#if ARITHM_USE_IPP - CV_IPP_CHECK() + static BinaryFuncC mulTab[] = { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u, + (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f, + (BinaryFuncC)cv::hal::mul64f, 0 + }; + + return mulTab; } -static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +static BinaryFuncC* getDivTab() { -#if ARITHM_USE_IPP - CV_IPP_CHECK() + static BinaryFuncC divTab[] = { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u, + (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f, + (BinaryFuncC)cv::hal::div64f, 0 + }; - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == 
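// Illustrative sketch, not from the patch: the scalar tails above lean on one bit trick --
// (a > b) is 0 or 1 in C++, so -(a > b) is 0x00 or 0xFF once truncated to uchar, and
// XOR-ing with m (0 for CMP_GT/CMP_EQ, 255 for CMP_LE/CMP_NE) yields the complementary
// predicate without a branch.  A quick check of that identity:
#include <cassert>

static unsigned char cmp_mask(int a, int b, int m)
{
    return (unsigned char)(-(a > b) ^ m);
}

int main()
{
    assert(cmp_mask(3, 2,   0) == 255);   // GT holds
    assert(cmp_mask(2, 3,   0) ==   0);   // GT fails
    assert(cmp_mask(3, 2, 255) ==   0);   // LE is the complement
    assert(cmp_mask(2, 3, 255) == 255);
    return 0;
}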
CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + return divTab; +} - if( code == CMP_GT || code == CMP_LE ) +static BinaryFuncC* getRecipTab() +{ + static BinaryFuncC recipTab[] = { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; - #if CV_SSE2 - if( USE_SSE2) - { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width-8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); + (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u, + (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f, + (BinaryFuncC)cv::hal::recip64f, 0 + }; - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + return recipTab; +} - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); +} - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); +void cv::multiply(InputArray src1, InputArray src2, + OutputArray dst, double scale, int dtype) +{ + arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), + true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE); +} - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif +void cv::divide(InputArray src1, InputArray src2, + OutputArray dst, double scale, int dtype) +{ + arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); +} - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) - { - __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width - 8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); +void cv::divide(double scale, InputArray src2, + OutputArray dst, int dtype) +{ + arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); +} - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); +/****************************************************************************************\ +* addWeighted * +\****************************************************************************************/ - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); +namespace cv +{ - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); +static BinaryFuncC* getAddWeightedTab() +{ + static BinaryFuncC addWeightedTab[] = + { + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f, + (BinaryFuncC)cv::hal::addWeighted64f, 0 + }; - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } + return addWeightedTab; } -static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); } -static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::addWeighted( InputArray src1, double alpha, InputArray src2, + double beta, double gamma, OutputArray dst, int dtype ) { -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + double scalars[] = {alpha, beta, gamma}; + arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); } -static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) + +/****************************************************************************************\ +* compare * 
+\****************************************************************************************/ + +namespace cv { - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} -static BinaryFunc getCmpFunc(int depth) +static BinaryFuncC getCmpFunc(int depth) { - static BinaryFunc cmpTab[] = + static BinaryFuncC cmpTab[] = { - (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s), - (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s), - (BinaryFunc)GET_OPTIMIZED(cmp32s), - (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32f), (BinaryFuncC)cv::hal::cmp64f, 0 }; @@ -5020,7 +1230,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) _dst.create(src1.size(), CV_8UC(cn)); Mat dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op); + getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, &op); return; } @@ -5032,7 +1242,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t esz = src1.elemSize(); size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; - BinaryFunc func = getCmpFunc(depth1); + BinaryFuncC func = getCmpFunc(depth1); if( !haveScalar ) { @@ -5043,7 +1253,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t total = it.size; for( size_t i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op ); + func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, (int)total, 1, &op ); } else { @@ -5095,7 +1305,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op); + func( ptrs[0], 0, buf, 0, ptrs[1], 0, bsz, 1, &op); ptrs[0] += bsz*esz; ptrs[1] += bsz; } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index fbbea5e1b169374ea2a00fe5fa1b9157ebf593ed..6c693a43a07b19bbbead6abf287acb9a38ab0249 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -42,6 +42,7 @@ //M*/ #include "precomp.hpp" + #include "opencl_kernels_core.hpp" #ifdef __APPLE__ @@ -49,776 +50,37 @@ #define CV_NEON 0 #endif -namespace cv -{ /****************************************************************************************\ * split & merge * \****************************************************************************************/ -#if CV_NEON -template struct VSplit2; -template struct VSplit3; -template struct VSplit4; - -#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, \ - data_type* dst1) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - } \ - } - -#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2) const \ - { \ - reg_type r = 
load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - } \ - } - -#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - store_func(dst3, r.val[3]); \ - } \ - } - -SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); - -SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); - -SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); - -#elif CV_SSE2 - -template -struct VSplit2 -{ - VSplit2() : support(false) { } - void operator()(const T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit3 -{ - VSplit3() : support(false) { } - void operator()(const T *, T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit4 -{ - VSplit4() : support(false) { } - void operator()(const T *, T *, T *, T *, T *) const { } - - bool support; -}; - -#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit2() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - } \ - \ - bool support; \ -} - -#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit3() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1, data_type * dst2) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src 
+ ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - } \ - \ - bool support; \ -} - -#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit4() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ - data_type * dst2, data_type * dst3) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ - reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ - } \ - \ - bool support; \ -} - -SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -#endif - -template static void -split_( const T* src, T** dst, int len, int cn ) -{ - int k = cn % 4 ? 
cn % 4 : 4; - int i, j; - if( k == 1 ) - { - T* dst0 = dst[0]; - - if(cn == 1) - { - memcpy(dst0, src, len * sizeof(T)); - } - else - { - for( i = 0, j = 0 ; i < len; i++, j += cn ) - dst0[i] = src[j]; - } - } - else if( k == 2 ) - { - T *dst0 = dst[0], *dst1 = dst[1]; - i = j = 0; - -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } -#elif CV_SSE2 - if (cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - } - } - else if( k == 3 ) - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; - i = j = 0; - -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } -#elif CV_SSE2 - if (cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - dst2[i] = src[j+2]; - } - } - else - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; - i = j = 0; - -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } -#elif CV_SSE2 - if (cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } - - for( ; k < cn; k += 4 ) - { - T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } -} - - -#if CV_NEON -template struct VMerge2; -template struct VMerge3; -template struct VMerge4; - -#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - store_func(dst, r); \ - } \ - } - -#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - store_func(dst, r); \ - } \ - } - -#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, const data_type* src3, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = 
load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - r.val[3] = load_func(src3); \ - store_func(dst, r); \ - } \ - } - -MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); - -MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); - -MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); - -#elif CV_SSE2 - -template -struct VMerge2 -{ - VMerge2() : support(false) { } - void operator()(const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge3 -{ - VMerge3() : support(false) { } - void operator()(const T *, const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge4 -{ - VMerge4() : support(false) { } - void operator()(const T *, const T *, const T *, const T *, T *) const { } - - bool support; -}; - -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge2() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - } \ - \ - bool support; \ -} - -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge3() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type 
*)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - } \ - \ - bool support; \ -} - -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge4() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - const data_type * src2, const data_type * src3, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ - reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ - } \ - \ - bool support; \ -} - -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); - -#if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -#endif - -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); - -#endif - -template static void -merge_( const T** src, T* dst, int len, int cn ) -{ - int k = cn % 4 ? cn % 4 : 4; - int i, j; - if( k == 1 ) - { - const T* src0 = src[0]; - for( i = j = 0; i < len; i++, j += cn ) - dst[j] = src0[i]; - } - else if( k == 2 ) - { - const T *src0 = src[0], *src1 = src[1]; - i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - } - } - else if( k == 3 ) - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; - i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - dst[j+2] = src2[i]; - } - } - else - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; - i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } - - for( ; k < cn; k += 4 ) - { - const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } -} - -static void split8u(const uchar* src, uchar** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split16u(const ushort* src, ushort** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split32s(const int* src, int** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split64s(const int64* src, int64** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void merge8u(const uchar** src, uchar* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge16u(const ushort** src, ushort* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge32s(const int** src, int* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge64s(const int64** src, int64* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn); -typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); static SplitFunc getSplitFunc(int depth) { static SplitFunc splitTab[] = { - (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u), - (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0 + (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), 
(SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 }; return splitTab[depth]; } +typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); + static MergeFunc getMergeFunc(int depth) { static MergeFunc mergeTab[] = { - (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u), - (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0 + (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0 }; return mergeTab[depth]; } -} - void cv::split(const Mat& src, Mat* mv) { int k, depth = src.depth(), cn = src.channels(); diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6d19744820826569589bbd87783584d64dadb485..d1f2ec22e19c6e502f307b0b705da5183a464222 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -83,6 +83,11 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void*); +typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void*); + BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz); @@ -114,46 +119,6 @@ extern const uchar g_Saturate8u[]; void deleteThreadAllocData(); #endif -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale ) { int64 sz = (int64)cols * rows * widthScale; @@ -201,11 +166,6 @@ struct NoVec size_t operator()(const void*, const void*, void*, size_t) const { return 0; } }; -extern volatile bool USE_SSE2; -extern volatile bool USE_SSE4_2; -extern volatile bool USE_AVX; -extern volatile bool USE_AVX2; - enum { BLOCK_SIZE = 1024 }; #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index dbe35ebfa40a21c21fc0305ac1b200d50d48099c..ba2c9d536f50c46736a0a9ca59a8762ea85b4239 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -86,45 +86,6 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex(); #undef max #undef abs #include -#if defined _MSC_VER - #if _MSC_VER >= 1400 - #include - #elif defined _M_IX86 - static void 
__cpuid(int* cpuid_data, int) - { - __asm - { - push ebx - push edi - mov edi, cpuid_data - mov eax, 1 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - pop ebx - } - } - static void __cpuidex(int* cpuid_data, int, int) - { - __asm - { - push edi - mov edi, cpuid_data - mov eax, 7 - mov ecx, 0 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - } - } - #endif -#endif #ifdef WINRT #include @@ -237,160 +198,15 @@ void Exception::formatMessage() msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str()); } -struct HWFeatures -{ - enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; - - HWFeatures(void) - { - memset( have, 0, sizeof(have) ); - x86_family = 0; - } - - static HWFeatures initialize(void) - { - HWFeatures f; - int cpuid_data[4] = { 0, 0, 0, 0 }; - - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuid(cpuid_data, 1); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $1, %%eax\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $1,%%eax\n\t" - "cpuid\n\t" - "popl %%ebx\n\t" - : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) - : - : "cc" - ); - #endif - #endif - - f.x86_family = (cpuid_data[0] >> 8) & 15; - if( f.x86_family >= 6 ) - { - f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; - f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; - f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; - f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; - f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; - f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; - f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; - f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; - f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; - f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX - - // make the second call to the cpuid command in order to get - // information about extended features like AVX2 - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuidex(cpuid_data, 7, 0); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $7, %%eax\n\t" - "movl $0, %%ecx\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $7,%%eax\n\t" - "movl $0,%%ecx\n\t" - "cpuid\n\t" - "movl %%ebx, %0\n\t" - "popl %%ebx\n\t" - : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) - : - : "cc" - ); - #endif - #endif - f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; - - f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; - f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; - f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; - f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; - f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; - f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; - f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; - f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; - f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; - } - - #if defined ANDROID || defined __linux__ 
- #ifdef __aarch64__ - f.have[CV_CPU_NEON] = true; - #else - int cpufile = open("/proc/self/auxv", O_RDONLY); - - if (cpufile >= 0) - { - Elf32_auxv_t auxv; - const size_t size_auxv_t = sizeof(auxv); - - while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) - { - if (auxv.a_type == AT_HWCAP) - { - f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; - break; - } - } - - close(cpufile); - } - #endif - #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) - f.have[CV_CPU_NEON] = true; - #endif - - return f; - } - - int x86_family; - bool have[MAX_FEATURE+1]; -}; - -static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); -static HWFeatures* currentFeatures = &featuresEnabled; - bool checkHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } - -volatile bool useOptimizedFlag = true; - -volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; -volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; -volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; -volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2]; - void setUseOptimized( bool flag ) { - useOptimizedFlag = flag; - currentFeatures = flag ? &featuresEnabled : &featuresDisabled; - USE_SSE2 = currentFeatures->have[CV_CPU_SSE2]; + cv::hal::setUseOptimized(flag); ipp::setUseIPP(flag); #ifdef HAVE_OPENCL @@ -403,7 +219,7 @@ void setUseOptimized( bool flag ) bool useOptimized(void) { - return useOptimizedFlag; + return cv::hal::useOptimized(); } int64 getTickCount(void) @@ -683,12 +499,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata) CV_IMPL int cvCheckHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return cv::currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } CV_IMPL int cvUseOptimized( int flag ) { - int prevMode = cv::useOptimizedFlag; + int prevMode = cv::useOptimized(); cv::setUseOptimized( flag != 0 ); return prevMode; } diff --git a/modules/hal/CMakeLists.txt b/modules/hal/CMakeLists.txt index b04e96b9e7f92feb9b0e67025c30bf1582048cf3..982913dba7aab638abfecfabd23e728bee5d6f24 100644 --- a/modules/hal/CMakeLists.txt +++ b/modules/hal/CMakeLists.txt @@ -2,10 +2,20 @@ set(the_description "The Hardware Acceleration Layer (HAL) module") set(OPENCV_MODULE_TYPE STATIC) +if(OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS) + set(OPENCV_HAL_HEADERS_INCLUDES "#include \"${OPENCV_HAL_HEADERS}\"") + set(DEPS "${OPENCV_HAL_LIBS}") +else() + set(OPENCV_HAL_HEADERS_INCLUDES "// using default HAL") + set(DEPS "") +endif() + +configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY) + if(UNIX) if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") endif() endif() -ocv_define_module(hal) +ocv_define_module(hal ${DEPS}) diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index 9d448757d282431dd3ef4a41ebc8d5fe665f84cc..125bbc824811cb4b22ddfdccf3aca8d400e2228c 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -46,6 +46,7 @@ #define __OPENCV_HAL_HPP__ #include "opencv2/hal/defs.h" +#include "opencv2/hal/interface.hpp" /** @defgroup hal Hardware Acceleration Layer @@ -58,22 +59,19 @@ @} */ - namespace cv { namespace 
hal { //! @addtogroup hal //! @{ -namespace Error { - -enum +class Failure { - Ok = 0, - Unknown = -1 +public: + Failure(int code_ = Error::Unknown) : code(code_) {} +public: + int code; }; -} - int normHamming(const uchar* a, int n); int normHamming(const uchar* a, const uchar* b, int n); @@ -104,8 +102,186 @@ void sqrt(const double* src, double* dst, int len); void invSqrt(const float* src, float* dst, int len); void invSqrt(const double* src, double* dst, int len); +void split8u(const uchar* src, uchar** dst, int len, int cn ); +void split16u(const ushort* src, ushort** dst, int len, int cn ); +void split32s(const int* src, int** dst, int len, int cn ); +void split64s(const int64* src, int64** dst, int len, int cn ); + +void merge8u(const uchar** src, uchar* dst, int len, int cn ); +void merge16u(const ushort** src, ushort* dst, int len, int cn ); +void merge32s(const int** src, int* dst, int len, int cn ); +void merge64s(const int64** src, int64* dst, int len, int cn ); + +void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t 
step, int width, int height, void* ); +void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32f(const float* src1, size_t 
step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars ); +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars ); 
+void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars ); +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars ); +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars ); //! @} }} //cv::hal +namespace cv { + +template struct OpAdd +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } +}; + +template struct OpSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } +}; + +template struct OpRSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } +}; + +template struct OpMin +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct OpMax +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + +template struct OpAbsDiff +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()(T a, T b) const { return a > b ? a - b : b - a; } +}; + +template struct OpAnd +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a & b; } +}; + +template struct OpOr +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a | b; } +}; + +template struct OpXor +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a ^ b; } +}; + +template struct OpNot +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T ) const { return ~a; } +}; + +} + #endif //__OPENCV_HAL_HPP__ diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index d04f003879c290fb2361c01fba1f73825bce6a00..117ec6046921195b2f24bad86e8daecedf8d12a3 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -53,6 +53,7 @@ #endif #include +#include "opencv2/hal/interface.hpp" #if defined __ICL # define CV_ICC __ICL @@ -117,9 +118,38 @@ #define CV_CPU_NEON 100 -// when adding to this list remember to update the enum in core/utility.cpp +// when adding to this list remember to update the following enum #define CV_HARDWARE_MAX_FEATURE 255 +/** @brief Available CPU features. 
+*/ +enum CpuFeatures { + CPU_MMX = 1, + CPU_SSE = 2, + CPU_SSE2 = 3, + CPU_SSE3 = 4, + CPU_SSSE3 = 5, + CPU_SSE4_1 = 6, + CPU_SSE4_2 = 7, + CPU_POPCNT = 8, + + CPU_AVX = 10, + CPU_AVX2 = 11, + CPU_FMA3 = 12, + + CPU_AVX_512F = 13, + CPU_AVX_512BW = 14, + CPU_AVX_512CD = 15, + CPU_AVX_512DQ = 16, + CPU_AVX_512ER = 17, + CPU_AVX_512IFMA512 = 18, + CPU_AVX_512PF = 19, + CPU_AVX_512VBMI = 20, + CPU_AVX_512VL = 21, + + CPU_NEON = 100 +}; + // do not include SSE/AVX/NEON headers for NVCC compiler #ifndef __CUDACC__ @@ -257,49 +287,6 @@ # define CV_VFP 0 #endif -/* primitive types */ -/* - schar - signed 1 byte integer - uchar - unsigned 1 byte integer - short - signed 2 byte integer - ushort - unsigned 2 byte integer - int - signed 4 byte integer - uint - unsigned 4 byte integer - int64 - signed 8 byte integer - uint64 - unsigned 8 byte integer -*/ - -#if !defined _MSC_VER && !defined __BORLANDC__ -# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__ -# include - typedef std::uint32_t uint; -# else -# include - typedef uint32_t uint; -# endif -#else - typedef unsigned uint; -#endif - -typedef signed char schar; - -#ifndef __IPL_H__ - typedef unsigned char uchar; - typedef unsigned short ushort; -#endif - -#if defined _MSC_VER || defined __BORLANDC__ - typedef __int64 int64; - typedef unsigned __int64 uint64; -# define CV_BIG_INT(n) n##I64 -# define CV_BIG_UINT(n) n##UI64 -#else - typedef int64_t int64; - typedef uint64_t uint64; -# define CV_BIG_INT(n) n##LL -# define CV_BIG_UINT(n) n##ULL -#endif - /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 #define CV_2PI 6.283185307179586476925286766559 @@ -321,6 +308,19 @@ typedef union Cv64suf } Cv64suf; +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature); +void setUseOptimized(bool onoff); +bool useOptimized(); + +}} + +#define USE_SSE2 (cv::hal::checkHardwareSupport(CV_CPU_SSE)) +#define USE_SSE4_2 (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2)) +#define USE_AVX (cv::hal::checkHardwareSupport(CV_CPU_AVX)) +#define USE_AVX2 (cv::hal::checkHardwareSupport(CV_CPU_AVX2)) + /****************************************************************************************\ * fast math * diff --git a/modules/hal/include/opencv2/hal/interface.hpp b/modules/hal/include/opencv2/hal/interface.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a5bff04d7e5eb86f5df94c97ffa83e5eab02821 --- /dev/null +++ b/modules/hal/include/opencv2/hal/interface.hpp @@ -0,0 +1,91 @@ +#ifndef _HAL_INTERFACE_HPP_INCLUDED_ +#define _HAL_INTERFACE_HPP_INCLUDED_ + +#define CV_HAL_ERROR_OK 0 +#define CV_HAL_ERROR_NI 1 +#define CV_HAL_ERROR_UNKNOWN -1 + +#define CV_HAL_CMP_EQ 0 +#define CV_HAL_CMP_GT 1 +#define CV_HAL_CMP_GE 2 +#define CV_HAL_CMP_LT 3 +#define CV_HAL_CMP_LE 4 +#define CV_HAL_CMP_NE 5 + +#ifdef __cplusplus +namespace cv { namespace hal { + +namespace Error { + +enum +{ + Ok = 0, + NotImplemented = 1, + Unknown = -1 +}; + +} + +enum +{ + CMP_EQ = 0, + CMP_GT = 1, + CMP_GE = 2, + CMP_LT = 3, + CMP_LE = 4, + CMP_NE = 5 +}; + +}} +#endif + +#ifdef __cplusplus +#include +#else +#include +#endif + +/* primitive types */ +/* + schar - signed 1 byte integer + uchar - unsigned 1 byte integer + short - signed 2 byte integer + ushort - unsigned 2 byte integer + int - signed 4 byte integer + uint - unsigned 4 byte integer + int64 - signed 8 byte integer + uint64 - unsigned 8 byte integer +*/ + +#if !defined _MSC_VER && !defined __BORLANDC__ +# if defined __cplusplus && __cplusplus >= 201103L && 
!defined __APPLE__ +# include + typedef std::uint32_t uint; +# else +# include + typedef uint32_t uint; +# endif +#else + typedef unsigned uint; +#endif + +typedef signed char schar; + +#ifndef __IPL_H__ + typedef unsigned char uchar; + typedef unsigned short ushort; +#endif + +#if defined _MSC_VER || defined __BORLANDC__ + typedef __int64 int64; + typedef unsigned __int64 uint64; +# define CV_BIG_INT(n) n##I64 +# define CV_BIG_UINT(n) n##UI64 +#else + typedef int64_t int64; + typedef uint64_t uint64; +# define CV_BIG_INT(n) n##LL +# define CV_BIG_UINT(n) n##ULL +#endif + +#endif diff --git a/modules/hal/include/opencv2/hal/neon_utils.hpp b/modules/hal/include/opencv2/hal/neon_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6026777a6f8591891010335cde349ac4117df493 --- /dev/null +++ b/modules/hal/include/opencv2/hal/neon_utils.hpp @@ -0,0 +1,127 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_NEON_UTILS_HPP__ +#define __OPENCV_HAL_NEON_UTILS_HPP__ + +#include "opencv2/hal/defs.h" + +namespace cv { + +#if CV_NEON + +inline int32x2_t cv_vrnd_s32_f32(float32x2_t v) +{ + static int32x2_t v_sign = vdup_n_s32(1 << 31), + v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f)); + + int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v))); + return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition))); +} + +inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) +{ + static int32x4_t v_sign = vdupq_n_s32(1 << 31), + v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + + int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); + return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition))); +} + +inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v) +{ + static float32x2_t v_05 = vdup_n_f32(0.5f); + return vcvt_u32_f32(vadd_f32(v, v_05)); +} + +inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) +{ + static float32x4_t v_05 = vdupq_n_f32(0.5f); + return vcvtq_u32_f32(vaddq_f32(v, v_05)); +} + +inline float32x4_t cv_vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x2_t cv_vrecp_f32(float32x2_t val) +{ + float32x2_t reciprocal = vrecpe_f32(val); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x4_t cv_vrsqrtq_f32(float32x4_t val) +{ + float32x4_t e = vrsqrteq_f32(val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + return e; +} + +inline float32x2_t cv_vrsqrt_f32(float32x2_t val) +{ + float32x2_t e = vrsqrte_f32(val); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + return e; +} + +inline float32x4_t cv_vsqrtq_f32(float32x4_t val) +{ + return cv_vrecpq_f32(cv_vrsqrtq_f32(val)); +} + +inline float32x2_t cv_vsqrt_f32(float32x2_t val) +{ + return cv_vrecp_f32(cv_vrsqrt_f32(val)); +} + +#endif + +} + +#endif // __OPENCV_HAL_NEON_UTILS_HPP__ diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/hal/include/opencv2/hal/sse_utils.hpp similarity index 99% rename from modules/core/include/opencv2/core/sse_utils.hpp rename to modules/hal/include/opencv2/hal/sse_utils.hpp index e0283eb3f33d911e782e86dd3044b93d730ae4e7..9ce4098bad6f49473bcdc886c881d5fe6d7e895f 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/hal/include/opencv2/hal/sse_utils.hpp @@ -46,6 +46,8 @@ # error sse_utils.hpp header must be compiled as C++ #endif +#include "opencv2/hal/defs.h" + #if CV_SSE2 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) diff --git a/modules/hal/samples/simple_hal/CMakeLists.txt b/modules/hal/samples/simple_hal/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0be70f2fa7a334916a3a83edbd3c92015689a2 --- /dev/null +++ b/modules/hal/samples/simple_hal/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR) + +if(UNIX) + if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + endif() +endif() + +add_library(simple_hal simple.cpp) +set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") 
+target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include) diff --git a/modules/hal/samples/simple_hal/simple.cpp b/modules/hal/samples/simple_hal/simple.cpp new file mode 100644 index 0000000000000000000000000000000000000000..564a611a5a1f0b07a01baf94baf752ca5309ad28 --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.cpp @@ -0,0 +1,33 @@ +#include "simple.hpp" + +int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] & src2[x]; + return cv::hal::Error::Ok; +} + +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] | src2[x]; + return cv::hal::Error::Ok; +} + +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] ^ src2[x]; + return cv::hal::Error::Ok; +} + +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = ~src1[x]; + return cv::hal::Error::Ok; +} diff --git a/modules/hal/samples/simple_hal/simple.hpp b/modules/hal/samples/simple_hal/simple.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85a16535de798bc31ad3985319186363ef15af84 --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.hpp @@ -0,0 +1,20 @@ +#ifndef _SIMPLE_HPP_INCLUDED_ +#define _SIMPLE_HPP_INCLUDED_ + +#include "opencv2/hal/interface.hpp" + +int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); + +#undef hal_and8u +#define hal_and8u slow_and8u +#undef hal_or8u +#define hal_or8u slow_or8u +#undef hal_xor8u +#define hal_xor8u slow_xor8u +#undef hal_not8u +#define hal_not8u slow_not8u + +#endif diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp index a3f69facca087baa4149132fc7e457a2e1833847..e30cd7d9e5e759d887dba4cbce6a6dfe0cea3ea8 100644 --- a/modules/hal/src/arithm.cpp +++ b/modules/hal/src/arithm.cpp @@ -7,11 +7,13 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -41,7 +43,1089 @@
 //M*/
 
 #include "precomp.hpp"
+#include "arithm_simd.hpp"
+#include "arithm_core.hpp"
+#include "replacement.hpp"
 
 namespace cv { namespace hal {
 
-}}
+//=======================================
+
+#undef CALL_HAL
+#define CALL_HAL(fun) \
+    int res = fun(src1, step1, src2, step2, dst, step, width, height); \
+    if (res == Error::Ok) \
+        return; \
+    else if (res != Error::NotImplemented) \
+        throw Failure(res);
+
+#if (ARITHM_USE_IPP == 1)
+static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
+{
+    if( height == 1 )
+        step1 = step2 = step = width*elemSize;
+}
+#define CALL_IPP_BIN_12(fun) \
+    CV_IPP_CHECK() \
+    { \
+        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
+        if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
+        { \
+            CV_IMPL_ADD(CV_IMPL_IPP); \
+            return; \
+        } \
+        setIppErrorStatus(); \
+    }
+#else
+#define CALL_IPP_BIN_12(fun)
+#endif
+
+//=======================================
+// Add
+//=======================================
+
+void add8u( const uchar* src1, size_t step1,
+            const uchar* src2, size_t step2,
+            uchar* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add8u)
+    CALL_IPP_BIN_12(ippiAdd_8u_C1RSfs)
+    (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add8s( const schar* src1, size_t step1,
+            const schar* src2, size_t step2,
+            schar* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add8s)
+    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+void add16u( const ushort* src1, size_t step1,
+             const ushort* src2, size_t step2,
+             ushort* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add16u)
+    CALL_IPP_BIN_12(ippiAdd_16u_C1RSfs)
+    (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add16s( const short* src1, size_t step1,
+             const short* src2, size_t step2,
+             short* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add16s)
+    CALL_IPP_BIN_12(ippiAdd_16s_C1RSfs)
+    (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add32s( const int* src1, size_t step1,
+             const int* src2, size_t step2,
+             int* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add32s)
+    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+void add32f( const float* src1, size_t step1,
+             const float* src2, size_t step2,
+             float* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add32f)
+    CALL_IPP_BIN_12(ippiAdd_32f_C1R)
+    (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add64f( const double* src1, size_t step1,
+             const double* src2, size_t step2,
+             double* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add64f)
+    vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+//=======================================
+
+#if (ARITHM_USE_IPP == 1)
+#define CALL_IPP_BIN_21(fun) \
+    CV_IPP_CHECK() \
+    { \
+        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
+        if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
+        { \
+            CV_IMPL_ADD(CV_IMPL_IPP); \
+            return; \
+        } \
+        setIppErrorStatus(); \
+    }
+#else
+#define CALL_IPP_BIN_21(fun)
+#endif
+
+//=======================================
+// Subtract
+//======================================= + +void sub8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8u) + CALL_IPP_BIN_21(ippiSub_8u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8s) + vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16u) + CALL_IPP_BIN_21(ippiSub_16u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16s) + CALL_IPP_BIN_21(ippiSub_16s_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32s) + vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32f) + CALL_IPP_BIN_21(ippiSub_32f_C1R) + (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub64f) + vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= + +#if (ARITHM_USE_IPP == 1) +#define CALL_IPP_MIN_MAX(fun, type) \ + CV_IPP_CHECK() \ + { \ + type* s1 = (type*)src1; \ + type* s2 = (type*)src2; \ + type* d = dst; \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > fun(s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } +#else +#define CALL_IPP_MIN_MAX(fun, type) +#endif + +//======================================= +// Max +//======================================= + +void max8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8u) + CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16u) + CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16s( const short* src1, 
size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32s) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32f) + CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max64f) + CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) + vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Min +//======================================= + +void min8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8u) + CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min16u) + CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min16s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32s) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32f) + CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min64f) + CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) + vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// AbsDiff +//======================================= + +void absdiff8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8u) + CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff8s( const schar* src1, size_t step1, + const schar* src2, size_t 
step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16u) + CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32s) + vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32f) + CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) + (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff64f) + vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Logical +//======================================= + +void and8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_and8u) + CALL_IPP_BIN_12(ippiAnd_8u_C1R) + (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void or8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_or8u) + CALL_IPP_BIN_12(ippiOr_8u_C1R) + (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void xor8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_xor8u) + CALL_IPP_BIN_12(ippiXor_8u_C1R) + (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void not8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_not8u) + CALL_IPP_BIN_12(ippiNot_8u_C1R) + (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if ARITHM_USE_IPP +inline static IppCmpOp convert_cmp(int _cmpop) +{ + return _cmpop == CMP_EQ ? ippCmpEq : + _cmpop == CMP_GT ? ippCmpGreater : + _cmpop == CMP_GE ? ippCmpGreaterEq : + _cmpop == CMP_LT ? ippCmpLess : + _cmpop == CMP_LE ? 
ippCmpLessEq : + (IppCmpOp)-1; +} +#define CALL_IPP_CMP(fun) \ + CV_IPP_CHECK() \ + { \ + IppCmpOp op = convert_cmp(*(int *)_cmpop); \ + if( op >= 0 ) \ + { \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_CMP(fun) +#endif + +//======================================= +// Compare +//======================================= + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8u) + CALL_IPP_CMP(ippiCompare_8u_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); + __m128i c128 = _mm_set1_epi8 (-128); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + // no simd for 8u comparison, that's why we need the trick + r00 = _mm_sub_epi8(r00,c128); + r10 = _mm_sub_epi8(r10,c128); + + r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x),r00); + + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x), r00); + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16u) + CALL_IPP_CMP(ippiCompare_16u_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16s) + CALL_IPP_CMP(ippiCompare_16s_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width-8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width - 8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32f) + CALL_IPP_CMP(ippiCompare_32f_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp64f) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if defined HAVE_IPP +#define CALL_IPP_MUL(fun) \ + CV_IPP_CHECK() \ + { \ + if (std::fabs(fscale - 1) <= FLT_EPSILON) \ + { \ + if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_MUL(fun) +#endif + +//======================================= +// Multilpy +//======================================= + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_8u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8s) + mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); +} + +void mul16u( const ushort* src1, size_t step1, const ushort* src2, 
size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16s) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16s_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32s) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32f) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_32f_C1R) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul64f) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Divide +//======================================= + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8u) + if( src1 ) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); + else + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16u) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div64f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Reciprocial +//======================================= + +void recip8u( const uchar* 
src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip64f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +//======================================= +// Add weighted +//======================================= + +void +addWeighted8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* scalars ) +{ + CALL_HAL(hal_addWeighted8u) + const double* scalars_ = (const double*)scalars; + float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + +#if CV_SSE2 + if( USE_SSE2 ) + { + __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); + __m128i z = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); + __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); + + __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); + __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); + __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); + __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); + + u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); + u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); + u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); + + u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); + u = _mm_packus_epi16(u, u); + + _mm_storel_epi64((__m128i*)(dst + x), u); + } + } +#elif CV_NEON + float32x4_t g = 
vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint8x8_t in1 = vld1_u8(src1+x); + uint16x8_t in1_16 = vmovl_u8(in1); + float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); + + uint8x8_t in2 = vld1_u8(src2+x); + uint16x8_t in2_16 = vmovl_u8(in2); + float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); + uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); + + uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); + uint8x8_t out = vqmovn_u16(out_16); + + vst1_u8(dst+x, out); + } +#endif + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + float t0, t1; + t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; + + dst[x] = saturate_cast(t0); + dst[x+1] = saturate_cast(t1); + + t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; + t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; + + dst[x+2] = saturate_cast(t0); + dst[x+3] = saturate_cast(t1); + } + #endif + + for( ; x < width; x++ ) + { + float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + dst[x] = saturate_cast(t0); + } + } +} + +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted8s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16u) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted64f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +}} // cv::hal:: diff --git a/modules/hal/src/arithm_core.hpp b/modules/hal/src/arithm_core.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a65e74c3812623111318fedd7c4f46a99e6dbc65 --- /dev/null +++ b/modules/hal/src/arithm_core.hpp @@ -0,0 
+1,657 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__ +#define __OPENCV_HAL_ARITHM_CORE_HPP__ + +#include "arithm_simd.hpp" + +const uchar g_Saturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + + +#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), 
g_Saturate8u[(t)+256]) +#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) +#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) + +const float g_8x32fTab[] = +{ + -128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f, + -120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f, + -112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f, + -104.f, -103.f, -102.f, -101.f, -100.f, -99.f, -98.f, -97.f, + -96.f, -95.f, -94.f, -93.f, -92.f, -91.f, -90.f, -89.f, + -88.f, -87.f, -86.f, -85.f, -84.f, -83.f, -82.f, -81.f, + -80.f, -79.f, -78.f, -77.f, -76.f, -75.f, -74.f, -73.f, + -72.f, -71.f, -70.f, -69.f, -68.f, -67.f, -66.f, -65.f, + -64.f, -63.f, -62.f, -61.f, -60.f, -59.f, -58.f, -57.f, + -56.f, -55.f, -54.f, -53.f, -52.f, -51.f, -50.f, -49.f, + -48.f, -47.f, -46.f, -45.f, -44.f, -43.f, -42.f, -41.f, + -40.f, -39.f, -38.f, -37.f, -36.f, -35.f, -34.f, -33.f, + -32.f, -31.f, -30.f, -29.f, -28.f, -27.f, -26.f, -25.f, + -24.f, -23.f, -22.f, -21.f, -20.f, -19.f, -18.f, -17.f, + -16.f, -15.f, -14.f, -13.f, -12.f, -11.f, -10.f, -9.f, + -8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, + 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, + 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 70.f, 71.f, + 72.f, 73.f, 74.f, 75.f, 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, 92.f, 93.f, 94.f, 95.f, + 96.f, 97.f, 98.f, 99.f, 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f, 108.f, 109.f, 110.f, 111.f, + 112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f, + 120.f, 121.f, 122.f, 123.f, 124.f, 125.f, 126.f, 127.f, + 128.f, 129.f, 130.f, 131.f, 132.f, 133.f, 134.f, 135.f, + 136.f, 137.f, 138.f, 139.f, 140.f, 141.f, 142.f, 143.f, + 144.f, 145.f, 146.f, 147.f, 148.f, 149.f, 150.f, 151.f, + 152.f, 153.f, 154.f, 155.f, 156.f, 157.f, 158.f, 159.f, + 160.f, 161.f, 162.f, 163.f, 164.f, 165.f, 166.f, 167.f, + 168.f, 169.f, 170.f, 171.f, 172.f, 173.f, 174.f, 175.f, + 176.f, 177.f, 178.f, 179.f, 180.f, 181.f, 182.f, 183.f, + 184.f, 185.f, 186.f, 187.f, 188.f, 189.f, 190.f, 191.f, + 192.f, 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 199.f, + 200.f, 201.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, + 208.f, 209.f, 210.f, 211.f, 212.f, 213.f, 214.f, 215.f, + 216.f, 217.f, 218.f, 219.f, 220.f, 221.f, 222.f, 223.f, + 224.f, 225.f, 226.f, 227.f, 228.f, 229.f, 230.f, 231.f, + 232.f, 233.f, 234.f, 235.f, 236.f, 237.f, 238.f, 239.f, + 240.f, 241.f, 242.f, 243.f, 244.f, 245.f, 246.f, 247.f, + 248.f, 249.f, 250.f, 251.f, 252.f, 253.f, 254.f, 255.f +}; + +#define CV_8TO32F(x) g_8x32fTab[(x)+128] + +namespace cv { + +template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a + b); } + +template<> inline uchar OpSub::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a - b); } + +template<> inline short OpAbsDiff::operator ()(short a, short b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } + +template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { 
return CV_MAX_8U(a, b); } + +} + +namespace cv { namespace hal { + +template +void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + VOp vop; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = vop(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); + r0 = vop(r0, VLoadStore128::load(src2 + x )); + r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 16/sizeof(T), r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_AVX2 + // nothing +#elif CV_SSE2 + if( USE_SSE2 ) + { + for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) + { + typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); + r = vop(r, VLoadStore64::load(src2 + x)); + VLoadStore64::store(dst + x, r); + } + } +#endif + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template +void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + Op32 op32; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 4, r1); + } + } + } +#endif // CV_AVX2 + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = op32(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128::reg_type r0 = 
VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128::load(src2 + x )); + r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 4, r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + + +template +void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 + Op64 op64; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); + r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 2, r1); + } + } + } +#endif + + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template static void +cmp_(const T* src1, size_t step1, const T* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int code) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + Cmp_SIMD vop(code); + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] > src2[x]) ^ m; + t1 = -(src1[x+1] > src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] > src2[x+2]) ^ m; + t1 = -(src1[x+3] > src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 
0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] == src2[x]) ^ m; + t1 = -(src1[x+1] == src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] == src2[x+2]) ^ m; + t1 = -(src1[x+3] == src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +template static void +mul_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, WT scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Mul_SIMD vop; + + if( scale == (WT)1. ) + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0; + T t1; + t0 = saturate_cast(src1[i ] * src2[i ]); + t1 = saturate_cast(src1[i+1] * src2[i+1]); + dst[i ] = t0; + dst[i+1] = t1; + + t0 = saturate_cast(src1[i+2] * src2[i+2]); + t1 = saturate_cast(src1[i+3] * src2[i+3]); + dst[i+2] = t0; + dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(src1[i] * src2[i]); + } + } + else + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); + T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); + dst[i] = t0; dst[i+1] = t1; + + t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); + t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); + } + } +} + + +template static void +div_i( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +div_f( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, void* _scalars ) +{ + const double* scalars = (const double*)_scalars; + WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + AddWeighted_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width, alpha, beta, gamma); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); + dst[x] = t0; dst[x+1] = t1; + + t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); + t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); + dst[x+2] = t0; dst[x+3] = t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + } +} + +}} // cv::hal:: + + +#endif // __OPENCV_HAL_ARITHM_CORE_HPP__ diff --git a/modules/hal/src/arithm_simd.hpp b/modules/hal/src/arithm_simd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4e4029875c23dd973b6df691e377705717d98664 --- /dev/null +++ b/modules/hal/src/arithm_simd.hpp @@ -0,0 +1,2025 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__ +#define __OPENCV_HAL_ARITHM_SIMD_HPP__ + +namespace cv { namespace hal { + +struct NOP {}; + +#if CV_SSE2 || CV_NEON +#define IF_SIMD(op) op +#else +#define IF_SIMD(op) NOP +#endif + + +#if CV_SSE2 || CV_NEON + +#define FUNCTOR_TEMPLATE(name) \ + template struct name {} + +FUNCTOR_TEMPLATE(VLoadStore128); +#if CV_SSE2 +FUNCTOR_TEMPLATE(VLoadStore64); +FUNCTOR_TEMPLATE(VLoadStore128Aligned); +#if CV_AVX2 +FUNCTOR_TEMPLATE(VLoadStore256); +FUNCTOR_TEMPLATE(VLoadStore256Aligned); +#endif +#endif + +#endif + +#if CV_AVX2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); 
+FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); + + +static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, + 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, + 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m256i d = _mm256_subs_epi8(a, b); + __m256i m = _mm256_cmpgt_epi8(b, a); + return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m256i M = _mm256_max_epi16(a, b); + __m256i m = _mm256_min_epi16(a, b); + return _mm256_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m256i d = _mm256_sub_epi32(a, b); + __m256i m = _mm256_cmpgt_epi32(b, a); + return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); + +#elif CV_SSE2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + 
typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); + +FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, + __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); 
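// [Editorial note, not part of the patch] SSE2 only provides _mm_min_epu8 and
// _mm_min_epi16 (and their max counterparts), so the remaining VMin/VMax
// specializations are emulated:
//   * the ushort min above uses saturating subtraction: a -sat (a -sat b) == min(a, b),
//     since (a -sat b) is a - b when a > b and 0 otherwise; the ushort max below
//     is the mirror image, (a -sat b) +sat b == max(a, b).
//   * the schar/int specializations use compare-and-blend: with m = (a > b) ? ~0 : 0,
//     a ^ ((a ^ b) & m) selects b where a > b (min), and swapping the comparison
//     operands gives max. Native epi8/epu16/epi32 min/max only arrive with SSE4.1.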
+FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, + __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); + + +static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m128i d = _mm_subs_epi8(a, b); + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_subs_epi8(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m128i M = _mm_max_epi16(a, b); + __m128i m = _mm_min_epi16(a, b); + return _mm_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m128i d = _mm_sub_epi32(a, b); + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_sub_epi32(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); +#endif + +#if CV_NEON + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p);}; \ + static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type b) const \ + { \ + return body; \ + }; \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type ) const \ + { \ + return body; \ + }; \ + } + +FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); +FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); +FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, 
vst1q_u16); +FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); +FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); +FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); +#endif + + +template +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int) + { + } + + int operator () (const T *, const T *, uchar *, int) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdupq_n_u8(255); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); + + return x; + } + + int code; + uint8x16_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || 
code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const float * src1, const float * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), 
vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +#elif CV_SSE2 + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi8(-1); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi32(0xffffffff); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; 
x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +#endif + + +template +struct Mul_SIMD +{ + int operator() (const T *, const T *, T *, int, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Mul_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + float32x4_t v_dst2 = 
vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + v_dst1 = vmulq_f32(v_dst1, v_scale); + + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + } + + return x; + } +}; + +#elif CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + else + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + } + + return x; + } + + bool haveSSE; +}; + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template +struct Div_SIMD +{ + int operator() (const T *, const T *, T *, int, double) const + { + return 0; + } +}; + +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; + + +#if CV_SIMD128 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = 
v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src1, 
const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); + + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; + + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +///////////////////////// RECIPROCAL ////////////////////// + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if 
(!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src1 + x); + v_float64x2 f1 = v_load(src1 + x + 2); + v_float64x2 f2 = v_load(src2 + x); + v_float64x2 f3 = v_load(src2 + x + 2); + + v_float64x2 res0 = f0 * v_scale / f2; + v_float64x2 res1 = f1 * v_scale / f3; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src2 + x); + v_float64x2 f1 = v_load(src2 + x + 2); + + v_float64x2 res0 = v_scale / f0; + v_float64x2 
res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +#endif + +#endif + + +template +struct AddWeighted_SIMD +{ + int operator() (const T *, const T *, T *, int, WT, WT, WT) const + { + return 0; + } +}; + +#if CV_SSE2 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + + __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1)); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); + } + + return x; + } + + bool haveSSE2; +}; + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE2; +}; + +#if CV_SSE4_1 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE4_1) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 
v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE4_1; +}; + +#endif + +#elif CV_NEON + +template <> +struct AddWeighted_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + int8x8_t in1 = vld1_s8(src1 + x); + int16x8_t in1_16 = vmovl_s8(in1); + float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); + + int8x8_t in2 = vld1_s8(src2+x); + int16x8_t in2_16 = vmovl_s8(in2); + float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); + + int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); + int8x8_t out = vqmovn_s16(out_16); + + vst1_s8(dst + x, out); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); + uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); + uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); + float32x4_t v_s2 = 
vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); + int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); + int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); + } + + return x; + } +}; + +#endif + +}} + +#endif // __OPENCV_HAL_ARITHM_SIMD_HPP__ diff --git a/modules/hal/src/hardware.cpp b/modules/hal/src/hardware.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a08b9f44af965c192174ebc06b363e1fab8a7ef --- /dev/null +++ b/modules/hal/src/hardware.cpp @@ -0,0 +1,221 @@ +#include "precomp.hpp" + +#if defined WIN32 || defined _WIN32 || defined WINCE +#include +#if defined _MSC_VER + #if _MSC_VER >= 1400 + #include + #elif defined _M_IX86 + static void __cpuid(int* cpuid_data, int) + { + __asm + { + push ebx + push edi + mov edi, cpuid_data + mov eax, 1 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + pop ebx + } + } + static void __cpuidex(int* cpuid_data, int, int) + { + __asm + { + push edi + mov edi, cpuid_data + mov eax, 7 + mov ecx, 0 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + } + } + #endif +#endif +#endif + +#if defined ANDROID || defined __linux__ +# include +# include +# include +# include +#endif + +#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__ +#include +#include +#include +#if defined ANDROID +#include +#endif +#endif + +#ifdef ANDROID +# include +#endif + +struct HWFeatures +{ + enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; + + HWFeatures(void) + { + memset( have, 0, sizeof(have) ); + x86_family = 0; + } + + static HWFeatures initialize(void) + { + HWFeatures f; + int cpuid_data[4] = { 0, 0, 0, 0 }; + + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuid(cpuid_data, 1); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $1, %%eax\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $1,%%eax\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) + : + : "cc" + ); + #endif + #endif + + f.x86_family = (cpuid_data[0] >> 8) & 15; + if( f.x86_family >= 6 ) + { + f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; + f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; + f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; + f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; + f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; + f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; + f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; + f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; + f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; + f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + + // make the second call to the cpuid command in order to get + // information about extended features like AVX2 + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuidex(cpuid_data, 7, 0); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + 
#ifdef __x86_64__ + asm __volatile__ + ( + "movl $7, %%eax\n\t" + "movl $0, %%ecx\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $7,%%eax\n\t" + "movl $0,%%ecx\n\t" + "cpuid\n\t" + "movl %%ebx, %0\n\t" + "popl %%ebx\n\t" + : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) + : + : "cc" + ); + #endif + #endif + f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + + f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; + f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; + f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; + f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; + f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; + f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; + f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; + f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; + f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; + } + + #if defined ANDROID || defined __linux__ + #ifdef __aarch64__ + f.have[CV_CPU_NEON] = true; + #else + int cpufile = open("/proc/self/auxv", O_RDONLY); + + if (cpufile >= 0) + { + Elf32_auxv_t auxv; + const size_t size_auxv_t = sizeof(auxv); + + while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) + { + if (auxv.a_type == AT_HWCAP) + { + f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; + break; + } + } + + close(cpufile); + } + #endif + #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) + f.have[CV_CPU_NEON] = true; + #endif + + return f; + } + + int x86_family; + bool have[MAX_FEATURE+1]; +}; + +static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); +static HWFeatures* currentFeatures = &featuresEnabled; +volatile bool useOptimizedFlag = true; + +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature) +{ +// CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); + return currentFeatures->have[feature]; +} + +void setUseOptimized( bool flag ) +{ + useOptimizedFlag = flag; + currentFeatures = flag ? &featuresEnabled : &featuresDisabled; +} + +bool useOptimized(void) +{ + return useOptimizedFlag; +} + +}} diff --git a/modules/hal/src/merge.cpp b/modules/hal/src/merge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..982b24c2505cffaf33c02e46d23da62a4b93d14c --- /dev/null +++ b/modules/hal/src/merge.cpp @@ -0,0 +1,408 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VMerge2; +template struct VMerge3; +template struct VMerge4; + +#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + store_func(dst, r); \ + } \ + } + +#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + store_func(dst, r); \ + } \ + } + +#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, const data_type* src3, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + r.val[3] = load_func(src3); \ + store_func(dst, r); \ + } \ + } + +MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); +MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); +MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); +MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); + +MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); +MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); +MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); +MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); + +MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); +MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); 
+MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); +MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); + +#elif CV_SSE2 + +template +struct VMerge2 +{ + VMerge2() : support(false) { } + void operator()(const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge3 +{ + VMerge3() : support(false) { } + void operator()(const T *, const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge4 +{ + VMerge4() : support(false) { } + void operator()(const T *, const T *, const T *, const T *, T *) const { } + + bool support; +}; + +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge2() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + } \ + \ + bool support; \ +} + +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge3() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + } \ + \ + bool support; \ +} + +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge4() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + const data_type * src2, const data_type * src3, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = 
_mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ + reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ + } \ + \ + bool support; \ +} + +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); + +#if CV_SSE4_1 +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +#endif + +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); + +#endif + +template static void +merge_( const T** src, T* dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + const T* src0 = src[0]; + for( i = j = 0; i < len; i++, j += cn ) + dst[j] = src0[i]; + } + else if( k == 2 ) + { + const T *src0 = src[0], *src1 = src[1]; + i = j = 0; +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + } + } + else if( k == 3 ) + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; + i = j = 0; +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + dst[j+2] = src2[i]; + } + } + else + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; + i = j = 0; +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } + + for( ; k < cn; k += 4 ) + { + const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } +} + + +void merge8u(const uchar** src, uchar* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge16u(const ushort** src, ushort* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge32s(const int** src, int* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge64s(const int64** src, int64* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +}} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 630565bec3851e0b5780c3c44d3fac006bb7a1c9..16586368e4aadaa7ff9691149190a82359804afe 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -47,3 +47,14 @@ #include #include #include +#include +#include + +#include "opencv2/hal/sse_utils.hpp" +#include "opencv2/hal/neon_utils.hpp" + +#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) +#define ARITHM_USE_IPP 1 +#else +#define ARITHM_USE_IPP 0 +#endif diff --git a/modules/hal/src/replacement.hpp b/modules/hal/src/replacement.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c8cc19224e5ea595e4b708ebeae8707d14a3c239 --- /dev/null +++ b/modules/hal/src/replacement.hpp @@ -0,0 +1,208 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_REPLACEMENT_HPP__ +#define __OPENCV_HAL_REPLACEMENT_HPP__ + +#include "opencv2/hal.hpp" + +inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int 
hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_add8u hal_t_add8u +#define hal_add8s hal_t_add8s +#define hal_add16u hal_t_add16u 
+#define hal_add16s hal_t_add16s +#define hal_add32s hal_t_add32s +#define hal_add32f hal_t_add32f +#define hal_add64f hal_t_add64f +#define hal_sub8u hal_t_sub8u +#define hal_sub8s hal_t_sub8s +#define hal_sub16u hal_t_sub16u +#define hal_sub16s hal_t_sub16s +#define hal_sub32s hal_t_sub32s +#define hal_sub32f hal_t_sub32f +#define hal_sub64f hal_t_sub64f +#define hal_max8u hal_t_max8u +#define hal_max8s hal_t_max8s +#define hal_max16u hal_t_max16u +#define hal_max16s hal_t_max16s +#define hal_max32s hal_t_max32s +#define hal_max32f hal_t_max32f +#define hal_max64f hal_t_max64f +#define hal_min8u hal_t_min8u +#define hal_min8s hal_t_min8s +#define hal_min16u hal_t_min16u +#define hal_min16s hal_t_min16s +#define hal_min32s hal_t_min32s +#define hal_min32f hal_t_min32f +#define hal_min64f hal_t_min64f +#define hal_absdiff8u hal_t_absdiff8u +#define hal_absdiff8s hal_t_absdiff8s +#define hal_absdiff16u hal_t_absdiff16u +#define hal_absdiff16s hal_t_absdiff16s +#define hal_absdiff32s hal_t_absdiff32s +#define hal_absdiff32f hal_t_absdiff32f +#define hal_absdiff64f hal_t_absdiff64f +#define hal_and8u hal_t_and8u +#define hal_or8u hal_t_or8u +#define hal_xor8u hal_t_xor8u +#define hal_not8u hal_t_not8u + +inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_cmp8u hal_t_cmp8u +#define hal_cmp8s hal_t_cmp8s +#define hal_cmp16u hal_t_cmp16u +#define hal_cmp16s hal_t_cmp16s +#define hal_cmp32s hal_t_cmp32s +#define hal_cmp32f hal_t_cmp32f +#define hal_cmp64f hal_t_cmp64f + +inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, 
int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } + +#define hal_mul8u hal_t_mul8u +#define hal_mul8s hal_t_mul8s +#define hal_mul16u hal_t_mul16u +#define hal_mul16s hal_t_mul16s +#define hal_mul32s hal_t_mul32s +#define hal_mul32f hal_t_mul32f +#define hal_mul64f hal_t_mul64f +#define hal_div8u hal_t_div8u +#define hal_div8s hal_t_div8s +#define hal_div16u hal_t_div16u +#define hal_div16s hal_t_div16s +#define hal_div32s hal_t_div32s +#define hal_div32f hal_t_div32f +#define hal_div64f hal_t_div64f +#define hal_recip8u hal_t_recip8u +#define hal_recip8s hal_t_recip8s +#define hal_recip16u hal_t_recip16u +#define hal_recip16s hal_t_recip16s +#define hal_recip32s hal_t_recip32s +#define hal_recip32f hal_t_recip32f +#define hal_recip64f hal_t_recip64f + +inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { 
return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } + +#define hal_addWeighted8u hal_t_addWeighted8u +#define hal_addWeighted8s hal_t_addWeighted8s +#define hal_addWeighted16u hal_t_addWeighted16u +#define hal_addWeighted16s hal_t_addWeighted16s +#define hal_addWeighted32s hal_t_addWeighted32s +#define hal_addWeighted32f hal_t_addWeighted32f +#define hal_addWeighted64f hal_t_addWeighted64f + +#include "custom_hal.hpp" + +#endif diff --git a/modules/hal/src/split.cpp b/modules/hal/src/split.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c31bf8cc44e9cfc2b3c7b73106c89ab8c4bf6bc9 --- /dev/null +++ b/modules/hal/src/split.cpp @@ -0,0 +1,424 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VSplit2; +template struct VSplit3; +template struct VSplit4; + +#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, \ + data_type* dst1) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + } \ + } + +#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + } \ + } + +#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2, data_type* dst3) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + store_func(dst3, r.val[3]); \ + } \ + } + +SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); +SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); + +SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); +SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); + +SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); +SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); + +#elif CV_SSE2 + +template +struct VSplit2 +{ + VSplit2() : support(false) { } + void operator()(const T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit3 +{ + VSplit3() : support(false) { } + void operator()(const T *, T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit4 +{ + VSplit4() : support(false) { } + void operator()(const T *, T *, T *, T *, T *) const { } + + bool support; +}; + +#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit2() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + 
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + } \ + \ + bool support; \ +} + +#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit3() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1, data_type * dst2) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + } \ + \ + bool support; \ +} + +#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit4() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ + data_type * dst2, data_type * dst3) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ + reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ + } \ + \ + bool support; \ +} + +SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); 
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +#endif + +template static void +split_( const T* src, T** dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + T* dst0 = dst[0]; + + if(cn == 1) + { + memcpy(dst0, src, len * sizeof(T)); + } + else + { + for( i = 0, j = 0 ; i < len; i++, j += cn ) + dst0[i] = src[j]; + } + } + else if( k == 2 ) + { + T *dst0 = dst[0], *dst1 = dst[1]; + i = j = 0; + +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } +#elif CV_SSE2 + if (cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + } + } + else if( k == 3 ) + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; + i = j = 0; + +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } +#elif CV_SSE2 + if (cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + dst2[i] = src[j+2]; + } + } + else + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; + i = j = 0; + +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } +#elif CV_SSE2 + if (cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } + + for( ; k < cn; k += 4 ) + { + T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } +} + +void split8u(const uchar* src, uchar** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split16u(const ushort* src, ushort** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split32s(const int* src, int** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split64s(const int64* src, int64** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +}} diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index 7a0cece2f27c5b78810143f0f53d19409d239eeb..3bb8d8e760f6a2cf65c68c02dfdf52bf10231b82 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -94,4 +94,6 @@ extern const float icv8x32fSqrTab[]; #include "_geom.h" #include "filterengine.hpp" +#include "opencv2/hal/sse_utils.hpp" + #endif /*__OPENCV_CV_INTERNAL_H_*/