diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 5a105a7a41872cf15991c00c9abfdc66ce9c18c5..52f6b5d55200debeda848c81845ba88299eb2078 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -99,6 +99,7 @@ enum StoreMode
 };
 
+// TODO FIXIT: Don't use "God" traits. Split on separate cases.
 template<typename _Tp> struct V_TypeTraits
 {
 };
@@ -130,21 +131,51 @@ template<typename _Tp> struct V_TypeTraits
         } \
     }
 
+#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_, nlanes128_) \
+    template<> struct V_TypeTraits<type> \
+    { \
+        typedef type value_type; \
+        typedef int_type_ int_type; \
+        typedef abs_type_ abs_type; \
+        typedef uint_type_ uint_type; \
+        typedef w_type_ w_type; \
+        typedef sum_type_ sum_type; \
+        enum { nlanes128 = nlanes128_ }; \
+        \
+        static inline int_type reinterpret_int(type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.l = x; \
+            return v.i; \
+        } \
+        \
+        static inline type reinterpret_from_int(int_type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.i = x; \
+            return v.l; \
+        } \
+    }
+
 CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned, 16);
 CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int, 16);
 CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned, 8);
 CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int, 8);
-CV_INTRIN_DEF_TYPE_TRAITS(unsigned, int, unsigned, unsigned, uint64, void, unsigned, 4);
-CV_INTRIN_DEF_TYPE_TRAITS(int, int, unsigned, unsigned, int64, void, int, 4);
-CV_INTRIN_DEF_TYPE_TRAITS(float, int, unsigned, float, double, void, float, 4);
-CV_INTRIN_DEF_TYPE_TRAITS(uint64, int64, uint64, uint64, void, void, uint64, 2);
-CV_INTRIN_DEF_TYPE_TRAITS(int64, int64, uint64, uint64, void, void, int64, 2);
-CV_INTRIN_DEF_TYPE_TRAITS(double, int64, uint64, double, void, void, double, 2);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned, 4);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int, 4);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float, 4);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64, 2);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64, 2);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double, 2);
 
 #ifndef CV_DOXYGEN
 
 #ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
-#ifdef CV_CPU_DISPATCH_MODE
+#ifdef CV_FORCE_SIMD128_CPP
+  #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
+  #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
+  #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#elif defined(CV_CPU_DISPATCH_MODE)
   #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
   #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
   #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
@@ -197,7 +228,6 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 
 #else
 
-#define CV_SIMD128_CPP 1
 #include "opencv2/core/hal/intrin_cpp.hpp"
 
 #endif
@@ -242,6 +272,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128 0
 #endif
 
+#ifndef CV_SIMD128_CPP
+#define CV_SIMD128_CPP 0
+#endif
+
 #ifndef CV_SIMD128_64F
 #define CV_SIMD128_64F 0
 #endif
@@ -346,7 +380,7 @@ template<typename R> struct V_RegTraits
 CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
 CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
 CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
-#if CV_SIMD128_64F
+#if CV_SIMD128_64F || CV_SIMD128_CPP
 CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
 #else
 CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
@@ -433,7 +467,11 @@ namespace CV__SIMD_NAMESPACE {
 } // namespace
 using namespace CV__SIMD_NAMESPACE;
 #elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+#if defined CV_SIMD128_CPP
+#define CV__SIMD_NAMESPACE simd128_cpp
+#else
 #define CV__SIMD_NAMESPACE simd128
+#endif
 namespace CV__SIMD_NAMESPACE {
 #define CV_SIMD CV_SIMD128
 #define CV_SIMD_64F CV_SIMD128_64F
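The new CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE macro differs from CV_INTRIN_DEF_TYPE_TRAITS only in dropping the q_type member, which has no meaning for 32- and 64-bit lanes; for each lane type it still generates the typedefs plus a pair of union-based bit-reinterpretation helpers. A minimal standalone sketch of the pattern the macro expands to (hand-expanded and illustrative only; the struct name and main() below are not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// Hand-expanded equivalent of what the trait macro generates for float.
struct FloatTraits
{
    typedef float   value_type;
    typedef int32_t int_type;   // same-sized integer type

    // Reinterpret the bits of a float as a 32-bit integer (no value conversion).
    static int_type reinterpret_int(float x)
    {
        union { float l; int_type i; } v;
        v.l = x;
        return v.i;
    }

    // Reinterpret 32 bits back into a float.
    static float reinterpret_from_int(int_type x)
    {
        union { float l; int_type i; } v;
        v.i = x;
        return v.l;
    }
};

int main()
{
    float x = -1.5f;
    // Bit-level round trip: the value is preserved exactly.
    float y = FloatTraits::reinterpret_from_int(FloatTraits::reinterpret_int(x));
    assert(std::memcmp(&x, &y, sizeof(float)) == 0);
    return 0;
}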
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 15ae380e65f05fd2843c2218768919920d5510cc..d9719b7fa0bbd4a64585cca298cc2a2f7716d627 100644
--- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
@@ -50,6 +50,14 @@
 #include <algorithm>
 #include "opencv2/core/saturate.hpp"
 
+//! @cond IGNORED
+#define CV_SIMD128_CPP 1
+#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN)
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#endif
+//! @endcond
+
 namespace cv
 {
 
@@ -135,7 +143,7 @@ Element-wise binary and unary operations.
 @ref v_shl, @ref v_shr
 
 - Bitwise logic:
-@ref operator&(const v_reg &a, const v_reg &b) "&",
+@ref operator &(const v_reg &a, const v_reg &b) "&",
 @ref operator |(const v_reg &a, const v_reg &b) "|",
 @ref operator ^(const v_reg &a, const v_reg &b) "^",
 @ref operator ~(const v_reg &a) "~"
@@ -402,50 +410,102 @@ typedef v_reg<uint64, 2> v_uint64x2;
 /** @brief Two 64-bit signed integer values */
 typedef v_reg<int64, 2> v_int64x2;
 
-//! @brief Helper macro
-//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> \
-    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    v_reg<_Tp, n> c; \
-    for( int i = 0; i < n; i++ ) \
-        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return c; \
-} \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& \
-    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
-{ \
-    for( int i = 0; i < n; i++ ) \
-        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
-    return a; \
-}
-
 /** @brief Add values
 
 For all types. */
-OPENCV_HAL_IMPL_BIN_OP(+)
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
 
 /** @brief Subtract values
 
 For all types. */
-OPENCV_HAL_IMPL_BIN_OP(-)
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
 
 /** @brief Multiply values
 
 For 16- and 32-bit integer types and floating types. */
-OPENCV_HAL_IMPL_BIN_OP(*)
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
 
 /** @brief Divide values
 
 For floating types only. */
-OPENCV_HAL_IMPL_BIN_OP(/)
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
 
-//! @brief Helper macro
-//! @ingroup core_hal_intrin_impl
-#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
-    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+
+/** @brief Bitwise AND
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise OR
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise XOR
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise NOT
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
+
+
+#ifndef CV_DOXYGEN
+
+#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
+__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
+__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(double, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
+template<int n> inline \
+v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return c; \
+} \
+template<int n> inline \
+v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return a; \
+}
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
+
+CV__HAL_INTRIN_IMPL_BIN_OP(+)
+CV__HAL_INTRIN_IMPL_BIN_OP(-)
+CV__HAL_INTRIN_IMPL_BIN_OP(*)
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)
+
+#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
     v_reg<_Tp, n> c; \
     typedef typename V_TypeTraits<_Tp>::int_type itype; \
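Replacing OPENCV_HAL_IMPL_BIN_OP with per-type expansion changes only how the operators are declared and instantiated; call sites keep the usual operator syntax. A hedged usage sketch, assuming a build where opencv2/core/hal/intrin.hpp provides the 128-bit types (the data and names below are illustrative, not from the patch):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    using namespace cv;

    float a_data[4] = { 1.f, 2.f, 3.f, 4.f };
    float b_data[4] = { 10.f, 20.f, 30.f, 40.f };

    v_float32x4 a = v_load(a_data);
    v_float32x4 b = v_load(b_data);

    // Element-wise arithmetic: declared once per lane type by the macros above.
    v_float32x4 sum    = a + b;
    v_float32x4 scaled = sum * v_setall_f32(0.5f);

    // Bitwise operators are provided for integer lane types.
    v_int32x4 m = v_setall_s32(0x0F) & v_setall_s32(0x3C);   // 0x0C in every lane

    float out[4];
    v_store(out, scaled);
    std::printf("%g %g %g %g  mask=%d\n", out[0], out[1], out[2], out[3], m.get0());
    return 0;
}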
@@ -454,8 +514,8 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
     return c; \
 } \
-template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
-    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+template<int n> CV_INLINE \
+v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
 { \
     typedef typename V_TypeTraits<_Tp>::int_type itype; \
     for( int i = 0; i < n; i++ ) \
@@ -464,33 +524,29 @@ template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
         return a; \
 }
 
-/** @brief Bitwise AND
+#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
 
-Only for integer types. */
-OPENCV_HAL_IMPL_BIT_OP(&)
-
-/** @brief Bitwise OR
-
-Only for integer types. */
-OPENCV_HAL_IMPL_BIT_OP(|)
+CV__HAL_INTRIN_IMPL_BIT_OP(&)
+CV__HAL_INTRIN_IMPL_BIT_OP(|)
+CV__HAL_INTRIN_IMPL_BIT_OP(^)
 
-/** @brief Bitwise XOR
+#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
+    return c; \
+} \
 
-Only for integer types.*/
-OPENCV_HAL_IMPL_BIT_OP(^)
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)
 
-/** @brief Bitwise NOT
+#endif // !CV_DOXYGEN
 
-Only for integer types.*/
-template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
-{
-    v_reg<_Tp, n> c;
-    for( int i = 0; i < n; i++ )
-    {
-        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
-    }
-    return c;
-}
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
@@ -503,6 +559,27 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
     return c; \
 }
 
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
+inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
+{ \
+    v_reg<int, 4> c; \
+    for( int i = 0; i < 4; i++ ) \
+        c.s[i] = cfunc(a.s[i]); \
+    return c; \
+} \
+inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
+{ \
+    v_reg<int, 4> c; \
+    for( int i = 0; i < 2; i++ ) \
+    { \
+        c.s[i] = cfunc(a.s[i]); \
+        c.s[i + 2] = 0; \
+    } \
+    return c; \
+}
+
 /** @brief Square root of elements
 
 Only for floating point types.*/
@@ -524,22 +601,22 @@ OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
 /** @brief Round elements
 
 Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
+OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_round, cvRound)
 
 /** @brief Floor elements
 
 Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
+OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_floor, cvFloor)
 
 /** @brief Ceil elements
 
 Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
+OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_ceil, cvCeil)
 
 /** @brief Truncate elements
 
 Only for floating point types.*/
-OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
+OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(v_trunc, int)
 
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
@@ -1083,9 +1160,8 @@ OPENCV_HAL_IMPL_SHIFT_OP(<< )
 For 16-, 32- and 64-bit integer values. */
 OPENCV_HAL_IMPL_SHIFT_OP(>> )
 
-/** @brief Element shift left among vector
-
-For all type */
+//! @brief Helper macro
+//! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
 { \
@@ -1127,7 +1203,14 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(co
     return c; \
 }
 
+/** @brief Element shift left among vector
+
+For all type */
 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)
+
+/** @brief Element shift right among vector
+
+For all type */
 OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
 
 /** @brief Sum packed values
@@ -1389,6 +1472,7 @@ similar to cv::v_load, but source memory block should be aligned (to 16-byte bou
 template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
 {
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>)>(ptr));
     return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
 }
 
@@ -1620,6 +1704,12 @@ inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
         ptr[i] = a.s[i];
 }
 
+template<typename _Tp, int n>
+inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
+{
+    v_store(ptr, a);
+}
+
 /** @brief Store data to memory (lower half)
 
 Store lower half of register contents to memory.
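v_load_aligned and the v_store_aligned family in the C++ fallback now CV_Assert the pointer alignment instead of silently accepting any address, matching the contract of the SSE/NEON backends. A short usage sketch under that contract (the buffer name is illustrative):

#include <opencv2/core/hal/intrin.hpp>

int main()
{
    using namespace cv;

    // 16-byte aligned destination, as required by v_store_aligned.
    float CV_DECL_ALIGNED(16) dst[4];

    v_float32x4 v = v_setall_f32(3.f);
    v_store_aligned(dst, v);     // OK: dst meets the alignment the fallback now asserts

    // For unaligned addresses, use v_store instead:
    // v_store(some_unaligned_ptr, v);

    return (int)dst[0] - 3;      // 0
}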
@@ -1659,22 +1749,22 @@ Pointer __should__ be aligned by 16-byte boundary.
 */
 template<typename _Tp, int n> inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a);
 }
 
 template<typename _Tp, int n> inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
 {
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a);
 }
 
 template<typename _Tp, int n> inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
 {
-    for( int i = 0; i < n; i++ )
-        ptr[i] = a.s[i];
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a);
 }
 
 /** @brief Combine vector from first elements of two vectors
@@ -1940,6 +2030,17 @@ template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
     return c;
 }
 
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
+{
+    v_reg<float, n*2> c;
+    for( int i = 0; i < n; i++ )
+    {
+        c.s[i] = (float)a.s[i];
+        c.s[i+n] = 0;
+    }
+    return c;
+}
+
 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
 {
     v_reg<float, n*2> c;
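The added single-argument v_cvt_f32 overload converts the two double lanes and zero-fills the upper half of the float register, while the existing two-argument form fills all four lanes. A brief sketch, assuming double lanes are available (CV_SIMD128_64F); the values are arbitrary:

#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    using namespace cv;

    v_float64x2 lo(1.5, 2.5), hi(3.5, 4.5);

    v_float32x4 half = v_cvt_f32(lo);       // {1.5f, 2.5f, 0.f, 0.f}
    v_float32x4 full = v_cvt_f32(lo, hi);   // {1.5f, 2.5f, 3.5f, 4.5f}

    float a[4], b[4];
    v_store(a, half);
    v_store(b, full);
    std::printf("%g %g %g %g | %g %g %g %g\n",
                a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
    return 0;
}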
@@ -1954,36 +2055,76 @@
 /** @brief Convert to double
 
 Supported input type is cv::v_int32x4. */
-template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
+CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int, 4>& a)
 {
+    enum { n = 2 };
     v_reg<double, n> c;
     for( int i = 0; i < n; i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
 
+/** @brief Convert to double high part of vector
+
+Supported input type is cv::v_int32x4. */
+CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int, 4>& a)
+{
+    enum { n = 2 };
+    v_reg<double, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (double)a.s[i + 2];
+    return c;
+}
+
 /** @brief Convert to double
 
 Supported input type is cv::v_float32x4. */
-template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
+CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<float, 4>& a)
 {
+    enum { n = 2 };
     v_reg<double, n> c;
     for( int i = 0; i < n; i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
 
+/** @brief Convert to double high part of vector
+
+Supported input type is cv::v_float32x4. */
+CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<float, 4>& a)
+{
+    enum { n = 2 };
+    v_reg<double, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (double)a.s[i + 2];
+    return c;
+}
+
 /** @brief Convert to double
 
 Supported input type is cv::v_int64x2. */
-template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
+CV_INLINE v_reg<double, 2> v_cvt_f64(const v_reg<int64, 2>& a)
 {
+    enum { n = 2 };
     v_reg<double, n> c;
     for( int i = 0; i < n; i++ )
         c.s[i] = (double)a.s[i];
     return c;
 }
 
+/** @brief Convert to double high part of vector
+
+Supported input type is cv::v_int64x2. */
+CV_INLINE v_reg<double, 2> v_cvt_f64_high(const v_reg<int64, 2>& a)
+{
+    enum { n = 2 };
+    v_reg<double, n> c;
+    for( int i = 0; i < n; i++ )
+        c.s[i] = (double)a.s[i];
+    return c;
+}
+
+
 template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
 {
     v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
@@ -2038,6 +2179,28 @@ template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg
 
 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx, v_reg<float, n>& x, v_reg<float, n>& y)
 {
@@ -2062,7 +2225,7 @@ template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
 {
-    v_reg c;
+    v_reg<_Tp, n> c;
     for (int i = 0; i < n/4; i++)
     {
         c.s[4*i  ] = vec.s[4*i  ];
@@ -2075,7 +2238,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_re
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
 {
-    v_reg c;
+    v_reg<_Tp, n> c;
     for (int i = 0; i < n/8; i++)
     {
         c.s[8*i  ] = vec.s[8*i  ];
@@ -2092,7 +2255,7 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_re
 template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
 {
-    v_reg c;
+    v_reg<_Tp, n> c;
     for (int i = 0; i < n/4; i++)
     {
         c.s[3*i  ] = vec.s[4*i  ];
@@ -2523,6 +2686,17 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
 }
 
+
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
+
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
 ////// FP16 support ///////
 
 inline v_reg<float, V_TypeTraits<float>::nlanes128>
@@ -2537,7 +2711,7 @@ v_load_expand(const float16_t* ptr)
 }
 
 inline void
-v_pack_store(float16_t* ptr, v_reg<float, V_TypeTraits<float>::nlanes128>& v)
+v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
 {
     for( int i = 0; i < v.nlanes; i++ )
     {
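The new v_dotprod_expand overloads widen the int32 lanes to double before multiplying, so the per-lane products cannot overflow; how the four products are paired into the two double lanes is an implementation detail, but their total always equals the scalar dot product. A hedged check of that property, assuming double lanes (CV_SIMD128_64F) are available:

#include <opencv2/core/hal/intrin.hpp>
#include <cassert>

int main()
{
    using namespace cv;

    int a_data[4] = { 100000, -200000, 300000, 400000 };
    int b_data[4] = {  70000,   80000, -90000, 100000 };

    v_int32x4 a = v_load(a_data);
    v_int32x4 b = v_load(b_data);

    // Products are formed in double precision, so they do not overflow int32.
    v_float64x2 d = v_dotprod_expand(a, b);

    double expected = 0;
    for (int i = 0; i < 4; i++)
        expected += (double)a_data[i] * b_data[i];

    // Reducing the two double lanes recovers the full scalar dot product.
    assert(v_reduce_sum(d) == expected);
    return 0;
}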
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 3d1376dc8b2ba9f6002e61a7ae65b1696d49bc1d..760bbcb088dce362523d67c5e51c2c860c88b2c0 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1522,7 +1522,8 @@ struct InRange_SIMD
             v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
             v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
 
-            v_pack_store(dst + x, v_pack(v_reinterpret_as_u32((values1 >= low1) & (high1 >= values1)), v_reinterpret_as_u32((values2 >= low2) & (high2 >= values2))));
+            v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
+                                         v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
         }
         vx_cleanup();
         return x;
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index cd41237efdbb552b03de5070767c9cd2ac32c65e..addd5a3b5e9a58d20b1044f6968fda730f0724fd 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -1593,7 +1593,7 @@ struct op_div_f
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
     {
-        const Tvec v_zero = Tvec();
+        const Tvec v_zero = vx_setall(0);
         return v_select(b == v_zero, v_zero, a / b);
     }
     static inline T1 r(T1 a, T1 b)
@@ -1620,7 +1620,7 @@ struct op_div_scale
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = Tvec();
+        const Tvec v_zero = vx_setall(0);
        return v_select(denom == v_zero, v_zero, res);
     }
     static inline T1 r(T1 a, T1 denom, const T2* scalar)
@@ -1860,7 +1860,7 @@ struct op_recip
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = Tvec();
+        const Tvec v_zero = vx_setall(0);
         return v_select(denom == v_zero, v_zero, res);
     }
     static inline T1 r(T1 denom, const T2* scalar)
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index 649f6baac533d14f9a4b1b586542886429375a1b..486b7a5aba14e05b6c78558e9742cbc8b7dd9a9c 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -916,8 +916,9 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
             result = true;
             d = 1./d;
 #if CV_SIMD128
-            static const float CV_DECL_ALIGNED(16) inv[4] = { 0.f,-0.f,-0.f,0.f };
-            v_float32x4 s0 = (v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * v_setall_f32((float)d)) ^ v_load((const float *)inv);//0123//3120
+            const float d_32f = (float)d;
+            const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
+            v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
             s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
             v_store_low((float*)dstdata, s0);
             v_store_high((float*)(dstdata + dststep), s0);
@@ -946,7 +947,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
             v_float64x2 s0 = v_load((const double*)srcdata) * det;
             v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
             v_float64x2 sm = v_extract<1>(s1, s0);//30
-            v_float64x2 ss = v_extract<1>(s0, s1) ^ v_setall_f64(-0.);//12
+            v_float64x2 ss = v_setall(0) - v_extract<1>(s0, s1);//12
             v_store((double*)dstdata, v_combine_low(sm, ss));//31
             v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
 #else
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index ba8a5477de94a60088f311bae93832f6ce9d28c4..1bf36bb17407ab4a3947a7aba5d5814c17579a03 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -725,7 +725,7 @@ void log32f( const float *_x, float *y, int n )
 
             yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);
 
-            v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
+            v_float32 delta = v_select(v_reinterpret_as_f32(h0 == vx_setall_s32(510)), vshift, vx_setall(0));
             xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);
 
             v_float32 zf0 = v_fma(xf0, vA0, vA1);
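The call-site changes above (op_div_f/op_div_scale/op_recip, log32f, invert) move away from AND-ing float values with comparison masks, because the plain C++ fallback does not guarantee an all-ones bit pattern for a "true" lane; v_select expresses the same intent portably. A small sketch of that pattern (the function name and data are illustrative, not from the patch):

#include <opencv2/core/hal/intrin.hpp>

// Zero the lanes of `val` where `denom` is zero -- the guard used by
// op_div_f / op_recip above, written with v_select instead of a bitwise AND.
static cv::v_float32x4 zero_where_denom_is_zero(const cv::v_float32x4& val,
                                                const cv::v_float32x4& denom)
{
    using namespace cv;
    const v_float32x4 v_zero = v_setall_f32(0.f);
    // (denom == v_zero) yields a per-lane mask; v_select picks v_zero there.
    return v_select(denom == v_zero, v_zero, val);
}

int main()
{
    using namespace cv;
    float d_data[4] = { 2.f, 0.f, 4.f, 0.f };
    float v_data[4] = { 8.f, 8.f, 8.f, 8.f };
    v_float32x4 r = zero_where_denom_is_zero(v_load(v_data), v_load(d_data));
    float out[4];
    v_store(out, r);        // {8, 0, 8, 0}
    return (int)out[1];     // 0
}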
diff --git a/modules/core/test/test_intrin_emulator.cpp b/modules/core/test/test_intrin_emulator.cpp
index 0ae3c02b86efeb4c1b3951b9533634333c2eac89..347bc8fee1147a7125f1452dc8dfe61a56a23182 100644
--- a/modules/core/test/test_intrin_emulator.cpp
+++ b/modules/core/test/test_intrin_emulator.cpp
@@ -3,22 +3,14 @@
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
 
-// see "opencv2/core/hal/intrin.hpp"
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
-#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
-
 // see "opencv2/core/private/cv_cpu_include_simd_declarations.hpp"
 //#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-#define CV_FORCE_SIMD128_CPP
+#undef CV_FORCE_SIMD128_CPP
+#define CV_FORCE_SIMD128_CPP 1
 #undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 #undef CV_CPU_OPTIMIZATION_NAMESPACE_END
 #define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace opt_EMULATOR_CPP {
 #define CV_CPU_OPTIMIZATION_NAMESPACE_END }
 #include "test_intrin128.simd.hpp"
-#undef CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-#undef CV_CPU_OPTIMIZATION_NAMESPACE_END
-#undef CV_CPU_DISPATCH_MODE
-#undef CV_FORCE_SIMD128_CPP
 
 // tests implementation is in test_intrin_utils.hpp
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 4d2bd467371075bfb1bcebcea9a855124037088f..d8d94fdb0da4ed835f73b8d84927f0a35d18eecf 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -222,7 +222,10 @@ template<typename R> std::ostream & operator<<(std::ostream & out, const Data
 static inline void EXPECT_COMPARE_EQ_(const T a, const T b);
-
+template<typename T> static inline void EXPECT_COMPARE_EQ_(const T a, const T b)
+{
+    EXPECT_EQ(a, b);
+}
 template<> inline void EXPECT_COMPARE_EQ_(const float a, const float b)
 {
     EXPECT_FLOAT_EQ( a, b );
@@ -742,12 +745,12 @@ template<typename R> struct TheTest
         for (int i = 0; i < n; ++i)
         {
             SCOPED_TRACE(cv::format("i=%d", i));
-            EXPECT_EQ((double)dataA[i*2] * (double)dataA[i*2] +
-                      (double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
-            EXPECT_EQ((double)dataB[i*2] * (double)dataB[i*2] +
-                      (double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
-            EXPECT_EQ((double)dataA[i*2] * (double)dataB[i*2] +
-                      (double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
+            EXPECT_COMPARE_EQ((double)dataA[i*2] * (double)dataA[i*2] +
+                              (double)dataA[i*2 + 1] * (double)dataA[i*2 + 1], resA[i]);
+            EXPECT_COMPARE_EQ((double)dataB[i*2] * (double)dataB[i*2] +
+                              (double)dataB[i*2 + 1] * (double)dataB[i*2 + 1], resB[i]);
+            EXPECT_COMPARE_EQ((double)dataA[i*2] * (double)dataB[i*2] +
+                              (double)dataA[i*2 + 1] * (double)dataB[i*2 + 1] + dataC[i], resC[i]);
         }
 #endif
         return *this;
diff --git a/modules/features2d/src/fast_score.cpp b/modules/features2d/src/fast_score.cpp
index 73126e647b2727bc0dddeb85f4d5fa17cd3506cf..0bc011af491f4ec545ea1a7cbba21eae408966b9 100644
--- a/modules/features2d/src/fast_score.cpp
+++ b/modules/features2d/src/fast_score.cpp
@@ -303,7 +303,8 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
     for (k = 0; k < N; k++)
         d[k] = (short)(v - ptr[pixel[k]]);
 
-#if CV_SIMD128
+#if CV_SIMD128 \
+    && (!defined(CV_SIMD128_CPP) || (!defined(__GNUC__) || __GNUC__ != 5))  // "movdqa" bug on "v_load(d + 1)" line (Ubuntu 16.04 + GCC 5.4)
     if (true)
     {
         v_int16x8 v0 = v_load(d + 1);
diff --git a/modules/imgproc/src/corner.avx.cpp b/modules/imgproc/src/corner.avx.cpp
index 1a62db3074818975d5a471b2ec0786526f965ad6..8d8083eee5994b25e10f5e0091f1a51ae11bc8df 100644
--- a/modules/imgproc/src/corner.avx.cpp
+++ b/modules/imgproc/src/corner.avx.cpp
@@ -42,6 +42,7 @@
 //M*/
 
 #include "precomp.hpp"
+#undef CV_FORCE_SIMD128_CPP // expected AVX implementation only
 #include "opencv2/core/hal/intrin.hpp"
 #include "corner.hpp"
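With the namespace selection added in intrin.hpp, a translation unit can force the portable C++ implementation by defining CV_FORCE_SIMD128_CPP before including the universal-intrinsics header; the emulator test above does exactly this, and corner.avx.cpp now explicitly undefines it. A minimal sketch of that usage (illustrative only; build flags and surrounding setup are assumed, not shown in the patch):

// force_cpp_simd_example.cpp -- illustrative only
#define CV_FORCE_SIMD128_CPP 1          // select the hal_EMULATOR_CPP namespace
#include <opencv2/core/hal/intrin.hpp>

int main()
{
    using namespace cv;
    // CV_SIMD128_CPP is now 1 and the v_* types are the portable v_reg<> ones,
    // so the same element-wise loops run on any architecture.
    v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
    v_uint8x16 c = a + b;               // saturating add in the C++ fallback
    return (int)c.get0() - 255;         // 0: 200 + 100 saturates to 255
}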