提交 6a6ccf60 编写于 作者: M Maksim Shabunin

v_extract universal intrinsic

上级 11c3fa52
......@@ -566,6 +566,7 @@ inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
c.s[i] = a.s[i];
c.s[i+(n/2)] = b.s[i];
}
return c;
}
template<typename _Tp, int n>
......@@ -577,6 +578,7 @@ inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>&
c.s[i] = a.s[i+(n/2)];
c.s[i+(n/2)] = b.s[i+(n/2)];
}
return c;
}
template<typename _Tp, int n>
......@@ -592,6 +594,18 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
}
}
template<int s, typename _Tp, int n>
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
v_reg<_Tp, n> r;
int i = 0;
for (; i < s; ++i)
r.s[i] = a.s[i+n-s];
for (; i < n; ++i)
r.s[i] = b.s[i-s];
return r;
}
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
v_reg<int, n> c;
......
......@@ -557,6 +557,8 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
......@@ -720,6 +722,23 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
}
OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
inline v_int32x4 v_round(const v_float32x4& a)
{
static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
......
......@@ -1149,6 +1149,17 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
const int w = sizeof(typename _Tpvec::lane_type);
const int n = _Tpvec::nlanes;
__m128i ra, rb;
ra = _mm_srli_si128(a.val, s*w);
rb = _mm_slli_si128(b.val, (n-s)*w);
return _Tpvec(_mm_or_si128(ra, rb));
}
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册