Commit 027769bf authored by Chip Kerchner, committed by Alexander Alekhin

Merge pull request #15662 from ChipKerchner:addVReverseIntrinsic

* New v_reverse HAL intrinsic for reversing the ordering of a vector

* Fix conflict.

* Try to resolve conflict again.

* Try one more time.

* Add _MM_SHUFFLE. Remove non-vectorize code in SSE2. Fix copy and paste issue with NEON.

* Change v_uint16x8 SSE2 version to use shuffles
Parent: 9ca92499
......@@ -1012,6 +1012,54 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
/** Reverse **/
// Reverse the 32 byte lanes of a 256-bit vector.
// _mm256_shuffle_epi8 only shuffles within each 128-bit lane, so the bytes
// are first reversed inside each lane, then the two 128-bit lanes are
// swapped with _mm256_permute2x128_si256 (imm8 = 1: result.lo = src.hi,
// result.hi = src.lo).
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
static const __m256i perm = _mm256_setr_epi8(
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
}
// Signed variant: reverse via bit-identical unsigned implementation.
inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
// Reverse the 16 x 16-bit lanes: byte-shuffle swaps lanes inside each
// 128-bit half (byte pairs kept little-endian: 14,15 then 12,13, ...),
// then the two halves are swapped as above.
inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
static const __m256i perm = _mm256_setr_epi8(
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
}
inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
// Reverse the 8 x 32-bit lanes with a single cross-lane dword permute
// (AVX2 vpermd takes its indices from a vector, so one instruction suffices).
inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
// Reverse the 4 x 64-bit lanes: vpermq with an immediate selector
// (_MM_SHUFFLE(0,1,2,3) picks source lanes 3,2,1,0 for result lanes 0..3).
inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
////////// Reduce and mask /////////
/** Reduce **/
......
......@@ -1068,6 +1068,79 @@ OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd)
/** Reverse **/
// Reverse the 64 byte lanes of a 512-bit vector.
// With AVX-512 VBMI, vpermb performs the full cross-lane byte permute in
// one instruction. _mm512_set_epi32 lists dwords from most- to
// least-significant, so the last argument (0x3c3d3e3f) forms the low dword:
// its low byte 0x3f = 63 sends source lane 63 to result lane 0.
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
static const __m512i perm = _mm512_set_epi32(
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
// Fallback without VBMI: reverse bytes within each 128-bit lane
// (vpshufb is lane-local), then reverse the order of the eight 64-bit
// chunks with vpermq, which together reverses the whole vector.
static const __m512i shuf = _mm512_set_epi32(
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
__m512i vec = _mm512_shuffle_epi8(a.val, shuf);
return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}
// Signed variant: reverse via bit-identical unsigned implementation.
inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
// Reverse the 32 x 16-bit lanes; same two strategies as the byte version
// (vpermw needs AVX-512BW + VBMI-era availability here; the fallback swaps
// 16-bit pairs in-lane, then permutes 64-bit chunks).
inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
static const __m512i perm = _mm512_set_epi32(
0x00000001, 0x00020003, 0x00040005, 0x00060007,
0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
0x00100011, 0x00120013, 0x00140015, 0x00160017,
0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
static const __m512i shuf = _mm512_set_epi32(
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
__m512i vec = _mm512_shuffle_epi8(a.val, shuf);
return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}
inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
// Reverse the 16 x 32-bit lanes with a single vpermd.
inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15);
return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}
inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
// Reverse the 8 x 64-bit lanes with a single vpermq.
inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}
inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
////////// Reduce /////////
/** Reduce **/
......
......@@ -112,6 +112,7 @@ These operations allow to reorder or recombine elements in one or multiple vecto
- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
- Reverse: @ref v_reverse
- Extract: @ref v_extract
......@@ -215,6 +216,7 @@ Regular integers:
|cvt_flt32 | | | | | | x |
|cvt_flt64 | | | | | | x |
|transpose4x4 | | | | | x | x |
|reverse | x | x | x | x | x | x |
Big integers:
......@@ -224,6 +226,7 @@ Big integers:
|add, sub | x | x |
|shift | x | x |
|logical | x | x |
|reverse | x | x |
|extract | x | x |
|rotate (lanes) | x | x |
|cvt_flt64 | | x |
......@@ -250,6 +253,7 @@ Floating point:
|transpose4x4 | x | |
|extract | x | x |
|rotate (lanes) | x | x |
|reverse | x | x |
@{ */
......@@ -1724,6 +1728,23 @@ inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
}
}
/** @brief Vector reverse order
Reverse the order of the vector
Scheme:
@code
REG {A1 ... An} ==> REG {An ... A1}
@endcode
For all types. */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
{
    // Scalar fallback: lane j of the input becomes lane (n-1-j) of the result.
    v_reg<_Tp, n> res;
    for( int j = n; j-- > 0; )
        res.s[n - 1 - j] = a.s[j];
    return res;
}
/** @brief Vector extract
Scheme:
......
......@@ -1585,6 +1585,52 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif
// Reverse the 16 byte lanes: vrev64q_u8 reverses the bytes inside each
// 64-bit half, then vextq_u8(vec, vec, 8) rotates the vector by 8 bytes,
// which swaps the two halves.
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
uint8x16_t vec = vrev64q_u8(a.val);
return v_uint8x16(vextq_u8(vec, vec, 8));
}
// Signed variant: reverse via bit-identical unsigned implementation.
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
// Same pattern for 16-bit lanes: in-half reverse + rotate by 4 lanes.
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
uint16x8_t vec = vrev64q_u16(a.val);
return v_uint16x8(vextq_u16(vec, vec, 4));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
// Same pattern for 32-bit lanes: in-half reverse + rotate by 2 lanes.
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
uint32x4_t vec = vrev64q_u32(a.val);
return v_uint32x4(vextq_u32(vec, vec, 2));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
// Two 64-bit lanes: just recombine high half before low half
// (no vrev128 exists on NEON).
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
uint64x2_t vec = a.val;
uint64x1_t vec_lo = vget_low_u64(vec);
uint64x1_t vec_hi = vget_high_u64(vec);
return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
// float64x2 only exists on AArch64 builds (CV_SIMD128_64F).
#if CV_SIMD128_64F
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
......
......@@ -1914,6 +1914,59 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
// Reverse the 16 byte lanes. With SSSE3 a single pshufb does the whole
// permute; plain SSE2 has no byte shuffle, so spill to an aligned buffer
// and rebuild the vector element-wise.
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
#else
uchar CV_DECL_ALIGNED(32) d[16];
v_store_aligned(d, a);
return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
#endif
}
// Signed variant: reverse via bit-identical unsigned implementation.
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
// Reverse the 8 x 16-bit lanes. SSE2 fallback: reverse the four dwords,
// then swap the 16-bit halves of each dword separately in the low and
// high quadwords (pshuflw/pshufhw), yielding the full lane reversal.
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
#else
__m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
return v_uint16x8(r);
#endif
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
// Reverse the 4 x 32-bit lanes with one pshufd (selector 3,2,1,0 reversed).
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
// Swap the two 64-bit lanes via a dword shuffle (pairs kept in order).
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
......
......@@ -678,6 +678,53 @@ OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Reverse */
// Reverse the 16 byte lanes with a single vec_perm; the constant pattern
// lists the source byte index for each destination byte.
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_uint8x16(vec_perm(vec, vec, perm));
}
// Signed variant: reverse via bit-identical unsigned implementation.
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
// 16-bit lanes: same vec_perm, keeping each byte pair together.
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
// 32-bit lanes: vec_perm moving whole 4-byte groups.
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
// 64-bit lanes: vec_perm swapping the two 8-byte halves.
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
......
......@@ -1115,6 +1115,22 @@ template<typename R> struct TheTest
return *this;
}
// Verify that v_reverse() returns the lanes of the input vector in
// reversed order, lane by lane.
TheTest & test_reverse()
{
Data<R> src;
R a = src;
Data<R> rev = v_reverse(a);
const int lanes = R::nlanes;
for (int k = 0; k < lanes; ++k)
{
SCOPED_TRACE(cv::format("i=%d", k));
EXPECT_EQ(src[lanes - k - 1], rev[k]);
}
return *this;
}
template<int s>
TheTest & test_extract()
{
......@@ -1459,6 +1475,7 @@ void test_hal_intrin_uint8()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_pack_b()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
......@@ -1497,6 +1514,7 @@ void test_hal_intrin_int8()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
......@@ -1529,6 +1547,7 @@ void test_hal_intrin_uint16()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
......@@ -1561,6 +1580,7 @@ void test_hal_intrin_int16()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
......@@ -1590,6 +1610,7 @@ void test_hal_intrin_uint32()
.test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
......@@ -1619,6 +1640,7 @@ void test_hal_intrin_int32()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
......@@ -1637,6 +1659,7 @@ void test_hal_intrin_uint64()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
......@@ -1650,6 +1673,7 @@ void test_hal_intrin_int64()
.test_addsub()
.test_shift<1>().test_shift<8>()
.test_logic()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
.test_cvt64_double()
......@@ -1680,6 +1704,7 @@ void test_hal_intrin_float32()
.test_matmul()
.test_transpose()
.test_reduce_sum4()
.test_reverse()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
;
......@@ -1709,6 +1734,7 @@ void test_hal_intrin_float64()
.test_unpack()
.test_float_math()
.test_float_cvt32()
.test_reverse()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register to post a comment