Commit 903789f7 authored by Tomoaki Teshima

use universal intrinsic for FP16

  * use v_float16x4 (universal intrinsic) instead of the raw SSE/NEON implementations (see the usage sketch below)
  * define v_load_f16/v_store_f16, since an FP16 v_load overload can't be distinguished when a short pointer is passed
  * clean up the implementation for old compilers (guard correctly)
  * add tests for v_load_f16 and for the round-trip conversion of v_float16x4
  * fix a conversion error
Parent 28db4a22
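For context, here is a minimal usage sketch of the new FP16 universal intrinsics, mirroring the cvtScaleHalf_ loops changed further down in this diff. The include path, the cv:: namespace qualification, and the roundtrip_fp16 helper are assumptions made for illustration only; they are not part of the commit.

// Sketch only: assumes a build where CV_FP16 is detected and the universal
// intrinsics are provided by this header in namespace cv.
#include "opencv2/core/hal/intrin.hpp"

#if CV_FP16
// Round-trip 4 floats through half precision using the new intrinsics.
static void roundtrip_fp16(const float* src, short* half_buf, float* dst)
{
    cv::v_float32x4 v_src  = cv::v_load(src);         // load 4 packed floats
    cv::v_float16x4 v_half = cv::v_cvt_f16(v_src);    // FP32 -> FP16
    cv::v_store_f16(half_buf, v_half);                 // store 4 FP16 values as short

    // v_load_f16 is a separate name because v_load(const short*) already
    // returns v_int16x8, so an FP16 load cannot be selected by overload.
    cv::v_float16x4 v_back = cv::v_load_f16(half_buf);
    cv::v_float32x4 v_dst  = cv::v_cvt_f32(v_back);    // FP16 -> FP32
    cv::v_store(dst, v_dst);                           // store 4 floats back
}
#endif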
......@@ -275,6 +275,39 @@ struct v_float64x2
};
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
static inline short vget_lane_f16(float16x4_t a, int b)
{ return vget_lane_s16(vreinterpret_s16_f16(a), b); }
struct v_float16x4
{
typedef short lane_type;
enum { nlanes = 4 };
v_float16x4() {}
explicit v_float16x4(float16x4_t v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
{
short v[] = {v0, v1, v2, v3};
val = vld1_f16(v);
}
short get0() const
{
return vget_lane_f16(val, 0);
}
float16x4_t val;
};
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
......@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ vst1_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
......@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#endif
#if defined (HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
return v_float32x4(vcvt_f32_f16(a.val));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
return v_float16x4(vcvt_f16_f32(a.val));
}
#endif
//! @endcond
......
......@@ -252,6 +252,26 @@ struct v_float64x2
__m128d val;
};
#if defined(HAVE_FP16)
struct v_float16x4
{
typedef short lane_type;
enum { nlanes = 4 };
v_float16x4() {}
explicit v_float16x4(__m128i v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
{
val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
}
__m128i val;
};
#endif
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
......@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
#if defined(HAVE_FP16)
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ _mm_storel_epi64((__m128i*)ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
......@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
}
#if defined(HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
return v_float32x4(_mm_cvtph_ps(a.val));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif
//! @endcond
}
......
......@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
}
#endif
-#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
-#if 5 <= __GNUC__
-static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); }
-static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); }
-#else
-static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); }
-static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); }
-#endif
-#endif
// template for FP16 HW conversion function
template<typename T, typename DT> static void
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
......@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-__m128 v_src = _mm_loadu_ps(src + x);
-__m128i v_dst = _mm_cvtps_ph(v_src, 0);
-_mm_storel_epi64((__m128i *)(dst + x), v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-float32x4_t v_src = vld1q_f32(src + x);
-float16x4_t v_dst = vcvt_f16_f32(v_src);
-store_f16(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+v_float32x4 v_src = v_load(src + x);
+v_float16x4 v_dst = v_cvt_f16(v_src);
+v_store_f16(dst + x, v_dst);
}
#endif
}
......@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-__m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
-__m128 v_dst = _mm_cvtph_ps(v_src);
-_mm_storeu_ps(dst + x, v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-float16x4_t v_src = load_f16(src+x);
-float32x4_t v_dst = vcvt_f32_f16(v_src);
-vst1q_f32(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+v_float16x4 v_src = v_load_f16(src + x);
+v_float32x4 v_dst = v_cvt_f32(v_src);
+v_store(dst + x, v_dst);
}
#endif
}
......
#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
#include <climits>
......@@ -710,6 +711,49 @@ template<typename R> struct TheTest
return *this;
}
#if CV_FP16
TheTest & test_loadstore_fp16()
{
AlignedData<R> data;
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.u;
R r2 = v_load_f16(data.a.d);
R r3(r2);
EXPECT_EQ(data.u[0], r1.get0());
EXPECT_EQ(data.a[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
// check some store methods
out.a.clear();
v_store_f16(out.a.d, r1);
EXPECT_EQ(data.a, out.a);
return *this;
}
TheTest & test_float_cvt_fp16()
{
AlignedData<v_float32x4> data;
// check conversion
v_float32x4 r1 = v_load(data.a.d);
v_float16x4 r2 = v_cvt_f16(r1);
v_float32x4 r3 = v_cvt_f32(r2);
EXPECT_EQ(0x3c00, r2.get0()); // 0x3c00 is 1.0 in FP16
EXPECT_EQ(r3.get0(), r1.get0());
return *this;
}
#endif
};
......@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
}
#endif
#if CV_FP16
TEST(hal_intrin, float16x4) {
TheTest<v_float16x4>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
}
#endif
};
};