Commit 903789f7 authored by Tomoaki Teshima

use universal intrinsic for FP16

  * use v_float16x4 (universal intrinsic) instead of the raw SSE/NEON implementations (see the usage sketch below)
  * define v_load_f16/v_store_f16, since an FP16 v_load overload can't be distinguished when a short pointer is passed
  * clean up the implementation for old compilers (guard correctly)
  * add tests for v_load_f16 and for the round-trip conversion of v_float16x4
  * fix a conversion error
Parent 28db4a22
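For context, here is a minimal usage sketch of the new FP16 universal intrinsics, mirroring the cvtScaleHalf_ loops changed further down in this diff. The include path, the cv:: namespace qualification, and the roundtrip_fp16 helper are assumptions made for illustration only; they are not part of the commit.

// Sketch only: assumes a build where CV_FP16 is detected and the universal
// intrinsics are provided by this header in namespace cv.
#include "opencv2/core/hal/intrin.hpp"

#if CV_FP16
// Round-trip 4 floats through half precision using the new intrinsics.
static void roundtrip_fp16(const float* src, short* half_buf, float* dst)
{
    cv::v_float32x4 v_src  = cv::v_load(src);         // load 4 packed floats
    cv::v_float16x4 v_half = cv::v_cvt_f16(v_src);    // FP32 -> FP16
    cv::v_store_f16(half_buf, v_half);                 // store 4 FP16 values as short

    // v_load_f16 is a separate name because v_load(const short*) already
    // returns v_int16x8, so an FP16 load cannot be selected by overload.
    cv::v_float16x4 v_back = cv::v_load_f16(half_buf);
    cv::v_float32x4 v_dst  = cv::v_cvt_f32(v_back);    // FP16 -> FP32
    cv::v_store(dst, v_dst);                           // store 4 floats back
}
#endif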
......@@ -275,6 +275,39 @@ struct v_float64x2
};
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
template <typename T> static inline int16x4_t vreinterpret_s16_f16(T a)
{ return (int16x4_t)a; }
template <typename T> static inline float16x4_t vreinterpret_f16_s16(T a)
{ return (float16x4_t)a; }
template <typename T> static inline float16x4_t vld1_f16(const T* ptr)
{ return vreinterpret_f16_s16(vld1_s16((const short*)ptr)); }
template <typename T> static inline void vst1_f16(T* ptr, float16x4_t a)
{ vst1_s16((short*)ptr, vreinterpret_s16_f16(a)); }
static inline short vget_lane_f16(float16x4_t a, int b)
{ return vget_lane_s16(vreinterpret_s16_f16(a), b); }
struct v_float16x4
{
typedef short lane_type;
enum { nlanes = 4 };
v_float16x4() {}
explicit v_float16x4(float16x4_t v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
{
short v[] = {v0, v1, v2, v3};
val = vld1_f16(v);
}
short get0() const
{
return vget_lane_f16(val, 0);
}
float16x4_t val;
};
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
......@@ -734,6 +767,14 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if defined (HAVE_FP16)
// Workaround for old compilers
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(vld1_f16(ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ vst1_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
......@@ -1146,7 +1187,17 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#endif
#if defined (HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
return v_float32x4(vcvt_f32_f16(a.val));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
return v_float16x4(vcvt_f16_f32(a.val));
}
#endif
//! @endcond
......
......@@ -252,6 +252,26 @@ struct v_float64x2
__m128d val;
};
#if defined(HAVE_FP16)
struct v_float16x4
{
typedef short lane_type;
enum { nlanes = 4 };
v_float16x4() {}
explicit v_float16x4(__m128i v) : val(v) {}
v_float16x4(short v0, short v1, short v2, short v3)
{
val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
}
__m128i val;
};
#endif
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
......@@ -1021,6 +1041,13 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
#if defined(HAVE_FP16)
inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
inline void v_store_f16(short* ptr, v_float16x4& a)
{ _mm_storel_epi64((__m128i*)ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
......@@ -1626,6 +1653,18 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
}
#if defined(HAVE_FP16)
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
return v_float32x4(_mm_cvtph_ps(a.val));
}
inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif
//! @endcond
}
......
......@@ -4537,16 +4537,6 @@ static short convertFp16SW(float fp32)
}
#endif
-#if CV_FP16 && (defined __GNUC__) && (defined __arm__ || defined __aarch64__)
-#if 5 <= __GNUC__
-static inline float16x4_t load_f16(const short* p) { return vld1_f16((const float16_t*)p); }
-static inline void store_f16(short* p, float16x4_t v) { vst1_f16((float16_t*)p, v); }
-#else
-static inline float16x4_t load_f16(const short* p) { return (float16x4_t)vld1_s16(p); }
-static inline void store_f16(short* p, float16x4_t v) { vst1_s16(p, (int16x4_t)v); }
-#endif
-#endif
// template for FP16 HW conversion function
template<typename T, typename DT> static void
cvtScaleHalf_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size);
......@@ -4570,21 +4560,11 @@ cvtScaleHalf_<float, short>( const float* src, size_t sstep, short* dst, size_t
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-__m128 v_src = _mm_loadu_ps(src + x);
-__m128i v_dst = _mm_cvtps_ph(v_src, 0);
-_mm_storel_epi64((__m128i *)(dst + x), v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-float32x4_t v_src = vld1q_f32(src + x);
-float16x4_t v_dst = vcvt_f16_f32(v_src);
-store_f16(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+v_float32x4 v_src = v_load(src + x);
+v_float16x4 v_dst = v_cvt_f16(v_src);
+v_store_f16(dst + x, v_dst);
}
#endif
}
......@@ -4626,21 +4606,11 @@ cvtScaleHalf_<short, float>( const short* src, size_t sstep, float* dst, size_t
#if CV_FP16
for ( ; x <= size.width - 4; x += 4)
{
-#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
-__m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
-__m128 v_dst = _mm_cvtph_ps(v_src);
-_mm_storeu_ps(dst + x, v_dst);
-#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-float16x4_t v_src = load_f16(src+x);
-float32x4_t v_dst = vcvt_f32_f16(v_src);
-vst1q_f32(dst + x, v_dst);
-#else
-#error "Configuration error"
-#endif
+v_float16x4 v_src = v_load_f16(src + x);
+v_float32x4 v_dst = v_cvt_f32(v_src);
+v_store(dst + x, v_dst);
}
#endif
}
......
#include "test_precomp.hpp"
#include "test_intrin_utils.hpp"
#include <climits>
......@@ -710,6 +711,49 @@ template<typename R> struct TheTest
return *this;
}
#if CV_FP16
TheTest & test_loadstore_fp16()
{
AlignedData<R> data;
AlignedData<R> out;
// check if addresses are aligned and unaligned respectively
EXPECT_EQ((size_t)0, (size_t)&data.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&data.u.d % 16);
EXPECT_EQ((size_t)0, (size_t)&out.a.d % 16);
EXPECT_NE((size_t)0, (size_t)&out.u.d % 16);
// check some initialization methods
R r1 = data.u;
R r2 = v_load_f16(data.a.d);
R r3(r2);
EXPECT_EQ(data.u[0], r1.get0());
EXPECT_EQ(data.a[0], r2.get0());
EXPECT_EQ(data.a[0], r3.get0());
// check some store methods
out.a.clear();
v_store_f16(out.a.d, r1);
EXPECT_EQ(data.a, out.a);
return *this;
}
TheTest & test_float_cvt_fp16()
{
AlignedData<v_float32x4> data;
// check conversion
v_float32x4 r1 = v_load(data.a.d);
v_float16x4 r2 = v_cvt_f16(r1);
v_float32x4 r3 = v_cvt_f32(r2);
EXPECT_EQ(0x3c00, r2.get0()); // 0x3c00 is 1.0 in FP16
EXPECT_EQ(r3.get0(), r1.get0());
return *this;
}
#endif
};
......@@ -915,6 +959,15 @@ TEST(hal_intrin, float64x2) {
}
#endif
#if CV_FP16
TEST(hal_intrin, float16x4) {
TheTest<v_float16x4>()
.test_loadstore_fp16()
.test_float_cvt_fp16()
;
}
#endif
};
};