add feature to convert FP32(float) to FP16(half)

* check compiler support * check HW support before executing * add test doing round trip conversion from / to FP32 * treat array correctly if size is not multiple of 4 * add declaration to prevent warning * make it possible to enable fp16 on 32bit ARM * let the conversion possible on non-supported HW, too. * add test using both HW and SW implementation

add feature to convert FP32(float) to FP16(half)
* check compiler support * check HW support before executing * add test doing round trip conversion from / to FP32 * treat array correctly if size is not multiple of 4 * add declaration to prevent warning * make it possible to enable fp16 on 32bit ARM * let the conversion possible on non-supported HW, too. * add test using both HW and SW implementation
b2ad7cd9 · Tomoaki Teshima · c3d1f94e · b2ad7cd9 · b2ad7cd9 · b2ad7cd9
8 changed file
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -146,8 +146,11 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  elseif(X86 OR X86_64)
    add_extra_compiler_option(-mno-sse2)
  endif()
+  if(ARM)
+    add_extra_compiler_option("-mfp16-format=ieee")
+  endif(ARM)
  if(ENABLE_NEON)
-    add_extra_compiler_option("-mfpu=neon")
+    add_extra_compiler_option("-mfpu=neon-fp16")
  endif()
  if(ENABLE_VFPV3 AND NOT ENABLE_NEON)
    add_extra_compiler_option("-mfpu=vfpv3")
@@ -167,6 +170,9 @@ if(CMAKE_COMPILER_IS_GNUCXX)
        add_extra_compiler_option(-mfma)
      endif()
    endif()
+    if((X86 OR X86_64) AND NOT MSVC)
+        add_extra_compiler_option(-mf16c)
+    endif((X86 OR X86_64) AND NOT MSVC)
    # GCC depresses SSEx instructions when -mavx is used. Instead, it generates new AVX instructions or AVX equivalence for all SSEx instructions when needed.
    if(NOT OPENCV_EXTRA_CXX_FLAGS MATCHES "-mavx")

--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -524,6 +524,17 @@ For example:
 CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
                                  double alpha = 1, double beta = 0);
+/** @brief Converts an array to half precision floating number.
+convertFp16 converts FP32 to FP16 or FP16 to FP32.  The input array has to have type of CV_32F or
+CV_16S to represent the bit depth.  If the input array is neither of them, it'll do nothing.
+@param src input array.
+@param dst output array.
+@param useHW if possible use HW SIMD instruction to convert
+*/
+CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst, bool useHW = true);
 /** @brief Performs a look-up table transform of an array.
 The function LUT fills the output array with values from the look-up table. Indices of the entries

--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -112,7 +112,7 @@
 #define CV_CPU_SSE4_1           6
 #define CV_CPU_SSE4_2           7
 #define CV_CPU_POPCNT           8
+#define CV_CPU_FP16             9
 #define CV_CPU_AVX              10
 #define CV_CPU_AVX2             11
 #define CV_CPU_FMA3             12
@@ -143,7 +143,7 @@ enum CpuFeatures {
    CPU_SSE4_1          = 6,
    CPU_SSE4_2          = 7,
    CPU_POPCNT          = 8,
+    CPU_FP16            = 9,
    CPU_AVX             = 10,
    CPU_AVX2            = 11,
    CPU_FMA3            = 12,
@@ -193,6 +193,10 @@ enum CpuFeatures {
 #    endif
 #    define CV_POPCNT 1
 #  endif
+#  if defined __F16C__ || (defined _MSC_VER && _MSC_VER >= 1700)
+#    include <immintrin.h>
+#    define CV_FP16 1
+#  endif
 #  if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600 && 0)
 // MS Visual Studio 2010 (2012?) has no macro pre-defined to identify the use of /arch:AVX
 // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32
@@ -223,6 +227,10 @@ enum CpuFeatures {
 #  define CV_NEON 1
 #endif
+#if defined __GNUC__ && ((defined (__arm__) && (__ARM_FP & 0x2)) || defined(__aarch64__))
+#    define CV_FP16 1
+#endif
 #if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
 #  define CV_VFP 1
 #endif
@@ -253,6 +261,9 @@ enum CpuFeatures {
 #ifndef CV_SSE4_2
 #  define CV_SSE4_2 0
 #endif
+#ifndef CV_FP16
+#  define CV_FP16 0
+#endif
 #ifndef CV_AVX
 #  define CV_AVX 0
 #endif

--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -4356,6 +4356,283 @@ struct Cvt_SIMD<float, int>
 #endif
+#if !(defined (__arm__) || defined (__aarch64__))
+// const numbers for floating points format
+const unsigned int kShiftSignificand    = 13;
+const unsigned int kMaskFp16Significand = 0x3ff;
+const unsigned int kBiasFp16Exponent    = 15;
+const unsigned int kBiasFp32Exponent    = 127;
+union fp32Int32
+{
+    int i;
+    float f;
+    struct _fp32Format
+    {
+        unsigned int significand : 23;
+        unsigned int exponent    : 8;
+        unsigned int sign        : 1;
+    } fmt;
+};
+#endif
+union fp16Int16
+{
+    short i;
+#if defined (__arm__) || defined (__aarch64__)
+    __fp16 h;
+#endif
+    struct _fp16Format
+    {
+        unsigned int significand : 10;
+        unsigned int exponent    : 5;
+        unsigned int sign        : 1;
+    } fmt;
+};
+#if defined (__arm__) || defined (__aarch64__)
+static float convertFp16SW(short fp16)
+{
+    // Fp16 -> Fp32
+    fp16Int16 a;
+    a.i = fp16;
+    return (float)a.h;
+}
+#else
+static float convertFp16SW(short fp16)
+{
+    // Fp16 -> Fp32
+    fp16Int16 b;
+    b.i = fp16;
+    int exponent    = b.fmt.exponent - kBiasFp16Exponent;
+    int significand = b.fmt.significand;
+    fp32Int32 a;
+    a.i = 0;
+    a.fmt.sign = b.fmt.sign; // sign bit
+    if( exponent == 16 )
+    {
+        // Inf or NaN
+        a.i = a.i | 0x7F800000;
+        if( significand != 0 )
+        {
+            // NaN
+#if defined(__x86_64__) || defined(_M_X64)
+            // 64bit
+            a.i = a.i | 0x7FC00000;
+#endif
+            a.fmt.significand = a.fmt.significand | (significand << kShiftSignificand);
+        }
+        return a.f;
+    }
+    else if ( exponent == -15 )
+    {
+        // subnormal in Fp16
+        if( significand == 0 )
+        {
+            // zero
+            return a.f;
+        }
+        else
+        {
+            int shift = -1;
+            while( ( significand & 0x400 ) == 0 )
+            {
+                significand = significand << 1;
+                shift++;
+            }
+            significand = significand & kMaskFp16Significand;
+            exponent -= shift;
+        }
+    }
+    a.fmt.exponent = (exponent+kBiasFp32Exponent);
+    a.fmt.significand = significand << kShiftSignificand;
+    return a.f;
+}
+#endif
+#if defined (__arm__) || defined (__aarch64__)
+static short convertFp16SW(float fp32)
+{
+    // Fp32 -> Fp16
+    fp16Int16 a;
+    a.h = (__fp16)fp32;
+    return a.i;
+}
+#else
+static short convertFp16SW(float fp32)
+{
+    // Fp32 -> Fp16
+    fp32Int32 a;
+    a.f = fp32;
+    int exponent    = a.fmt.exponent - kBiasFp32Exponent;
+    int significand = a.fmt.significand;
+    fp16Int16 result;
+    result.i = 0;
+    if( 0x477ff000 <= ( a.i & 0x7fffffff ) )
+    {
+        // Inf in Fp16
+        result.i = result.i | 0x7C00;
+        if( exponent == 128 && significand != 0 )
+        {
+            // NaN
+            result.i = (short)(result.i | 0x200 | (significand >> kShiftSignificand));
+        }
+    }
+    else if ( ( a.i & 0x7fffffff ) <= 0x387fe000 )
+    {
+        // subnormal in Fp16
+        int fp16Significand = significand | 0x800000;
+        int bitShift = (-exponent) - 1;
+        fp16Significand = fp16Significand >> bitShift;
+        // special cases to round up
+        int threshold = 0x8000 + ( ( fp16Significand & 1 ) ? 0 : 1 );
+        if( threshold <= ( significand & 0xffff ) )
+        {
+            fp16Significand++;
+        }
+        result.i = (short)fp16Significand;
+    }
+    else
+    {
+        // usual situation
+        // exponent
+        result.fmt.exponent = (exponent + kBiasFp16Exponent);
+        // significand;
+        short fp16Significand = (short)(significand >> kShiftSignificand);
+        result.fmt.significand = fp16Significand;
+        // special cases to round up
+        short lsb10bitsFp32 = (significand & 0x1fff);
+        short threshold = 0x1000 + ( ( fp16Significand & 0x1 ) ? 0 : 1 );
+        if( threshold <= lsb10bitsFp32 )
+        {
+            result.i++;
+        }
+        else if ( fp16Significand == 0x3ff && exponent == -15)
+        {
+            result.i++;
+        }
+    }
+    // sign bit
+    result.fmt.sign = a.fmt.sign;
+    return result.i;
+}
+#endif
+template<typename T, typename DT> static void
+cvtScaleHalfSW_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        for ( int x = 0 ; x < size.width; x ++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+// template for FP16 HW conversion function
+template<typename T, typename DT> static void
+cvtScaleHalfHW_( const T* src, size_t sstep, DT* dst, size_t dstep, Size size)
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        for ( ; x < size.width; x++ )
+        {
+        }
+    }
+}
+template<> void
+cvtScaleHalfHW_<float, short>( const float* src, size_t sstep, short* dst, size_t dstep, Size size)
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
+        {
+#if CV_FP16
+            for ( ; x <= size.width - 4; x += 4)
+            {
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+                __m128 v_src = _mm_load_ps(src + x);
+                __m128i v_dst = _mm_cvtps_ph(v_src, 0);
+                _mm_storel_epi64((__m128i *)(dst + x), v_dst);
+#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+                float32x4_t v_src = *(float32x4_t*)(src + x);
+                float16x4_t v_dst = vcvt_f16_f32(v_src);
+                *(float16x4_t*)(dst + x) = v_dst;
+#endif
+            }
+#endif
+        }
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
+template<> void
+cvtScaleHalfHW_<short, float>( const short* src, size_t sstep, float* dst, size_t dstep, Size size)
+{
+    sstep /= sizeof(src[0]);
+    dstep /= sizeof(dst[0]);
+    for( ; size.height--; src += sstep, dst += dstep )
+    {
+        int x = 0;
+        if ( ( (intptr_t)dst & 0xf ) == 0 && ( (intptr_t)src & 0xf ) == 0 )
+        {
+#if CV_FP16
+            for ( ; x <= size.width - 4; x += 4)
+            {
+#if defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86) || defined(i386)
+                __m128i v_src = _mm_loadl_epi64((__m128i*)(src+x));
+                __m128 v_dst = _mm_cvtph_ps(v_src);
+                _mm_store_ps((dst + x), v_dst);
+#elif defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+                float16x4_t v_src = *(float16x4_t*)(src + x);
+                float32x4_t v_dst = vcvt_f32_f16(v_src);
+                *(float32x4_t*)(dst + x) = v_dst;
+#endif
+            }
+#endif
+        }
+        for ( ; x < size.width; x++ )
+        {
+            dst[x] = convertFp16SW(src[x]);
+        }
+    }
+}
 template<typename T, typename DT> static void
 cvt_( const T* src, size_t sstep,
      DT* dst, size_t dstep, Size size )
@@ -4443,6 +4720,13 @@ static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, s
    tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
 }
+#define DEF_CVT_SCALE_FP16_FUNC(suffix, stype, dtype, resource) \
+static void cvtScaleHalf##suffix##resource( const stype* src, size_t sstep, const uchar*, size_t, \
+dtype* dst, size_t dstep, Size size, double*) \
+{ \
+    cvtScaleHalf##resource##_<stype,dtype>(src, sstep, dst, dstep, size); \
+}
 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
 dtype* dst, size_t dstep, Size size, double* scale) \
@@ -4499,6 +4783,11 @@ DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
+DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short, SW)
+DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float, SW)
+DEF_CVT_SCALE_FP16_FUNC(32f16f, float, short, HW)
+DEF_CVT_SCALE_FP16_FUNC(16f32f, short, float, HW)
 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
@@ -4620,6 +4909,30 @@ static BinaryFunc getCvtScaleAbsFunc(int depth)
    return cvtScaleAbsTab[depth];
 }
+BinaryFunc getConvertFuncFp16(int ddepth, bool useHW)
+{
+    static BinaryFunc cvtTabHW[] =
+    {
+        0, 0, 0,
+        (BinaryFunc)(cvtScaleHalf32f16fHW), 0, (BinaryFunc)(cvtScaleHalf16f32fHW),
+        0, 0,
+    };
+    static BinaryFunc cvtTabSW[] =
+    {
+        0, 0, 0,
+        (BinaryFunc)(cvtScaleHalf32f16fSW), 0, (BinaryFunc)(cvtScaleHalf16f32fSW),
+        0, 0,
+    };
+    if( useHW == true)
+    {
+        return cvtTabHW[CV_MAT_DEPTH(ddepth)];
+    }
+    else
+    {
+        return cvtTabSW[CV_MAT_DEPTH(ddepth)];
+    }
+}
 BinaryFunc getConvertFunc(int sdepth, int ddepth)
 {
    static BinaryFunc cvtTab[][8] =
@@ -4804,6 +5117,52 @@ void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, doubl
    }
 }
+void cv::convertFp16( InputArray _src, OutputArray _dst, bool useHW )
+{
+    if ( checkHardwareSupport(CV_CPU_FP16) == false)
+    {
+        useHW = false;
+    }
+    Mat src = _src.getMat();
+    int ddepth = 0;
+    switch( src.depth() )
+    {
+    case CV_32F:
+        ddepth = CV_16S;
+        break;
+    case CV_16S:
+        ddepth = CV_32F;
+        break;
+    default:
+        return;
+    }
+    int type = CV_MAKETYPE(ddepth, src.channels());
+    _dst.create( src.dims, src.size, type );
+    Mat dst = _dst.getMat();
+    BinaryFunc func = getConvertFuncFp16(ddepth, useHW);
+    int cn = src.channels();
+    CV_Assert( func != 0 );
+    if( src.dims <= 2 )
+    {
+        Size sz = getContinuousSize(src, dst, cn);
+        func( src.data, src.step, 0, 0, dst.data, dst.step, sz, 0);
+    }
+    else
+    {
+        const Mat* arrays[] = {&src, &dst, 0};
+        uchar* ptrs[2];
+        NAryMatIterator it(arrays, ptrs);
+        Size sz((int)(it.size*cn), 1);
+        for( size_t i = 0; i < it.nplanes; i++, ++it )
+            func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, 0);
+    }
+}
 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
 {
    bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;

--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -135,6 +135,7 @@ typedef void (*BinaryFuncC)(const uchar* src1, size_t step1,
                       uchar* dst, size_t step, int width, int height,
                       void*);
+BinaryFunc getConvertFuncFp16(int ddepth, bool useHW);
 BinaryFunc getConvertFunc(int sdepth, int ddepth);
 BinaryFunc getCopyMaskFunc(size_t esz);

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -291,6 +291,7 @@ struct HWFeatures
            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
            f.have[CV_CPU_AVX]    = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
+            f.have[CV_CPU_FP16]   = (cpuid_data[2] & (1<<29)) != 0;
            // make the second call to the cpuid command in order to get
            // information about extended features like AVX2
@@ -338,7 +339,8 @@ struct HWFeatures
    #if defined ANDROID || defined __linux__
    #ifdef __aarch64__
        f.have[CV_CPU_NEON] = true;
-    #else
+        f.have[CV_CPU_FP16] = true;
+    #elif defined __arm__
        int cpufile = open("/proc/self/auxv", O_RDONLY);
        if (cpufile >= 0)
@@ -351,6 +353,7 @@ struct HWFeatures
                if (auxv.a_type == AT_HWCAP)
                {
                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
+                    f.have[CV_CPU_FP16] = (auxv.a_un.a_val & 2) != 0;
                    break;
                }
            }
@@ -358,8 +361,13 @@ struct HWFeatures
            close(cpufile);
        }
    #endif
-    #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
+    #elif (defined __clang__ || defined __APPLE__)
+    #if (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
        f.have[CV_CPU_NEON] = true;
+    #endif
+    #if (defined __ARM_FP  && (((__ARM_FP & 0x2) != 0) && defined __ARM_NEON__))
+        f.have[CV_CPU_FP16] = true;
+    #endif
    #endif
        return f;

--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -737,6 +737,60 @@ struct ConvertScaleOp : public BaseElemWiseOp
    int ddepth;
 };
+struct ConvertScaleFp16Op : public BaseElemWiseOp
+{
+    ConvertScaleFp16Op() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)), nextRange(0) { }
+    void op(const vector<Mat>& src, Mat& dst, const Mat&)
+    {
+        convertFp16(src[0], dst, true);
+    }
+    void refop(const vector<Mat>& src, Mat& dst, const Mat&)
+    {
+        convertFp16(src[0], dst, false);
+    }
+    int getRandomType(RNG&)
+    {
+        // 0: FP32 -> FP16
+        // 1: FP16 -> FP32
+        int srctype = (nextRange & 1) == 0 ? CV_32F : CV_16S;
+        return srctype;
+    }
+    void getValueRange(int, double& minval, double& maxval)
+    {
+        // 0: FP32 -> FP16
+        // 1: FP16 -> FP32
+        if( (nextRange & 1) == 0 )
+        {
+            // largest integer number that fp16 can express
+            maxval = 65504.f;
+            minval = -maxval;
+        }
+        else
+        {
+            // 0: positive number range
+            // 1: negative number range
+            if( (nextRange & 2) == 0 )
+            {
+                minval = 0;      // 0x0000 +0
+                maxval = 31744;  // 0x7C00 +Inf
+            }
+            else
+            {
+                minval = -32768; // 0x8000 -0
+                maxval = -1024;  // 0xFC00 -Inf
+            }
+        }
+    }
+    double getMaxErr(int)
+    {
+        return 0.5f;
+    }
+    void generateScalars(int, RNG& rng)
+    {
+        nextRange = rng.next();
+    }
+    int nextRange;
+};
 struct ConvertScaleAbsOp : public BaseElemWiseOp
 {
@@ -1371,6 +1425,7 @@ INSTANTIATE_TEST_CASE_P(Core_Copy, ElemWiseTest, ::testing::Values(ElemWiseOpPtr
 INSTANTIATE_TEST_CASE_P(Core_Set, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetOp)));
 INSTANTIATE_TEST_CASE_P(Core_SetZero, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::SetZeroOp)));
 INSTANTIATE_TEST_CASE_P(Core_ConvertScale, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleOp)));
+INSTANTIATE_TEST_CASE_P(Core_ConvertScaleFp16, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleFp16Op)));
 INSTANTIATE_TEST_CASE_P(Core_ConvertScaleAbs, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::ConvertScaleAbsOp)));
 INSTANTIATE_TEST_CASE_P(Core_Add, ElemWiseTest, ::testing::Values(ElemWiseOpPtr(new cvtest::AddOp)));

--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -3064,6 +3064,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_NEON
    if (checkHardwareSupport(CV_CPU_NEON)) cpu_features += " neon";
 #endif
+#if CV_FP16
+    if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
+#endif
    cpu_features.erase(0, 1); // erase initial space