From f41359688b5d577f086cf6141c98fe2f384ed9f9 Mon Sep 17 00:00:00 2001 From: Sayed Adel Date: Wed, 20 Mar 2019 07:51:32 +0200 Subject: [PATCH] core:vsx Add support for VSX3 half precision conversions --- cmake/OpenCVCompilerOptions.cmake | 9 ++- cmake/checks/cpu_vsx_asm.cpp | 21 +++++++ modules/core/CMakeLists.txt | 2 +- .../include/opencv2/core/hal/intrin_vsx.hpp | 63 +++++++++++++++---- .../core/include/opencv2/core/vsx_utils.hpp | 2 + 5 files changed, 82 insertions(+), 15 deletions(-) create mode 100644 cmake/checks/cpu_vsx_asm.cpp diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 67d9d028e3..ee67599053 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -294,11 +294,18 @@ endif() # workaround gcc bug for aligned ld/st # https://github.com/opencv/opencv/issues/13211 if((PPC64LE AND NOT CMAKE_CROSSCOMPILING) OR OPENCV_FORCE_COMPILER_CHECK_VSX_ALIGNED) - ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" "OPENCV_CHECK_VSX_ALIGNED" "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp") + ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ALIGNED "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp") if(NOT OPENCV_CHECK_VSX_ALIGNED) add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ALIGNED) endif() endif() +# validate inline asm with fixes register number and constraints wa, wd, wf +if(PPC64LE) + ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ASM "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx_asm.cpp") + if(NOT OPENCV_CHECK_VSX_ASM) + add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ASM) + endif() +endif() # combine all "extra" options if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS) diff --git a/cmake/checks/cpu_vsx_asm.cpp b/cmake/checks/cpu_vsx_asm.cpp new file mode 100644 index 0000000000..bb4c25507e --- /dev/null +++ b/cmake/checks/cpu_vsx_asm.cpp @@ -0,0 +1,21 @@ +#if defined(__VSX__) + #if defined(__PPC64__) && 
defined(__LITTLE_ENDIAN__) + #include + #else + #error "OpenCV only supports little-endian mode" + #endif +#else + #error "VSX is not supported" +#endif + +/* + * xlc and wide versions of clang don't support %x in the inline asm template which fixes register number + * when using any of the register constraints wa, wd, wf +*/ +int main() +{ + __vector float vf; + __vector signed int vi; + __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi)); + return 0; +} \ No newline at end of file diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 75be36fff8..285326a963 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -3,7 +3,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) ocv_add_dispatched_file(stat SSE4_2 AVX2) ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) -ocv_add_dispatched_file(convert SSE2 AVX2) +ocv_add_dispatched_file(convert SSE2 AVX2 VSX3) ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) ocv_add_dispatched_file(matmul SSE2 AVX2) diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index e4cd47a703..c5ceb11324 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -11,11 +11,6 @@ #define CV_SIMD128 1 #define CV_SIMD128_64F 1 -/** - * todo: supporting half precision for power9 - * convert instractions xvcvhpsp, xvcvsphp -**/ - namespace cv { @@ -1203,20 +1198,62 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) /////// FP16 support //////// -// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt) inline v_float32x4 v_load_expand(const float16_t* ptr) { - return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]); + vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr); +#if CV_VSX3 
&& defined(vec_extract_fp_from_shorth) + return v_float32x4(vec_extract_fp_from_shorth(vf16)); +#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM) + vec_float4 vf32; + __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16))); + return v_float32x4(vf32); +#else + const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000); + const vec_int4 signmask = vec_int4_sp(0x80000000); + const vec_int4 maxexp = vec_int4_sp(0x7c000000); + const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000)); + + vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16))); + vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask); + vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta + vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf)); + + t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e))); + vec_bint4 zmask = vec_cmpeq(e, z); + vec_int4 ft = vec_sel(t, zt, zmask); + return v_float32x4(vec_float4_c(vec_or(ft, sign))); +#endif } inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { - float CV_DECL_ALIGNED(32) f[4]; - v_store_aligned(f, v); - ptr[0] = float16_t(f[0]); - ptr[1] = float16_t(f[1]); - ptr[2] = float16_t(f[2]); - ptr[3] = float16_t(f[3]); +// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"? 
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM) + vec_ushort8 vf16; + __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val)); + vec_st_l8(vec_mergesqe(vf16, vf16), ptr); +#else + const vec_int4 signmask = vec_int4_sp(0x80000000); + const vec_int4 rval = vec_int4_sp(0x3f000000); + + vec_int4 t = vec_int4_c(v.val); + vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16)); + t = vec_and(vec_nor(signmask, signmask), t); + + vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t); + vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000)); + vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan); + vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t); + vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval))); + tt = vec_sub(tt, rval); + vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1)); + vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff)); + nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13)); + t = vec_sel(nt, tt, tinymask); + t = vec_sel(naninf, t, finitemask); + t = vec_or(t, sign); + vec_st_l8(vec_packs(t, t), ptr); +#endif } inline void v_cleanup() {} diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp index da5b25625c..6e2baeacfc 100644 --- a/modules/core/include/opencv2/core/vsx_utils.hpp +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -291,6 +291,8 @@ VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo) * * So we're not able to use inline asm and only use built-in functions that CLANG supports * and use __builtin_convertvector if clang missng any of vector conversions built-in functions + * + * todo: clang asm template bug is fixed, need to reconsider the current workarounds. */ // convert vector helper -- GitLab