Commit f4135968 authored by Sayed Adel

core:vsx Add support for VSX3 half precision conversions

Parent 6c862fae
@@ -294,11 +294,18 @@ endif()
# workaround gcc bug for aligned ld/st
# https://github.com/opencv/opencv/issues/13211
if((PPC64LE AND NOT CMAKE_CROSSCOMPILING) OR OPENCV_FORCE_COMPILER_CHECK_VSX_ALIGNED)
  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" "OPENCV_CHECK_VSX_ALIGNED" "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ALIGNED "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
  if(NOT OPENCV_CHECK_VSX_ALIGNED)
    add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ALIGNED)
  endif()
endif()
# validate inline asm with fixed register numbers and the constraints wa, wd, wf
if(PPC64LE)
  ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ASM "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx_asm.cpp")
  if(NOT OPENCV_CHECK_VSX_ASM)
    add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ASM)
  endif()
endif()
# combine all "extra" options
if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS)
cmake/checks/cpu_vsx_asm.cpp (new file):
#if defined(__VSX__)
    #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
        #include <altivec.h>
    #else
        #error "OpenCV only supports little-endian mode"
    #endif
#else
    #error "VSX is not supported"
#endif
/*
* xlc and a wide range of clang versions don't support %x<n> in the inline asm template, which fixes
* the register number when using any of the register constraints wa, wd, wf
*/
int main()
{
    __vector float vf;
    __vector signed int vi;
    __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
    return 0;
}
\ No newline at end of file
@@ -3,7 +3,7 @@ set(the_description "The Core Functionality")
ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
ocv_add_dispatched_file(stat SSE4_2 AVX2)
ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file(convert SSE2 AVX2)
ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
ocv_add_dispatched_file(convert_scale SSE2 AVX2)
ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
ocv_add_dispatched_file(matmul SSE2 AVX2)
@@ -11,11 +11,6 @@
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
/**
* todo: supporting half precision for power9
* convert instructions xvcvhpsp, xvcvsphp
**/
namespace cv
{
@@ -1203,20 +1198,62 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
/////// FP16 support ////////
// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adapt)
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
    return v_float32x4(vec_extract_fp_from_shorth(vf16));
#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
    vec_float4 vf32;
    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
    return v_float32x4(vf32);
#else
    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
    const vec_int4 signmask = vec_int4_sp(0x80000000);
    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));

    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));

    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
    vec_bint4 zmask = vec_cmpeq(e, z);
    vec_int4 ft = vec_sel(t, zt, zmask);
    return v_float32x4(vec_float4_c(vec_or(ft, sign)));
#endif
}
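The #else fallback above is the standard bit-manipulation FP16-to-FP32 expansion, the same trick the SSE backend uses. As an aid to reading the vector code, here is a minimal scalar sketch of the same steps, assuming IEEE 754 binary16 input and binary32 output; the helper name half_bits_to_float is hypothetical and the snippet is not part of this patch:

    #include <cstdint>
    #include <cstring>

    // Widen h into the high half-word, rebias the exponent by +112, then patch up
    // the Inf/NaN and zero/subnormal cases, mirroring the vector fallback above.
    static inline float half_bits_to_float(uint16_t h)
    {
        uint32_t bits = (uint32_t)h << 16;                  // like vec_mergeh(z, vf16)
        uint32_t sign = bits & 0x80000000u;
        uint32_t e    = bits & 0x7c000000u;                 // half exponent field
        uint32_t t    = ((bits ^ sign) >> 3) + 0x38000000u; // ((h & 0x7fff) << 13) + delta
        if (e == 0x7c000000u)                               // Inf/NaN: push exponent up to 0xff
            t += 0x38000000u;
        if (e == 0)                                         // zero/subnormal: renormalize via an FP add
        {
            float f, magic;
            uint32_t ti = t + (1u << 23), mi = 0x38800000u;
            std::memcpy(&f, &ti, 4);
            std::memcpy(&magic, &mi, 4);
            f -= magic;                                     // leaves m * 2^-24
            std::memcpy(&t, &f, 4);
        }
        uint32_t r = t | sign;
        float out;
        std::memcpy(&out, &r, 4);
        return out;
    }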
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float CV_DECL_ALIGNED(32) f[4];
    v_store_aligned(f, v);
    ptr[0] = float16_t(f[0]);
    ptr[1] = float16_t(f[1]);
    ptr[2] = float16_t(f[2]);
    ptr[3] = float16_t(f[3]);
// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
    vec_ushort8 vf16;
    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
#else
    const vec_int4 signmask = vec_int4_sp(0x80000000);
    const vec_int4 rval = vec_int4_sp(0x3f000000);

    vec_int4 t = vec_int4_c(v.val);
    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
    t = vec_and(vec_nor(signmask, signmask), t);

    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
    tt = vec_sub(tt, rval);
    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
    t = vec_sel(nt, tt, tinymask);
    t = vec_sel(naninf, t, finitemask);
    t = vec_or(t, sign);
    vec_st_l8(vec_packs(t, t), ptr);
#endif
}
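Likewise, the #else branch of v_pack_store corresponds to the scalar round-to-nearest-even FP32-to-FP16 narrowing sketched below, under the same IEEE 754 assumptions; float_bits_to_half is a hypothetical name, not an OpenCV API, and the snippet is not part of this patch:

    #include <cstdint>
    #include <cstring>

    // Strip the sign, classify the magnitude, then either saturate to Inf/NaN,
    // renormalize a tiny value via an FP add, or rebias and round with integer math.
    static inline uint16_t float_bits_to_half(float x)
    {
        uint32_t bits;
        std::memcpy(&bits, &x, 4);
        uint32_t sign = (bits & 0x80000000u) >> 16;      // half sign bit
        uint32_t t    = bits & 0x7fffffffu;              // |x| as bits
        uint32_t h;
        if (t >= 0x47800000u)                            // too large for half
            h = (t > 0x7f800000u) ? 0x7e00u : 0x7c00u;   // NaN : Inf
        else if (t < 0x38800000u)                        // becomes a half subnormal
        {
            float f;
            std::memcpy(&f, &t, 4);
            f += 0.5f;                                   // rval = 0x3f000000
            uint32_t ti;
            std::memcpy(&ti, &f, 4);
            h = ti - 0x3f000000u;                        // mantissa = round(|x| * 2^24)
        }
        else                                             // normal case
        {
            uint32_t odd = (t >> 13) & 1;                // ties-to-even bit
            h = (t + 0xc8000fffu + odd) >> 13;           // rebias exponent by -112 and round
        }
        return (uint16_t)(h | sign);
    }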
inline void v_cleanup() {}
@@ -291,6 +291,8 @@ VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
*
* So we're not able to use inline asm; we only use built-in functions that CLANG supports,
* and we use __builtin_convertvector if clang is missing any of the vector conversion built-in functions.
*
* todo: the clang asm template bug has been fixed, so the current workarounds need to be reconsidered.
*/
// convert vector helper
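The comment in this hunk refers to clang's __builtin_convertvector, which converts a vector value element-wise to another vector type with the same number of elements. A minimal stand-alone sketch using generic GCC/clang vector extensions; the typedefs and the function name are made up for illustration and are independent of the vsx_utils typedefs:

    // Element-wise int32x4 -> float32x4 conversion, roughly what the VSX xvcvsxwsp instruction does.
    typedef int   v_int4 __attribute__((vector_size(16)));
    typedef float v_flt4 __attribute__((vector_size(16)));

    static inline v_flt4 cvt_int4_to_flt4(v_int4 a)
    {
        return __builtin_convertvector(a, v_flt4);
    }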