From d9fe9db943d4e855a75424978d7ab87fd54edd98 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Sat, 17 Mar 2018 02:14:53 -0400 Subject: [PATCH] hardfloat: implement float32/64 comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance results for fp-bench: Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz - before: cmp-single: 110.98 MFlops cmp-double: 107.12 MFlops - after: cmp-single: 506.28 MFlops cmp-double: 524.77 MFlops Note that flattening both eq and eq_signaling versions would give us extra performance (695 vs. 506 MFlops for single and 615 vs. 524 MFlops for double) but this would emit two essentially identical functions for each eq/signaling pair, which is a waste. Aggregate performance improvement for the last few patches: [ all charts in png: https://imgur.com/a/4yV8p ] 1. Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz qemu-aarch64 NBench score; higher is better Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz 16 +-+-----------+-------------+----===-------+---===-------+-----------+-+ 14 +-+..........................@@@&&.=.......@@@&&.=...................+-+ 12 +-+..........................@.@.&.=.......@.@.&.=.....+befor=== +-+ 10 +-+..........................@.@.&.=.......@.@.&.=.....+ad@@&& = +-+ 8 +-+.......................$$$%.@.&.=.......@.@.&.=.....+ @@u& = +-+ 6 +-+............@@@&&=+***##.$%.@.&.=***##$$%+@.&.=..###$$%%@i& = +-+ 4 +-+.......###$%%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=+**.#+$ +@m& = +-+ 2 +-+.....***.#$.%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=.**.#+$+sqr& = +-+ 0 +-+-----***##$%%@@&&=-***##$$%@@&&==***##$$%@@&&==-**##$$%+cmp==-----+-+ FOURIER NEURAL NELU DECOMPOSITION gmean qemu-aarch64 SPEC06fp (test set) speedup over QEMU 4c2c1015905 Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz error bars: 95% confidence interval 4.5 +-+---+-----+----+-----+-----+-&---+-----+----+-----+-----+-----+----+-----+-----+-----+-----+----+-----+---+-+ 4
+-+..........................+@@+...........................................................................+-+ 3.5 +-+..............%%@&.........@@..............%%@&............................................+++dsub +-+ 2.5 +-+....&&+.......%%@&.......+%%@..+%%&+..@@&+.%%@&....................................+%%&+.+%@&++%%@& +-+ 2 +-+..+%%&..+%@&+.%%@&...+++..%%@...%%&.+$$@&..%%@&..%%@&.......+%%&+.%%@&+......+%%@&.+%%&++$$@&++d%@& %%@&+-+ 1.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+f%@&**$%@&+-+ 0.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&+sqr@&**$%@&+-+ 0 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+cmp&**$%@&+-+ 410.bw416.gam433.434.z435.436.cac437.lesli444.447.de450.so453454.ca459.GemsF465.tont470.lb4482.sphinxgeomean 2. Host: ARM Aarch64 A57 @ 2.4GHz qemu-aarch64 NBench score; higher is better Host: Applied Micro X-Gene, Aarch64 A57 @ 2.4 GHz 5 +-+-----------+-------------+-------------+-------------+-----------+-+ 4.5 +-+........................................@@@&==...................+-+ 3 4 +-+..........................@@@&==........@.@&.=.....+before +-+ 3 +-+..........................@.@&.=........@.@&.=.....+ad@@@&== +-+ 2.5 +-+.....................##$$%%.@&.=........@.@&.=.....+ @m@& = +-+ 2 +-+............@@@&==.***#.$.%.@&.=.***#$$%%.@&.=.***#$$%%d@& = +-+ 1.5 +-+.....***#$$%%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$ +f@& = +-+ 0.5 +-+.....*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$+sqr& = +-+ 0 +-+-----***#$$%%@@&==-***#$$%%@@&==-***#$$%%@@&==-***#$$%+cmp==-----+-+ FOURIER NEURAL NLU DECOMPOSITION gmean Reviewed-by: Alex Bennée Signed-off-by: Emilio G. 
Cota Signed-off-by: Alex Bennée --- fpu/softfloat.c | 109 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 14 deletions(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index fbd66fd8dc..59eac97d10 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2903,28 +2903,109 @@ static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, } } -#define COMPARE(sz) \ -int float ## sz ## _compare(float ## sz a, float ## sz b, \ - float_status *s) \ +#define COMPARE(name, attr, sz) \ +static int attr \ +name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ { \ FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, false, s); \ -} \ -int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ - float_status *s) \ -{ \ - FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ - FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, true, s); \ + return compare_floats(pa, pb, is_quiet, s); \ } -COMPARE(16) -COMPARE(32) -COMPARE(64) +COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) +COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) +COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) #undef COMPARE +int float16_compare(float16 a, float16 b, float_status *s) +{ + return soft_f16_compare(a, b, false, s); +} + +int float16_compare_quiet(float16 a, float16 b, float_status *s) +{ + return soft_f16_compare(a, b, true, s); +} + +static int QEMU_FLATTEN +f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) +{ + union_float32 ua, ub; + + ua.s = xa; + ub.s = xb; + + if (QEMU_NO_HARDFLOAT) { + goto soft; + } + + float32_input_flush2(&ua.s, &ub.s, s); + if (isgreaterequal(ua.h, ub.h)) { + if (isgreater(ua.h, ub.h)) { + return float_relation_greater; + } + return float_relation_equal; + } + if (likely(isless(ua.h, ub.h))) { + return float_relation_less; + } + /* The only condition remaining 
is unordered. + * Fall through to set flags. + */ + soft: + return soft_f32_compare(ua.s, ub.s, is_quiet, s); +} + +int float32_compare(float32 a, float32 b, float_status *s) +{ + return f32_compare(a, b, false, s); +} + +int float32_compare_quiet(float32 a, float32 b, float_status *s) +{ + return f32_compare(a, b, true, s); +} + +static int QEMU_FLATTEN +f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) +{ + union_float64 ua, ub; + + ua.s = xa; + ub.s = xb; + + if (QEMU_NO_HARDFLOAT) { + goto soft; + } + + float64_input_flush2(&ua.s, &ub.s, s); + if (isgreaterequal(ua.h, ub.h)) { + if (isgreater(ua.h, ub.h)) { + return float_relation_greater; + } + return float_relation_equal; + } + if (likely(isless(ua.h, ub.h))) { + return float_relation_less; + } + /* The only condition remaining is unordered. + * Fall through to set flags. + */ + soft: + return soft_f64_compare(ua.s, ub.s, is_quiet, s); +} + +int float64_compare(float64 a, float64 b, float_status *s) +{ + return f64_compare(a, b, false, s); +} + +int float64_compare_quiet(float64 a, float64 b, float_status *s) +{ + return f64_compare(a, b, true, s); +} + /* Multiply A by 2 raised to the power N. */ static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) { -- GitLab