aarch64: add single instruction math functions

this should increase performance and reduce code size on aarch64. the compiled code was compared against code generated with __builtin_* in place of inline asm, using gcc-6.2.0. lrint is two instructions. c with inline asm is used because it is safer than a pure asm implementation; this prevents ll{rint,round} from being aliases of l{rint,round} (because the types don't match) and depends on gcc style inline asm support. ceil, floor, round, trunc can either raise inexact on finite non-integer inputs or not raise any exceptions. the new implementation does not raise exceptions while the generic c code does. on aarch64, the underflow exception is signaled before rounding (ieee 754 allows both before and after rounding, but it must be consistent); the generic fma c code signals it after rounding, so using a single instruction fixes a slight conformance issue too.

aarch64: add single instruction math functions
this should increase performance and reduce code size on aarch64. the compiled code was compared against code generated with __builtin_* in place of inline asm, using gcc-6.2.0. lrint is two instructions. c with inline asm is used because it is safer than a pure asm implementation; this prevents ll{rint,round} from being aliases of l{rint,round} (because the types don't match) and depends on gcc style inline asm support. ceil, floor, round, trunc can either raise inexact on finite non-integer inputs or not raise any exceptions. the new implementation does not raise exceptions while the generic c code does. on aarch64, the underflow exception is signaled before rounding (ieee 754 allows both before and after rounding, but it must be consistent); the generic fma c code signals it after rounding, so using a single instruction fixes a slight conformance issue too.
54807d47 · Szabolcs Nagy · Rich Felker · b6e1fe0d · 54807d47 · 54807d47
34 changed files
--- a/src/math/aarch64/ceil.c
+++ b/src/math/aarch64/ceil.c
+#include <math.h>
+/*
+ * ceil: round x to the nearest integral value that is not less than x.
+ *
+ * Implemented with the single A64 instruction FRINTP (round to integral,
+ * toward plus infinity).  The "w" constraint places x in a floating-point
+ * register and the %d modifier names its 64-bit (double) view.
+ * Per the accompanying commit message, this version raises no
+ * floating-point exceptions, unlike the generic C implementation.
+ */
+double ceil(double x)
+{
+	__asm__ ("frintp %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/ceilf.c
+++ b/src/math/aarch64/ceilf.c
+#include <math.h>
+/*
+ * ceilf: single-precision ceil; round x to the nearest integral value
+ * that is not less than x.
+ *
+ * Uses FRINTP (round to integral, toward plus infinity) on the 32-bit
+ * view of the floating-point register, selected by the %s modifier.
+ */
+float ceilf(float x)
+{
+	__asm__ ("frintp %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/fabs.c
+++ b/src/math/aarch64/fabs.c
+#include <math.h>
+/*
+ * fabs: return the absolute value of x.
+ *
+ * Implemented with the A64 FABS instruction on a double-precision
+ * register ("w" constraint, %d modifier).  This C file with inline asm
+ * takes the place of the pure-assembly fabs.s removed by the same change.
+ */
+double fabs(double x)
+{
+	__asm__ ("fabs %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/fabs.s
+++ b/src/math/aarch64/fabs.s
-.text
-.global fabs
-.type   fabs,%function
-fabs:
-	fabs d0, d0
-	ret
--- a/src/math/aarch64/fabsf.c
+++ b/src/math/aarch64/fabsf.c
+#include <math.h>
+/*
+ * fabsf: return the absolute value of the single-precision value x.
+ *
+ * Implemented with the A64 FABS instruction on the 32-bit register view
+ * (%s modifier).  This C file with inline asm takes the place of the
+ * pure-assembly fabsf.s removed by the same change.
+ */
+float fabsf(float x)
+{
+	__asm__ ("fabs %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/fabsf.s
+++ b/src/math/aarch64/fabsf.s
-.text
-.global fabsf
-.type   fabsf,%function
-fabsf:
-	fabs s0, s0
-	ret
--- a/src/math/aarch64/floor.c
+++ b/src/math/aarch64/floor.c
+#include <math.h>
+/*
+ * floor: round x to the nearest integral value that is not greater
+ * than x.
+ *
+ * Implemented with the single A64 instruction FRINTM (round to integral,
+ * toward minus infinity) on a double-precision register.
+ * Per the accompanying commit message, this version raises no
+ * floating-point exceptions, unlike the generic C implementation.
+ */
+double floor(double x)
+{
+	__asm__ ("frintm %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/floorf.c
+++ b/src/math/aarch64/floorf.c
+#include <math.h>
+/*
+ * floorf: single-precision floor; round x to the nearest integral value
+ * that is not greater than x.
+ *
+ * Uses FRINTM (round to integral, toward minus infinity) on the 32-bit
+ * register view selected by the %s modifier.
+ */
+float floorf(float x)
+{
+	__asm__ ("frintm %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/fma.c
+++ b/src/math/aarch64/fma.c
+#include <math.h>
+/*
+ * fma: fused multiply-add, returning x*y + z.
+ *
+ * FMADD Dd, Dn, Dm, Da computes Da + Dn*Dm, so the operand order
+ * (x, y, z) maps onto x*y + z.  The multiply and add are fused into one
+ * instruction rather than performed as two separately rounded steps.
+ * Per the accompanying commit message, aarch64 signals underflow before
+ * rounding, and using the instruction keeps fma consistent with that,
+ * whereas the generic C fma signals it after rounding.
+ */
+double fma(double x, double y, double z)
+{
+	__asm__ ("fmadd %d0, %d1, %d2, %d3" : "=w"(x) : "w"(x), "w"(y), "w"(z));
+	return x;
+}
--- a/src/math/aarch64/fmaf.c
+++ b/src/math/aarch64/fmaf.c
+#include <math.h>
+/*
+ * fmaf: single-precision fused multiply-add, returning x*y + z.
+ *
+ * FMADD Sd, Sn, Sm, Sa computes Sa + Sn*Sm, so the operand order
+ * (x, y, z) maps onto x*y + z, evaluated by one instruction on the
+ * 32-bit register views (%s modifier).
+ */
+float fmaf(float x, float y, float z)
+{
+	__asm__ ("fmadd %s0, %s1, %s2, %s3" : "=w"(x) : "w"(x), "w"(y), "w"(z));
+	return x;
+}
--- a/src/math/aarch64/fmax.c
+++ b/src/math/aarch64/fmax.c
+#include <math.h>
+/*
+ * fmax: return the larger of x and y.
+ *
+ * Uses FMAXNM ("maximum number") rather than FMAX: when exactly one
+ * operand is a quiet NaN, FMAXNM yields the other, numeric operand,
+ * which is the NaN treatment C specifies for fmax.
+ */
+double fmax(double x, double y)
+{
+	__asm__ ("fmaxnm %d0, %d1, %d2" : "=w"(x) : "w"(x), "w"(y));
+	return x;
+}
--- a/src/math/aarch64/fmaxf.c
+++ b/src/math/aarch64/fmaxf.c
+#include <math.h>
+/*
+ * fmaxf: return the larger of the single-precision values x and y.
+ *
+ * Uses FMAXNM ("maximum number") on the 32-bit register views: when
+ * exactly one operand is a quiet NaN, the other, numeric operand is
+ * returned.
+ */
+float fmaxf(float x, float y)
+{
+	__asm__ ("fmaxnm %s0, %s1, %s2" : "=w"(x) : "w"(x), "w"(y));
+	return x;
+}
--- a/src/math/aarch64/fmin.c
+++ b/src/math/aarch64/fmin.c
+#include <math.h>
+/*
+ * fmin: return the smaller of x and y.
+ *
+ * Uses FMINNM ("minimum number") rather than FMIN: when exactly one
+ * operand is a quiet NaN, FMINNM yields the other, numeric operand,
+ * which is the NaN treatment C specifies for fmin.
+ */
+double fmin(double x, double y)
+{
+	__asm__ ("fminnm %d0, %d1, %d2" : "=w"(x) : "w"(x), "w"(y));
+	return x;
+}
--- a/src/math/aarch64/fminf.c
+++ b/src/math/aarch64/fminf.c
+#include <math.h>
+/*
+ * fminf: return the smaller of the single-precision values x and y.
+ *
+ * Uses FMINNM ("minimum number") on the 32-bit register views: when
+ * exactly one operand is a quiet NaN, the other, numeric operand is
+ * returned.
+ */
+float fminf(float x, float y)
+{
+	__asm__ ("fminnm %s0, %s1, %s2" : "=w"(x) : "w"(x), "w"(y));
+	return x;
+}
--- a/src/math/aarch64/llrint.c
+++ b/src/math/aarch64/llrint.c
+#include <math.h>
+/*
+ * llrint: round x to an integer using the current rounding mode and
+ * return it as a long long.
+ *
+ * Two instructions are needed: FRINTX first rounds x to an integral
+ * value in the current rounding mode, then FCVTZS converts that value to
+ * a signed 64-bit integer in a general register (%x modifier).  Because
+ * the input to FCVTZS is already integral, its round-toward-zero
+ * behaviour does not change the result.
+ * x is a read-write operand ("+w") because FRINTX overwrites it in place.
+ * Kept as a separate definition rather than an alias of lrint since the
+ * return types differ (see the accompanying commit message).
+ */
+long long llrint(double x)
+{
+	long long n;
+	__asm__ (
+		"frintx %d1, %d1\n"
+		"fcvtzs %x0, %d1\n" : "=r"(n), "+w"(x));
+	return n;
+}
--- a/src/math/aarch64/llrintf.c
+++ b/src/math/aarch64/llrintf.c
+#include <math.h>
+/*
+ * llrintf: round the single-precision value x to an integer using the
+ * current rounding mode and return it as a long long.
+ *
+ * FRINTX rounds x to an integral value in the current rounding mode;
+ * FCVTZS then converts that already-integral value to a signed 64-bit
+ * integer in a general register (%x modifier), so its round-toward-zero
+ * behaviour does not change the result.
+ * x is a read-write operand ("+w") because FRINTX overwrites it in place.
+ */
+long long llrintf(float x)
+{
+	long long n;
+	__asm__ (
+		"frintx %s1, %s1\n"
+		"fcvtzs %x0, %s1\n" : "=r"(n), "+w"(x));
+	return n;
+}
--- a/src/math/aarch64/llround.c
+++ b/src/math/aarch64/llround.c
+#include <math.h>
+/*
+ * llround: round x to the nearest integer, with halfway cases rounded
+ * away from zero, and return it as a long long.
+ *
+ * FCVTAS converts floating-point to a signed integer, rounding to
+ * nearest with ties away from zero, so a single instruction suffices.
+ * The result is written to a 64-bit general register (%x modifier).
+ */
+long long llround(double x)
+{
+	long long n;
+	__asm__ ("fcvtas %x0, %d1" : "=r"(n) : "w"(x));
+	return n;
+}
--- a/src/math/aarch64/llroundf.c
+++ b/src/math/aarch64/llroundf.c
+#include <math.h>
+/*
+ * llroundf: round the single-precision value x to the nearest integer,
+ * with halfway cases rounded away from zero, and return it as a
+ * long long.
+ *
+ * FCVTAS converts floating-point to a signed integer, rounding to
+ * nearest with ties away from zero; the source is the 32-bit register
+ * view (%s) and the destination a 64-bit general register (%x).
+ */
+long long llroundf(float x)
+{
+	long long n;
+	__asm__ ("fcvtas %x0, %s1" : "=r"(n) : "w"(x));
+	return n;
+}
--- a/src/math/aarch64/lrint.c
+++ b/src/math/aarch64/lrint.c
+#include <math.h>
+/*
+ * lrint: round x to an integer using the current rounding mode and
+ * return it as a long.
+ *
+ * Two instructions are needed: FRINTX first rounds x to an integral
+ * value in the current rounding mode, then FCVTZS converts that value to
+ * a signed 64-bit integer in a general register (%x modifier).  Because
+ * the input to FCVTZS is already integral, its round-toward-zero
+ * behaviour does not change the result.
+ * x is a read-write operand ("+w") because FRINTX overwrites it in place.
+ */
+long lrint(double x)
+{
+	long n;
+	__asm__ (
+		"frintx %d1, %d1\n"
+		"fcvtzs %x0, %d1\n" : "=r"(n), "+w"(x));
+	return n;
+}
--- a/src/math/aarch64/lrintf.c
+++ b/src/math/aarch64/lrintf.c
+#include <math.h>
+/*
+ * lrintf: round the single-precision value x to an integer using the
+ * current rounding mode and return it as a long.
+ *
+ * FRINTX rounds x to an integral value in the current rounding mode;
+ * FCVTZS then converts that already-integral value to a signed 64-bit
+ * integer in a general register (%x modifier), so its round-toward-zero
+ * behaviour does not change the result.
+ * x is a read-write operand ("+w") because FRINTX overwrites it in place.
+ */
+long lrintf(float x)
+{
+	long n;
+	__asm__ (
+		"frintx %s1, %s1\n"
+		"fcvtzs %x0, %s1\n" : "=r"(n), "+w"(x));
+	return n;
+}
--- a/src/math/aarch64/lround.c
+++ b/src/math/aarch64/lround.c
+#include <math.h>
+/*
+ * lround: round x to the nearest integer, with halfway cases rounded
+ * away from zero, and return it as a long.
+ *
+ * FCVTAS converts floating-point to a signed integer, rounding to
+ * nearest with ties away from zero, so a single instruction suffices.
+ * The result is written to a 64-bit general register (%x modifier).
+ */
+long lround(double x)
+{
+	long n;
+	__asm__ ("fcvtas %x0, %d1" : "=r"(n) : "w"(x));
+	return n;
+}
--- a/src/math/aarch64/lroundf.c
+++ b/src/math/aarch64/lroundf.c
+#include <math.h>
+/*
+ * lroundf: round the single-precision value x to the nearest integer,
+ * with halfway cases rounded away from zero, and return it as a long.
+ *
+ * FCVTAS converts floating-point to a signed integer, rounding to
+ * nearest with ties away from zero; the source is the 32-bit register
+ * view (%s) and the destination a 64-bit general register (%x).
+ */
+long lroundf(float x)
+{
+	long n;
+	__asm__ ("fcvtas %x0, %s1" : "=r"(n) : "w"(x));
+	return n;
+}
--- a/src/math/aarch64/nearbyint.c
+++ b/src/math/aarch64/nearbyint.c
+#include <math.h>
+/*
+ * nearbyint: round x to an integral value in floating-point format,
+ * using the current rounding mode.
+ *
+ * Implemented with FRINTI, which rounds according to the current
+ * rounding mode.  This is the variant that does not signal inexact,
+ * in contrast to FRINTX, which rint uses.
+ */
+double nearbyint(double x)
+{
+	__asm__ ("frinti %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/nearbyintf.c
+++ b/src/math/aarch64/nearbyintf.c
+#include <math.h>
+/*
+ * nearbyintf: round the single-precision value x to an integral value
+ * in floating-point format, using the current rounding mode.
+ *
+ * Implemented with FRINTI on the 32-bit register view (%s modifier);
+ * FRINTI is the round-to-integral variant that does not signal inexact.
+ */
+float nearbyintf(float x)
+{
+	__asm__ ("frinti %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/rint.c
+++ b/src/math/aarch64/rint.c
+#include <math.h>
+/*
+ * rint: round x to an integral value in floating-point format, using
+ * the current rounding mode.
+ *
+ * Implemented with FRINTX ("round to integral exact"), which rounds
+ * according to the current rounding mode and signals inexact when the
+ * result differs from the input.
+ */
+double rint(double x)
+{
+	__asm__ ("frintx %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/rintf.c
+++ b/src/math/aarch64/rintf.c
+#include <math.h>
+/*
+ * rintf: round the single-precision value x to an integral value in
+ * floating-point format, using the current rounding mode.
+ *
+ * Implemented with FRINTX ("round to integral exact") on the 32-bit
+ * register view (%s modifier); it signals inexact when the result
+ * differs from the input.
+ */
+float rintf(float x)
+{
+	__asm__ ("frintx %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/round.c
+++ b/src/math/aarch64/round.c
+#include <math.h>
+/*
+ * round: round x to the nearest integral value, with halfway cases
+ * rounded away from zero, regardless of the current rounding mode.
+ *
+ * Implemented with the single A64 instruction FRINTA (round to integral,
+ * to nearest with ties away from zero).
+ * Per the accompanying commit message, this version raises no
+ * floating-point exceptions, unlike the generic C implementation.
+ */
+double round(double x)
+{
+	__asm__ ("frinta %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/roundf.c
+++ b/src/math/aarch64/roundf.c
+#include <math.h>
+/*
+ * roundf: round the single-precision value x to the nearest integral
+ * value, with halfway cases rounded away from zero.
+ *
+ * Uses FRINTA (round to integral, to nearest with ties away from zero)
+ * on the 32-bit register view selected by the %s modifier.
+ */
+float roundf(float x)
+{
+	__asm__ ("frinta %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/sqrt.c
+++ b/src/math/aarch64/sqrt.c
+#include <math.h>
+/*
+ * sqrt: return the square root of x.
+ *
+ * Implemented with the A64 FSQRT instruction on a double-precision
+ * register ("w" constraint, %d modifier).  This C file with inline asm
+ * takes the place of the pure-assembly sqrt.s removed by the same change.
+ */
+double sqrt(double x)
+{
+	__asm__ ("fsqrt %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/sqrt.s
+++ b/src/math/aarch64/sqrt.s
-.text
-.global sqrt
-.type   sqrt,%function
-sqrt:
-	fsqrt d0, d0
-	ret
--- a/src/math/aarch64/sqrtf.c
+++ b/src/math/aarch64/sqrtf.c
+#include <math.h>
+/*
+ * sqrtf: return the square root of the single-precision value x.
+ *
+ * Implemented with the A64 FSQRT instruction on the 32-bit register view
+ * (%s modifier).  This C file with inline asm takes the place of the
+ * pure-assembly sqrtf.s removed by the same change.
+ */
+float sqrtf(float x)
+{
+	__asm__ ("fsqrt %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/sqrtf.s
+++ b/src/math/aarch64/sqrtf.s
-.text
-.global sqrtf
-.type   sqrtf,%function
-sqrtf:
-	fsqrt s0, s0
-	ret
--- a/src/math/aarch64/trunc.c
+++ b/src/math/aarch64/trunc.c
+#include <math.h>
+/*
+ * trunc: round x toward zero to the nearest integral value whose
+ * magnitude does not exceed that of x.
+ *
+ * Implemented with the single A64 instruction FRINTZ (round to integral,
+ * toward zero).
+ * Per the accompanying commit message, this version raises no
+ * floating-point exceptions, unlike the generic C implementation.
+ */
+double trunc(double x)
+{
+	__asm__ ("frintz %d0, %d1" : "=w"(x) : "w"(x));
+	return x;
+}
--- a/src/math/aarch64/truncf.c
+++ b/src/math/aarch64/truncf.c
+#include <math.h>
+/*
+ * truncf: round the single-precision value x toward zero to the nearest
+ * integral value whose magnitude does not exceed that of x.
+ *
+ * Uses FRINTZ (round to integral, toward zero) on the 32-bit register
+ * view selected by the %s modifier.
+ */
+float truncf(float x)
+{
+	__asm__ ("frintz %s0, %s1" : "=w"(x) : "w"(x));
+	return x;
+}