math: new software sqrtf

same method as in sqrt, this was tested on all inputs against an sqrtf instruction. (the only difference found was that x86 sqrtf does not signal the x86 specific input-denormal exception on negative subnormal inputs while the software sqrtf does, this is fine as it was designed for ieee754 exceptions only.) there is known faster method: "Computing Floating-Point Square Roots via Bivariate Polynomial Evaluation" that computes sqrtf directly via pipelined polynomial evaluation which allows more parallelism, but the design does not generalize easily to higher precisions.

math: new software sqrtf
same method as in sqrt, this was tested on all inputs against an sqrtf instruction. (the only difference found was that x86 sqrtf does not signal the x86 specific input-denormal exception on negative subnormal inputs while the software sqrtf does, this is fine as it was designed for ieee754 exceptions only.) there is known faster method: "Computing Floating-Point Square Roots via Bivariate Polynomial Evaluation" that computes sqrtf directly via pipelined polynomial evaluation which allows more parallelism, but the design does not generalize easily to higher precisions.
b1756ec8 · Szabolcs Nagy · Rich Felker · 97e9b73d · b1756ec8
隐藏空白更改
内联并排

Showing with 70 addition and 70 deletion

src/math/sqrtf.c src/math/sqrtf.c +70 -70

未找到文件。
--- a/src/math/sqrtf.c
+++ b/src/math/sqrtf.c
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"

-static const float tiny = 1.0e-30;
+#define FENV_SUPPORT 1

-float sqrtf(float x)
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	float z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix,s,q,m,t,i;
-	uint32_t r;
+	return (uint64_t)a*b >> 32;
+}

-	GET_FLOAT_WORD(ix, x);
+/* see sqrt.c for more detailed comments.  */

-	/* take care of Inf and NaN */
-	if ((ix&0x7f800000) == 0x7f800000)
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;

-	/* take care of zero */
-	if (ix <= 0) {
-		if ((ix&~sign) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix>>23;
-	if (m == 0) {  /* subnormal x */
-		for (i = 0; (ix&0x00800000) == 0; i++)
-			ix<<=1;
-		m -= i - 1;
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
 	}
-	m -= 127;  /* unbias exponent */
-	ix = (ix&0x007fffff)|0x00800000;
-	if (m&1)  /* odd m, double x to make it even */
-		ix += ix;
-	m >>= 1;  /* m = [m/2] */

-	/* generate sqrt(x) bit by bit */
-	ix += ix;
-	q = s = 0;       /* q = sqrt(x) */
-	r = 0x01000000;  /* r = moving bit from right to left */
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;

-	while (r != 0) {
-		t = s + r;
-		if (t <= ix) {
-			s = t+r;
-			ix -= t;
-			q += r;
-		}
-		ix += ix;
-		r >>= 1;
-	}
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */

-	/* use floating add to find out rounding direction */
-	if (ix != 0) {
-		z = 1.0f - tiny; /* raise inexact flag */
-		if (z >= 1.0f) {
-			z = 1.0f + tiny;
-			if (z > 1.0f)
-				q += 2;
-			else
-				q += q & 1;
-		}
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
 	}
-	ix = (q>>1) + 0x3f000000;
-	SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23));
-	return z;
+	return y;
 }