optimize exponential asm for i386

Up to 30% faster exp2 by avoiding the slow frndint and fscale instructions. expm1 also takes a much more direct path for small arguments (the expected use case).

optimize exponential asm for i386
Up to 30% faster exp2 by avoiding the slow frndint and fscale instructions. expm1 also takes a much more direct path for small arguments (the expected use case).
02db27d9 · Rich Felker · da7458a6 · 02db27d9 · 02db27d9
隐藏空白更改
内联并排

Showing 2 changed files with 77 additions and 58 deletions

src/math/i386/exp.s src/math/i386/exp.s +76 -11

src/math/i386/expm1.s src/math/i386/expm1.s +1 -47

未找到文件。
--- a/src/math/i386/exp.s
+++ b/src/math/i386/exp.s
+.global expm1f	# single-precision entry: loads its argument, then shares the code at the next local label 1
+.type expm1f,@function
+expm1f:
+	flds 4(%esp)	# push the 32-bit float found just above the return address onto the x87 stack
+	jmp 1f	# continue at the next local label 1 (the shared body under expm1 in the visible code)
+.global expm1l	# extended-precision entry: loads its argument, then shares the code at the next local label 1
+.type expm1l,@function
+expm1l:
+	fldt 4(%esp)	# push the 80-bit extended value found just above the return address onto the x87 stack
+	jmp 1f	# continue at the next local label 1 (the shared body under expm1 in the visible code)
+.global expm1	# double entry plus the shared body: computes 2^(x*log2(e)) - 1, result left in st(0)
+.type expm1,@function
+expm1:
+	fldl 4(%esp)	# push the 64-bit double argument x
+1:	fldl2e	# shared body, x in st(0): push the constant log2(e)
+	fmulp	# y = x * log2(e), so e^x == 2^y
+	fld1	# push 1.0, the comparison bound
+	fld %st(1)	# push a copy of y
+	fabs	# |y|
+	fucom %st(1)	# compare |y| with 1.0 (unordered compare, so a NaN does not raise invalid here)
+	fnstsw %ax	# capture the comparison result in ax before the pops below
+	fstp %st(0)	# discard |y|
+	fstp %st(0)	# discard 1.0; st(0) is y again
+	sahf	# move the x87 condition bits from ah into EFLAGS
+	ja 1f	# |y| > 1: outside the range handled by f2xm1, use the general path (not taken when unordered)
+	f2xm1	# small argument: st(0) = 2^y - 1 in one instruction
+	ret
+1:	call 1f	# call the next local label 1 further down the file with y in st(0); in this view that is the body under exp2
+	fld1	# push 1.0
+	fsubrp	# subtract the 1.0 from the returned value (relies on GAS AT&T operand order for fsubrp — verify if the assembler changes)
+	ret
 .global exp2f
 .type exp2f,@function
 exp2f:
@@ -34,22 +68,53 @@ exp:
 .type exp2,@function
 exp2:
 	fldl 4(%esp)	# push the 64-bit double argument x
-1:	fxam
+1:	mov $0x47000000,%eax	# body entered with x in st(0); eax = bit pattern of the float 32768.0 (2^15)
-	fnstsw %ax
+	push %eax	# becomes 8(%esp) after the next two pushes: scratch now, sign/exponent word later
+	flds (%esp)	# st(0) = 32768.0, the magnitude limit for the fast path
+	shl $7,%eax	# eax = 0x80000000
+	push %eax	# high mantissa dword: only the explicit integer bit set
+	add %eax,%eax	# eax = 0
+	push %eax	# low mantissa dword; (%esp) is now a 10-byte extended-precision template with mantissa 1.0
+	fld %st(1)	# push a copy of x
+	fabs	# |x|
+	fucom %st(1)	# compare |x| with 32768.0
+	fnstsw	# status word -> ax
 	sahf	# x87 condition bits -> EFLAGS
-	jnp 1f
+	ja 2f	# |x| > 32768: take the overflow/underflow/NaN path
-	jnc 1f
+	fstp %st(0)	# discard |x|
-	fstps 4(%esp)
+	fstp %st(0)	# discard 32768.0; st(0) is x again
-	mov $0xfe,%al
+	fld %st(0)	# duplicate x
-	and %al,7(%esp)
+	fistpl 8(%esp)	# n = x rounded to a 32-bit integer (current rounding mode), stored in the scratch slot
-	flds 4(%esp)
+	fildl 8(%esp)	# reload n as a floating-point value
-1:	fld %st(0)
-	frndint
 	fxch %st(1)	# st(0) = x, st(1) = n
 	fsub %st(1)	# st(0) = x - n, the fractional part handed to f2xm1
+	mov $0x3fff,%eax	# exponent bias of the 80-bit extended format
+	add %eax,8(%esp)	# 8(%esp) = n + 0x3fff. NOTE(review): |x| up to 32768 reaches here, but this sum only fits the 15-bit exponent field for about -16383 < n < 16384; outside that the low 16 bits reach the sign bit or wrap — confirm the cutoff
 	f2xm1	# 2^(x-n) - 1
 	fld1
 	faddp	# 2^(x-n)
-	fscale
+	fldt (%esp)	# load the template: mantissa 1.0 with sign/exponent = low 16 bits of n + 0x3fff, i.e. 2^n when in range
+	fmulp	# 2^(x-n) * 2^n
 	fstp %st(1)	# drop n, leaving the result in st(0)
+	add $12,%esp	# release the three pushed dwords
+	ret
+2:	fstp %st(0)	# large-magnitude or NaN path: discard |x|
+	fstp %st(0)	# discard 32768.0; st(0) is x again
+	fsts 8(%esp)	# store x as a 32-bit float in the scratch slot without popping it
+	mov 8(%esp),%eax	# eax = float bit pattern of x
+	lea (%eax,%eax),%ecx	# ecx = bits shifted left by one, which drops the sign bit
+	cmp $0xff000000,%ecx	# 0xff000000 is the infinity pattern shifted the same way
+	ja 2f	# above it means NaN: return x itself, still in st(0)
+	fstp %st(0)	# not NaN: discard x
+	xor %ecx,%ecx
+	inc %ecx	# ecx = 1, the smallest normal exponent field
+	add %eax,%eax	# carry flag = sign bit of x
+	jc 1f	# negative x: keep the tiny exponent
+	mov $0x7ffe,%ecx	# positive x: largest finite exponent field
+1:	mov %ecx,8(%esp)	# patch the sign/exponent word of the template
+	fldt (%esp)	# load the tiny or huge power of two
+	fld %st(0)
+	fmulp	# square it so the FPU itself produces the underflowed or overflowed result
+2:	add $12,%esp	# release the three pushed dwords
 	ret
--- a/src/math/i386/expm1.s
+++ b/src/math/i386/expm1.s
-.global expm1f	# removed implementation of expm1f/expm1l/expm1, built on frndint and fscale
+# see exp.s
-.type expm1f,@function
-expm1f:
-	flds 4(%esp)	# push the 32-bit float argument
-	jmp 1f
-.global expm1l
-.type expm1l,@function
-expm1l:
-	fldt 4(%esp)	# push the 80-bit extended argument
-	jmp 1f
-.global expm1
-.type expm1,@function
-expm1:
-	fldl 4(%esp)	# push the 64-bit double argument
-1:	fxam	# classify the value in st(0)
-	fnstsw %ax
-	sahf	# fxam condition bits -> EFLAGS
-	jnp 1f
-	jnc 1f	# fall through only when both tested flags are set (presumably fxam's encoding for infinity — TODO confirm)
-	fstps 4(%esp)	# spill the value as a 32-bit float over the argument slot
-	mov $0xfe,%al
-	and %al,7(%esp)	# clear the lowest bit of the float's top byte before reloading it
-	flds 4(%esp)
-1:	fldl2e
-	fmulp	# y = x * log2(e)
-	fld %st(0)
-	frndint	# n = y rounded to an integer
-	fldz
-	fcomp	# compare 0 with n and pop the 0
-	fnstsw %ax
-	sahf
-	jnz 1f	# n != 0: general path below
-	fstp %st(0)	# n == 0: drop n
-	f2xm1	# st(0) = 2^y - 1 directly
-	ret
-1:	fxch %st(1)	# st(0) = y, st(1) = n
-	fsub %st(1)	# y - n
-	f2xm1
-	fld1
-	faddp	# 2^(y-n)
-	fscale	# scale by 2^n using n in st(1)
-	fld1
-	fsubrp	# subtract 1.0 from the scaled value (relies on GAS AT&T operand order for fsubrp)
-	fstp %st(1)	# drop n, leaving the result in st(0)
-	ret