提交 baa43bca 编写于 作者: R Rich Felker

optimize scalbn family

the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.

for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.

on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
上级 7513d3ec
...@@ -11,10 +11,23 @@ scalbln: ...@@ -11,10 +11,23 @@ scalbln:
# scalbn: scale the double at 4(%esp) by 2 raised to the 32-bit integer at
# 12(%esp), by building the power of two as an 80-bit extended value with
# integer stores and multiplying by it.
# NOTE(review): this listing looks like a side-by-side diff rendering; where
# a line shows two instructions, the left one appears to be the old
# (fscale-based) code and the right one the new code. The comments below
# describe the right-hand (new) instructions. Confirm against the real file.
.global scalbn .global scalbn
.type scalbn,@function .type scalbn,@function
scalbn: scalbn:
# eax = n (the integer argument).
fildl 12(%esp) mov 12(%esp),%eax
# Range check: after adding 0x3ffe, an unsigned compare against 0x7ffd
# succeeds (jb) only when the sum lies in 0..0x7ffc; such n skip the
# saturation below.
add $0x3ffe,%eax
cmp $0x7ffd,%eax
jb 1f
# Out of range: undo the add, reduce n to 0 (non-negative) or -1 (negative)
# with the arithmetic shift, turn that into 0xfff or ~0xfff, and re-add
# 0x3ffe. The result is one fixed large or one fixed small exponent value.
sub $0x3ffe,%eax
sar $31,%eax
xor $0xfff,%eax
add $0x3ffe,%eax
# Both paths: one more increment, so the in-range path holds n + 0x3fff.
1: inc %eax
# Push x onto the x87 stack before its stack slot is overwritten below.
fldl 4(%esp) fldl 4(%esp)
# Build a 10-byte extended value in place at 4(%esp): bytes 4..7 = 0,
# bytes 8..11 = 0x80000000 (only the explicit integer bit set), and the
# sign/exponent word taken from the low 16 bits of eax stored at 12(%esp).
fscale mov %eax,12(%esp)
fstp %st(1) mov $0x80000000,%eax
mov %eax,8(%esp)
xor %eax,%eax
mov %eax,4(%esp)
# Load the constructed scale factor and multiply it into x, popping once.
fldt 4(%esp)
fmulp
# Store as a 64-bit double and reload, so the product is rounded to double
# precision; the result is left in st(0) (presumably the return register
# under the i386 calling convention).
fstpl 4(%esp) fstpl 4(%esp)
fldl 4(%esp) fldl 4(%esp)
ret ret
...@@ -11,10 +11,22 @@ scalblnf: ...@@ -11,10 +11,22 @@ scalblnf:
# scalbnf: scale the float at 4(%esp) by 2 raised to the 32-bit integer at
# 8(%esp), by building the power of two as a 64-bit double with integer
# stores and multiplying by it.
# NOTE(review): this listing looks like a side-by-side diff rendering; where
# a line shows two instructions, the left one appears to be the old
# (fscale-based) code and the right one the new code. The comments below
# describe the right-hand (new) instructions. Confirm against the real file.
.global scalbnf .global scalbnf
.type scalbnf,@function .type scalbnf,@function
scalbnf: scalbnf:
# eax = n (the integer argument).
fildl 8(%esp) mov 8(%esp),%eax
# Range check: after adding 0x3fe, an unsigned compare against 0x7fd
# succeeds (jb) only when the sum lies in 0..0x7fc; such n skip the
# saturation below.
add $0x3fe,%eax
cmp $0x7fd,%eax
jb 1f
# Out of range: undo the add, reduce n to 0 (non-negative) or -1 (negative)
# with the arithmetic shift, turn that into 0x1ff or ~0x1ff, and re-add
# 0x3fe. The result is one fixed large or one fixed small exponent value.
sub $0x3fe,%eax
sar $31,%eax
xor $0x1ff,%eax
add $0x3fe,%eax
# Both paths: one more increment, so the in-range path holds n + 0x3ff.
1: inc %eax
# Move the exponent up to bit 20, where the exponent field of a double's
# high dword begins; the low 20 bits (top of the mantissa) become zero.
shl $20,%eax
# Push x onto the x87 stack before its stack slot is overwritten below.
flds 4(%esp) flds 4(%esp)
# Build an 8-byte double in place at 4(%esp): high dword = eax at 8(%esp),
# low dword = 0 at 4(%esp).
fscale mov %eax,8(%esp)
fstp %st(1) xor %eax,%eax
mov %eax,4(%esp)
# Load the constructed scale factor and multiply it into x, popping once.
fldl 4(%esp)
fmulp
# Store as a 32-bit float and reload, so the product is rounded to single
# precision; the result is left in st(0) (presumably the return register
# under the i386 calling convention).
fstps 4(%esp) fstps 4(%esp)
flds 4(%esp) flds 4(%esp)
ret ret
...@@ -11,7 +11,21 @@ scalblnl: ...@@ -11,7 +11,21 @@ scalblnl:
.global scalbnl .global scalbnl
.type scalbnl,@function .type scalbnl,@function
scalbnl: scalbnl:
fildl 16(%esp) mov 16(%esp),%eax
add $0x3ffe,%eax
cmp $0x7ffd,%eax
jae 1f
inc %eax
fldt 4(%esp)
mov %eax,12(%esp)
mov $0x80000000,%eax
mov %eax,8(%esp)
xor %eax,%eax
mov %eax,4(%esp)
fldt 4(%esp)
fmulp
ret
1: fildl 16(%esp)
fldt 4(%esp) fldt 4(%esp)
fscale fscale
fstp %st(1) fstp %st(1)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册