提交 926272dd 编写于 作者: R Rich Felker

optimized memset asm for i386 and x86_64

the concept of both versions is the same; they differ only in details.
for long runs, they use "rep stosl" or "rep stosq", and for small
runs, they use a trick, writing from both ends towards the middle,
that reduces the number of branches needed. in addition, if memset is
called multiple times with the same length, all branches will be
predicted; there are no loops.

for larger runs, there are likely faster approaches than "rep", at
least on some cpu models. for 32-bit, it's unlikely that there is any
faster approach that does not require non-baseline instructions; doing
anything fancier would require inspecting cpu capabilities. for
64-bit, there may very well be faster versions that work on all
models; further optimization could be explored in the future.

with these changes, memset is anywhere between 50% faster and 6 times
faster, depending on the cpu model and the length and alignment of the
destination buffer.
上级 4a1f55e9
/*
 * void *memset(void *dest, int c, size_t n)
 * Target: i386, GNU as (AT&T syntax), arguments passed on the stack.
 *
 * In:    4(%esp) = dest, 8(%esp) = c, 12(%esp) = n   (offsets at entry)
 * Out:   %eax = dest
 * Uses:  %eax, %ecx, %edx, flags; %edi is pushed and restored.
 *
 * n >= 16: store the final 4 bytes once, then fill n/4 dwords from the
 *          start with rep stosl. The tail store covers the n%4 bytes
 *          that the dword fill does not reach.
 * n <  16: paired stores working inward from both ends of the buffer,
 *          widening from bytes to dwords as n grows. The pairs may
 *          overlap, which is harmless and avoids any loop.
 *
 * rep stosl fills upward only when the direction flag is clear; this
 * code does not touch DF and relies on the caller having it clear.
 */
.global memset
.type memset,@function
memset:
	movb	8(%esp),%al		# al = fill byte (low byte of c)
	pushl	%edi			# stack offsets grow by 4 from here on
	movb	%al,%ah			# ax = c:c
	movb	%al,%dl			# keep a copy of c across the shift
	movl	16(%esp),%ecx		# ecx = n
	shll	$16,%eax		# move c:c into the upper half of eax
	movl	8(%esp),%edi		# edi = dest
	movb	%dl,%al
	movb	%dl,%ah			# eax = c replicated into all 4 bytes
	cmpl	$16,%ecx
	jb	.Lshort			# n < 16: branch-only small path

	movl	%eax,-4(%edi,%ecx)	# last 4 bytes of the buffer
	shrl	$2,%ecx			# ecx = number of whole dwords
	rep stosl			# fill dwords from the start
	movl	8(%esp),%eax		# return value = dest
	popl	%edi
	ret

.Lshort:
	testl	%ecx,%ecx
	jz	.Lret			# n == 0: nothing to store

	movb	%al,(%edi)		# first byte
	movb	%al,-1(%edi,%ecx)	# last byte
	cmpl	$2,%ecx
	jbe	.Lret			# n <= 2 fully covered

	movb	%al,1(%edi)		# second byte
	movb	%al,-2(%edi,%ecx)	# second-to-last byte
	cmpl	$4,%ecx
	jbe	.Lret			# n <= 4 fully covered

	movl	%eax,(%edi)		# first dword
	movl	%eax,-4(%edi,%ecx)	# last dword
	cmpl	$8,%ecx
	jbe	.Lret			# n <= 8 fully covered

	movl	%eax,4(%edi)		# second dword
	movl	%eax,-8(%edi,%ecx)	# second-to-last dword; covers n up to 15

.Lret:
	movl	8(%esp),%eax		# return value = dest
	popl	%edi
	ret
/*
 * void *memset(void *dest, int c, size_t n)
 * Target: x86_64, GNU as (AT&T syntax).
 *
 * In:    %rdi = dest, %esi = c, %rdx = n
 * Out:   %rax = dest
 * Uses:  %rax, %rcx, %rsi, %rdi, %r8, flags. No stack is used.
 *
 * n >= 16: store the final 8 bytes once, then fill n/8 qwords from the
 *          start with rep stosq. The tail store covers the n%8 bytes
 *          that the qword fill does not reach.
 * n <  16: paired stores working inward from both ends of the buffer,
 *          widening from bytes to dwords as n grows. The pairs may
 *          overlap, which is harmless and avoids any loop.
 *
 * rep stosq fills upward only when the direction flag is clear; this
 * code does not touch DF and relies on the caller having it clear.
 */
.global memset
.type memset,@function
memset:
	andl	$0xff,%esi		# rsi = low byte of c, upper bits zeroed
	movabsq	$0x0101010101010101,%rax	# one set bit per byte lane
	movq	%rdx,%rcx		# rcx = n
	movq	%rdi,%r8		# r8 = dest, kept for the return value
	imulq	%rsi,%rax		# rax = c replicated into all 8 bytes
	cmpq	$16,%rcx
	jb	.Lsmall			# n < 16: branch-only small path

	movq	%rax,-8(%rdi,%rcx)	# last 8 bytes of the buffer
	shrq	$3,%rcx			# rcx = number of whole qwords
	rep stosq			# fill qwords from the start
	movq	%r8,%rax		# return value = dest
	ret

.Lsmall:
	testl	%ecx,%ecx		# n < 16 here, so the low half is enough
	jz	.Ldone			# n == 0: nothing to store

	movb	%al,(%rdi)		# first byte
	movb	%al,-1(%rdi,%rcx)	# last byte
	cmpl	$2,%ecx
	jbe	.Ldone			# n <= 2 fully covered

	movb	%al,1(%rdi)		# second byte
	movb	%al,-2(%rdi,%rcx)	# second-to-last byte
	cmpl	$4,%ecx
	jbe	.Ldone			# n <= 4 fully covered

	movl	%eax,(%rdi)		# first dword
	movl	%eax,-4(%rdi,%rcx)	# last dword
	cmpl	$8,%ecx
	jbe	.Ldone			# n <= 8 fully covered

	movl	%eax,4(%rdi)		# second dword
	movl	%eax,-8(%rdi,%rcx)	# second-to-last dword; covers n up to 15

.Ldone:
	movq	%r8,%rax		# return value = dest
	ret
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册