Commit e346ff86 authored by Rich Felker

overhaul optimized x86_64 memset asm

on most cpu models, "rep stosq" has high overhead that makes it
undesirable for small memset sizes. the new code extends the
minimal-branch fast path for short memsets from size 15 up to size
126, and shrink-wraps this code path. in addition, "rep stosq" is
sensitive to misalignment. the cost varies with size and with cpu
model, but it has been observed performing 1.5 times slower when the
destination address is not aligned mod 16. the new code thus ensures
alignment mod 16, but also preserves any existing additional
alignment, in case there are cpu models where it is beneficial.

this version is based in part on changes proposed by Denys Vlasenko.
Parent 69858fa9
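The branch structure of the new small-size path is easier to see in C. The following is a rough sketch, not musl code: memset_small is a hypothetical name, and only the thresholds (2, 6, 14, 30, 62, 126) and store offsets are taken from the assembly below. Each stage stores from both ends of the buffer, so for any size within a stage's range the forward and backward stores overlap in the middle and the whole buffer gets filled without a loop.

#include <stdint.h>
#include <string.h>

/* Hypothetical C rendering of the minimal-branch path for sizes
 * 0..126; the real implementation is the assembly in this commit. */
static void *memset_small(void *dest, int c, size_t n)
{
	unsigned char *d = dest;
	/* Splat the fill byte across a 64-bit word:
	 * 0x0101010101010101 * 0xAB == 0xABABABABABABABAB. */
	uint64_t w = 0x0101010101010101ull * (unsigned char)c;

	if (!n) return dest;
	d[0] = d[n-1] = (unsigned char)c;
	if (n <= 2) return dest;
	memcpy(d+1, &w, 2); memcpy(d+n-3, &w, 2);
	if (n <= 6) return dest;
	memcpy(d+3, &w, 4); memcpy(d+n-7, &w, 4);
	if (n <= 14) return dest;
	memcpy(d+7, &w, 8); memcpy(d+n-15, &w, 8);
	if (n <= 30) return dest;
	memcpy(d+15, &w, 8); memcpy(d+23, &w, 8);
	memcpy(d+n-31, &w, 8); memcpy(d+n-23, &w, 8);
	if (n <= 62) return dest;
	memcpy(d+31, &w, 8); memcpy(d+39, &w, 8);
	memcpy(d+47, &w, 8); memcpy(d+55, &w, 8);
	memcpy(d+n-63, &w, 8); memcpy(d+n-55, &w, 8);
	memcpy(d+n-47, &w, 8); memcpy(d+n-39, &w, 8);
	return dest; /* front and back stores meet for any n <= 126 */
}

For example, n == 100 runs all six stages: the forward stores cover bytes 0..62 and the backward stores cover bytes 37..99, overlapping in the middle.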
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax

-	cmp $16,%rdx
-	jb 1f
-
-	lea -1(%rdx),%rcx
-	mov %rdi,%r8
-	shr $3,%rcx
-	mov %rax,-8(%rdi,%rdx)
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-
-1:	test %edx,%edx
+	cmp $126,%rdx
+	ja 2f
+
+	test %edx,%edx
 	jz 1f

-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
 	cmp $2,%edx
 	jbe 1f

-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
 	jbe 1f

-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
 	jbe 1f

-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)

 1:	mov %rdi,%rax
 	ret
+
+2:	test $15,%edi
+	mov %rdi,%r8
+	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+
+1:	shr $3,%rcx
+	rep
+	stosq
+	mov %r8,%rax
+	ret
+
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b
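The tail added at the 2: labels implements the alignment handling described in the commit message: the destination is rounded up to a 16-byte boundary with (-dest) & 15, a form that also preserves any stronger alignment the pointer already has, and the unaligned head is covered by two overlapping 8-byte stores before rep stosq runs on the aligned remainder. Below is a rough C equivalent, with a hypothetical memset_large and a plain store loop standing in for rep stosq:

#include <stdint.h>
#include <string.h>

/* Sketch of the large-size (n > 126) path, assuming w already
 * holds the fill byte splatted across all 64 bits. */
static void *memset_large(void *dest, uint64_t w, size_t n)
{
	unsigned char *d = dest;

	/* Write the last 8 bytes up front, so the word count below
	 * can simply round down to whole 8-byte units. */
	memcpy(d + n - 8, &w, 8);

	if ((uintptr_t)d & 15) {
		/* Distance to the next 16-byte boundary; the
		 * negate-and-mask form keeps extra alignment. */
		size_t adj = -(uintptr_t)d & 15;
		/* Two overlapping 8-byte stores cover the head. */
		memcpy(d, &w, 8);
		memcpy(d + 8, &w, 8);
		d += adj;
		n -= adj;
	}
	/* Stand-in for "rep stosq": n/8 aligned word stores. */
	for (size_t i = 0; i < n/8; i++)
		memcpy(d + i*8, &w, 8);
	return dest;
}

In the assembly, the original destination is saved in %r8 so it can be returned in %rax after rep stosq has advanced %rdi.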