Commit e346ff86 authored by Rich Felker

overhaul optimized x86_64 memset asm

on most cpu models, "rep stosq" has high overhead that makes it
undesirable for small memset sizes. the new code extends the
minimal-branch fast path for short memsets from size 15 up to size
126, and shrink-wraps this code path. in addition, "rep stosq" is
sensitive to misalignment. the cost varies with size and with cpu
model, but it has been observed to run 1.5 times slower when the
destination address is not aligned mod 16. the new code thus ensures
alignment mod 16, but also preserves any existing additional
alignment, in case there are cpu models where it is beneficial.

this version is based in part on changes proposed by Denys Vlasenko.
Parent 69858fa9
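As context for the diff below, the short-size technique in the new code can be sketched in C. This is an illustrative model, not code from musl; small_memset and the st2/st4/st8 helpers are invented names. The fill byte is broadcast to all eight byte lanes of a register by multiplying with 0x101010101010101, and each size class is finished with overlapping stores issued from both ends of the buffer, so every length up to 126 is handled by a short chain of compare-and-branch steps with no loop. 126 is the largest length the final class can cover: its head stores reach through byte 62 and its tail stores reach back 63 bytes from the end.

#include <stdint.h>
#include <string.h>

/* unaligned stores of 2, 4, and 8 bytes; each constant-size
   memcpy compiles to a single mov on x86_64 */
static void st2(unsigned char *p, uint16_t v) { memcpy(p, &v, 2); }
static void st4(unsigned char *p, uint32_t v) { memcpy(p, &v, 4); }
static void st8(unsigned char *p, uint64_t v) { memcpy(p, &v, 8); }

static void *small_memset(void *dest, int c, size_t n)  /* n <= 126 */
{
	unsigned char *d = dest;
	/* broadcast the fill byte into all 8 byte lanes, as the
	   asm does with imul */
	uint64_t v = (unsigned char)c * 0x0101010101010101ULL;

	if (!n) return dest;
	d[0] = d[n-1] = (unsigned char)c;     /* covers n = 1..2 */
	if (n <= 2) return dest;
	st2(d+1, v); st2(d+n-3, v);           /* covers n = 3..6 */
	if (n <= 6) return dest;
	st4(d+3, v); st4(d+n-7, v);           /* covers n = 7..14 */
	if (n <= 14) return dest;
	st8(d+7, v); st8(d+n-15, v);          /* covers n = 15..30 */
	if (n <= 30) return dest;
	st8(d+15, v); st8(d+23, v);           /* covers n = 31..62 */
	st8(d+n-31, v); st8(d+n-23, v);
	if (n <= 62) return dest;
	st8(d+31, v); st8(d+39, v);           /* covers n = 63..126 */
	st8(d+47, v); st8(d+55, v);
	st8(d+n-63, v); st8(d+n-55, v);
	st8(d+n-47, v); st8(d+n-39, v);
	return dest;
}

Overlap between the head and tail stores is harmless for memset, which is what lets the branch count grow only logarithmically with the size cap.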
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,35 +1,60 @@
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
-	cmp $16,%rdx
-	jb 1f
-	lea -1(%rdx),%rcx
-	mov %rdi,%r8
-	shr $3,%rcx
-	mov %rax,-8(%rdi,%rdx)
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-1:	test %edx,%edx
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax
+	cmp $126,%rdx
+	ja 2f
+	test %edx,%edx
 	jz 1f
-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
 	cmp $2,%edx
 	jbe 1f
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
 	jbe 1f
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
 	jbe 1f
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
+	jbe 1f
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
+	jbe 1f
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
 1:	mov %rdi,%rax
 	ret
+2:	test $15,%edi
+	mov %rdi,%r8
+	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+1:	shr $3,%rcx
+	rep
+	stosq
+	mov %r8,%rax
+	ret
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b
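The large-size path (n > 126) can be modeled the same way. In this sketch the names big_memset and rep_stosq are invented, and rep_stosq stands in for the actual rep stosq instruction; the rest mirrors the asm. The final partial qword is stored before the count is rounded down, and a misaligned head is patched with two unconditional 8-byte stores before the pointer is advanced to the next 16-byte boundary.

#include <stdint.h>
#include <string.h>

/* stand-in for the "rep stosq" instruction: store cnt 8-byte
   words starting at d */
static void rep_stosq(unsigned char *d, uint64_t v, size_t cnt)
{
	for (; cnt; cnt--, d += 8) memcpy(d, &v, 8);
}

static void *big_memset(void *dest, int c, size_t n)  /* n > 126 */
{
	unsigned char *d = dest;
	uint64_t v = (unsigned char)c * 0x0101010101010101ULL;
	size_t cnt = n;

	/* store the final (possibly partial) qword up front, so the
	   stosq count can simply be truncated with >> 3 */
	memcpy(d+n-8, &v, 8);

	size_t mis = -(uintptr_t)d & 15;  /* bytes to next 16-byte boundary */
	if (mis) {
		/* 16 head bytes cover any misalignment of 1..15 */
		memcpy(d, &v, 8);
		memcpy(d+8, &v, 8);
		d += mis;
		cnt -= mis;
	}
	rep_stosq(d, v, cnt >> 3);        /* aligned bulk fill */
	return dest;
}

Because the pointer advances by exactly (-d) & 15 rather than being re-rounded, a destination that already happened to be aligned mod 32 or 64 keeps that alignment after the fixup, which is the "preserves any existing additional alignment" behavior the commit message describes.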