# Patch for src/string/x86_64/memset.s (GNU as, AT&T syntax, x86-64).
# Everything in this '#' header is commentary; the patch proper starts at
# the "diff --git" line and its content is unchanged.
#
# Registers as used by the code on both sides of the diff:
#   %rdi = destination address, %sil = fill byte, %rdx = byte count.
#   Every return path puts the original %rdi into %rax before ret.
#
# Removed implementation
#   - %rax = zero-extended fill byte * 0x0101010101010101, i.e. the byte
#     replicated into all eight byte lanes.
#   - count >= 16: store 8 bytes ending at the last byte of the buffer, then
#     rep stosq with (count-1)>>3 quadwords from the start.
#   - count < 16: overlapping byte/dword stores from both ends, returning
#     as soon as count is 0, <= 2, <= 4 or <= 8.
#
# Added implementation
#   - Same 8-byte pattern in %rax, with the multiplier loaded into %r8 so
#     %rsi is left untouched (the byte stores below read %sil directly
#     instead of the multiply result).
#   - count <= 126 (unsigned compare, "ja" skips this path): straight-line
#     stores only.  count == 0 returns immediately.  Each step writes the
#     same width at the front and at the back of the buffer: 1, 2, 4, 8,
#     16, then 32 bytes per side.  After a step, k bytes are written from
#     each end with k = 1, 3, 7, 15, 31, 63, and the code returns once
#     count <= 2k (2, 6, 14, 30, 62; the last step covers up to 126).
#     The displacements are spelled as running sums, e.g. (1+2+4) and
#     (-1-2-4-8), to show where each step starts.  Front and back stores
#     may overlap.  The 32-bit %edx is enough here because count <= 126.
#   - count > 126: keep the original %rdi in %r8, store 8 bytes ending at
#     the last byte, and set %rcx = count.  If the low four address bits
#     are zero, run rep stosq with count>>3 quadwords.  Otherwise write
#     16 bytes at the start, compute adj = (-address) & 15, advance %rdi
#     and shrink %rcx by adj, then run the same rep stosq.  Bytes that
#     rep stosq does not reach at the end were already written by the
#     8-byte tail store.
#   - NOTE(review): neither side touches the direction flag; rep stosq
#     only moves upward with DF clear - presumably guaranteed by the
#     calling convention, confirm.
#
# Numeric labels in the added code: "jz 1f"/"jbe 1f" reach the first "1:"
# (return), "ja 2f" reaches the first "2:" (large path), "jnz 2f" reaches
# the second "2:" (alignment fix-up) and "jmp 1b" returns to the second
# "1:" (rep stosq).
diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
index 3cc8fcf6b718295f4c977da2d7957d7d53d9c421..2d3f5e52b8afe8d612c0fd0b04f4e8a01f078489 100644
--- a/src/string/x86_64/memset.s
+++ b/src/string/x86_64/memset.s
@@ -1,43 +1,72 @@
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax
 
-	cmp $16,%rdx
-	jb 1f
+	cmp $126,%rdx
+	ja 2f
 
-	lea -1(%rdx),%rcx
-	mov %rdi,%r8
-	shr $3,%rcx
-	mov %rax,-8(%rdi,%rdx)
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-
-1:	test %edx,%edx
+	test %edx,%edx
 	jz 1f
 
-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
 	cmp $2,%edx
 	jbe 1f
 
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
+	jbe 1f
+
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
 	jbe 1f
 
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
 	jbe 1f
 
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
 
 1:	mov %rdi,%rax
 	ret
+
+2:	test $15,%edi
+	mov %rdi,%r8
+	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+
+1:	shr $3,%rcx
+	rep
+	stosq
+	mov %r8,%rax
+	ret
+
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b