memset_64.S 2.2 KB
Newer Older
L
Linus Torvalds 已提交
1
/* Copyright 2002 Andi Kleen, SuSE Labs */
2 3 4 5

#include <linux/linkage.h>
#include <asm/dwarf2.h>

L
Linus Torvalds 已提交
6 7 8 9 10 11 12 13 14
/*
 * ISO C memset - set a memory block to a byte value.
 *	
 * rdi   destination
 * rsi   value (char) 
 * rdx   count (bytes) 
 * 
 * rax   original destination
 */	
15 16
	.section .altinstr_replacement, "ax", @progbits
/*
 * String-instruction variant of memset.
 *
 * In:   rdi = destination, sil = fill byte, rdx = count (bytes)
 * Out:  rax = original destination
 * Uses: rcx, rdx, rsi, rdi, r8, r9, flags
 *
 * The count is split into whole qwords (stored with rep stosq) and
 * the remaining 0..7 bytes (stored with rep stosb).  The qword count
 * is computed in the full 64-bit rcx so that counts of 4GB and above
 * are not truncated.
 */
.Lmemset_c:
	movq %rdi,%r9		/* keep dst; stos advances rdi */
	movl %edx,%r8d
	andl $7,%r8d		/* r8d = count % 8, saved before mulq clobbers rdx */
	movq %rdx,%rcx
	shrq $3,%rcx		/* rcx = count / 8, using all 64 bits */
	/* expand byte value  */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq
	movl %r8d,%ecx		/* trailing bytes */
	rep stosb
	movq %r9,%rax		/* return original dst */
	ret
.Lmemset_e:
	.previous
33 34 35 36

/*
 * memset / __memset - unrolled store version.
 *
 * In:   rdi = destination, sil = fill byte, rdx = count (bytes)
 * Out:  rax = original destination
 *
 * Register roles inside the function:
 *   rax  fill byte replicated into all 8 bytes
 *   r10  original destination (returned in rax)
 *   r11  remaining byte count (rdx is clobbered by mul)
 *   r9d  destination & 7 (misalignment), used by .Lbad_alignment
 *   rcx  loop counter
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value  */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul    %rcx		/* with rax, clobbers rdx */

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz  .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	/*
	 * Number of 64-byte blocks. Computed in 64 bits so that counts
	 * of 4GB and above are not truncated.
	 */
	movq  %r11,%rcx
	shrq  $6,%rcx
	jz	 .Lhandle_tail

	.p2align 4
.Lloop_64:
	decq   %rcx		/* sets ZF for the jnz below; mov/lea keep flags */
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	/* remaining whole qwords: (count & 63) / 8, low bits suffice */
	movl	%r11d,%ecx
	andl    $63&(~7),%ecx
	jz 		.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	/* remaining single bytes: count & 7 */
	movl	%r11d,%ecx
	andl	$7,%ecx
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %ecx
	movb 	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	/* fewer than 8 bytes: just store them one at a time */
	cmpq $7,%r11
	jbe	.Lhandle_7
	/*
	 * Store one unaligned qword at dst, then advance dst by
	 * 8 - (dst & 7) so it is 8-byte aligned, and reduce the
	 * count by the same amount.
	 */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
114 115 116 117 118 119 120 121

	/* Some CPUs run faster using the string instructions, and that
	   variant is also a lot simpler. Use it when possible. */

#include <asm/cpufeature.h>

	/*
	 * One alternatives record pairing memset with .Lmemset_c.
	 * NOTE(review): field order/sizes presumably mirror struct
	 * alt_instr - confirm against asm/alternative.h.
	 */
	.section .altinstructions,"a"
	.balign 8
	.quad memset				/* code to be replaced */
	.quad .Lmemset_c			/* replacement code */
	.word X86_FEATURE_REP_GOOD		/* feature bit selecting the replacement */
	.byte .Lfinal - memset			/* bytes from memset to .Lfinal */
	.byte .Lmemset_e - .Lmemset_c		/* bytes in the replacement */
	.previous