/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework.
 *
 * Same contract as memcpy: rdi = dest, rsi = src, rdx = count,
 * returns the original destination in rax.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value: original destination */

	movl %edx, %ecx
	shrl $3, %ecx		/* ecx = number of whole 8-byte words */
	andl $7, %edx		/* edx = leftover tail bytes (0..7) */
	rep movsq		/* bulk copy, one qword per iteration */
	movl %edx, %ecx
	rep movsb		/* copy the remaining tail bytes */
	ret
.Lmemcpy_e:
	.previous
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax		/* return value: original destination */
	movl %edx, %ecx
	shrl   $6, %ecx		/* ecx = count / 64 */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions inbetween do
	 * not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi	/* lea: advances pointers without */
	leaq 64(%rdi), %rdi	/* touching the zero flag from decl */

	jnz  .Lloop_64

.Lhandle_tail:
	/*
	 * Handle the remaining (count % 64) bytes,
	 * first in 8-byte words:
	 */
	movl %edx, %ecx
	andl  $63, %ecx		/* ecx = tail bytes (0..63) */
	shrl   $3, %ecx		/* ecx = tail qwords (0..7) */
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz  .Lloop_8

.Lhandle_7:
	/*
	 * Then the final (count % 8) bytes, one at a time:
	 */
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
124

I
Ingo Molnar 已提交
125 126 127 128
	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */
129

I
Ingo Molnar 已提交
130
	.section .altinstructions, "a"
131
	.align 8
132
	.quad memcpy
133
	.quad .Lmemcpy_c
134
	.byte X86_FEATURE_REP_GOOD
I
Ingo Molnar 已提交
135 136 137 138 139 140

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 */
141 142
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
143
	.previous