/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the copy with a
 * single REP MOVSB.
 */

.weak memcpy
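/*
 * Weak so that an alternative definition (for instance an instrumented
 * C memcpy) can override this one at link time.
 */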

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
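	/*
	 * With ERMS, a single REP MOVSB moves all RDX bytes; the CPU's
	 * fast-string implementation takes care of alignment and chunking
	 * internally.
	 */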
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
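	/*
	 * Only the low byte of each pointer is compared below; this is a
	 * cheap heuristic for picking the copy direction.
	 */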
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
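	/*
	 * The SUBQ at the top of the loop sets CF once the count goes
	 * negative, so the loop exits with RDX holding (tail - 0x20);
	 * ADDL below restores the 0..31 byte tail count.
	 */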
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
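	/*
	 * For counts of 16..31 the two 8-byte pairs may overlap in the
	 * middle; that is harmless because all four loads happen before
	 * any store.
	 */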
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
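	/*
	 * MOVZBL does not modify the flags, so the ZF set by the SUBL
	 * above is still valid here: ZF means the count was exactly 1.
	 */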
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
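	/*
	 * Returns 0 on success, -EFAULT if a machine check is taken while
	 * reading the source.  A minimal usage sketch from C, assuming the
	 * usual prototype int memcpy_mcsafe(void *dst, const void *src,
	 * size_t cnt): a caller that treats poisoned source data as an I/O
	 * error would do
	 *
	 *	if (memcpy_mcsafe(dst, src, len))
	 *		return -EIO;
	 */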
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
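	/* %ecx = 8 - (src & 7): leading bytes needed to reach alignment */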
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64-bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
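	/*
	 * Only the loads carry .L_cache_w* labels: they are the
	 * instructions covered by the exception table below, since writes
	 * to the target do not generate recoverable machine checks here.
	 */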
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	mov %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)
EXPORT_SYMBOL_GPL(memcpy_mcsafe)

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov	$-EFAULT, %rax
	ret

	.previous
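
	/*
	 * One entry per instruction that may fault on a poisoned read; all
	 * of them are routed to the common -EFAULT exit above.
	 */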

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif