/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those that set REP_GOOD). On CPUs that also
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are
 * changed into a jmp to memcpy_erms, which does the REP; MOVSB copy.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
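	/*
	 * %rax now holds the original destination for the return value.
	 * Copy count/8 quadwords with REP MOVSQ, then the count%8 tail
	 * bytes with REP MOVSB.
	 */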
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
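	/*
	 * The final loop pass took %rdx below zero; add the 0x20 back so
	 * %edx holds the remaining 0..31 tail bytes.
	 */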
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
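	 * The leading and trailing 16-byte loads may overlap when the
	 * count is below 32; all loads happen before any store, so the
	 * copy is still correct.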
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
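	/*
	 * %edx now holds count-1, and the flags from the subl above are
	 * still live at the jz below: ZF set means exactly one byte.
	 */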
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML

MCSAFE_TEST_CTL

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
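 *
 * Returns zero on success, or the number of bytes not copied if a
 * machine check (on reads) or a fault (on writes) was hit.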
 */
ENTRY(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
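	/* %ecx = 8 - (source address & 7): the number of leading bytes */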
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
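	/* Split the remaining count: %ecx = whole words, %edx = trailing bytes */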
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
.L_done:
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
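	 *
	 * The labels below fall through into one another: .E_read_words
	 * converts the remaining word count to bytes, .E_leading_bytes adds
	 * the byte count still in %edx, and .E_trailing_bytes places the
	 * running total in %eax as the return value.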
	 */
.E_read_words:
	shll	$3, %ecx
.E_leading_bytes:
	addl	%edx, %ecx
.E_trailing_bytes:
	mov	%ecx, %eax
	jmp	.L_done

	/*
	 * For write fault handling: since the destination may be unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
.E_write_words:
	shll	$3, %ecx
	addl	%edx, %ecx
	movl	%ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif