memcpy_64.S 5.9 KB
Newer Older
1
/* SPDX-License-Identifier: GPL-2.0-only */
L
Linus Torvalds 已提交
2
/* Copyright 2002 Andi Kleen */
3

4
#include <linux/linkage.h>
5
#include <asm/errno.h>
6
#include <asm/cpufeatures.h>
7
#include <asm/mcsafe_test.h>
8
#include <asm/alternative-asm.h>
9
#include <asm/export.h>
10

11 12 13 14 15 16 17 18 19
/*
 * We build a jump to memcpy_orig by default which gets NOPped out on
 * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
 * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
 * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
 */

.weak memcpy

L
Linus Torvalds 已提交
20 21 22
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Registers written by the inline path below: rax, rcx, rdx, rsi, rdi
 * and the flags.
 *
 * The first instruction slot is an alternative: as written it is a jump
 * to memcpy_orig; the replacement for X86_FEATURE_REP_GOOD is empty, so
 * execution falls through to the REP MOVSQ copy below; the replacement
 * for X86_FEATURE_ERMS is a jump to memcpy_erms.
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	/* Return value: the destination pointer as passed in. */
	movq %rdi, %rax
	/* rcx = count / 8: number of quadwords for REP MOVSQ. */
	movq %rdx, %rcx
	shrq $3, %rcx
	/* edx = count % 8: bytes left over after the quadword copy. */
	andl $7, %edx
	rep movsq
	/* Copy the remaining 0..7 bytes one at a time. */
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)
48

49
/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The whole copy is a single REP MOVSB with rcx = count.
 */
ENTRY(memcpy_erms)
	/* Return value: the destination pointer as passed in. */
	movq %rdi, %rax
	/* rcx = byte count for REP MOVSB. */
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)
59

60
/*
 * memcpy_orig - memory copy built from plain loads and stores (no REP
 * string instructions).
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * Registers written: rax, rcx, rdx, rsi, rdi, r8-r11 and the flags.
 *
 * Counts of 32 bytes or more are copied in 32-byte blocks, either from
 * the head forward or from the tail backward; whatever is left (always
 * fewer than 32 bytes) is finished by the size-bucketed code starting
 * at .Lhandle_tail.
 */
ENTRY(memcpy_orig)
	movq %rdi, %rax

	/* Fewer than 32 bytes: no block loop, go straight to the tail code. */
	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * Only the low byte of each pointer is compared (signed): if the
	 * low byte of the source is less than that of the destination the
	 * backward loop is used, otherwise the forward loop.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	/*
	 * Bias the count by -32 so that the subtraction at the top of the
	 * loop borrows (CF=1) exactly when the block being copied in that
	 * iteration is the last full one.
	 */
	subq $0x20, %rdx
.Lcopy_forward_loop:
	/*
	 * The flags set here are consumed by the jae at the bottom of the
	 * loop; the movq/leaq instructions in between leave them untouched.
	 */
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	/*
	 * Undo the bias: edx = bytes still to copy (0..31). The 32-bit add
	 * also clears the upper half of rdx.
	 */
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	/* Same -32 bias as on the forward path. */
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16 bytes chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	/* Flags from this subtraction feed the jae at the loop bottom. */
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 *
	 * edx = bytes still to copy (0..31); stepping both pointers back
	 * by that amount puts them at the start of the uncopied head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	/*
	 * Here rdx < 32. Each bucket below loads everything it needs
	 * before storing anything, addressing the last chunk relative to
	 * the end (base + rdx - size), so the head and tail accesses may
	 * overlap in the middle.
	 */
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	/*
	 * Reached with 0..3 bytes left. edx becomes count - 1: the
	 * subtraction borrows when the count was 0 (nothing to do) and
	 * sets ZF when the count was exactly 1.
	 */
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	/* movzbl does not modify flags, so jz still tests the subl result. */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	/* 2 or 3 bytes: copy byte 1 and the last byte (offset count - 1). */
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)
T
Tony Luck 已提交
186 187

#ifndef CONFIG_UML
188 189 190

MCSAFE_TEST_CTL

T
Tony Luck 已提交
191
/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  edx count (only the low 32 bits of rdx are used by this routine)
 *
 * Output:
 *  eax 0 when the copy ran to completion. The fault paths are reached
 *      through the exception-table entries that reference the
 *      .L_read_* and .L_write_* labels below, and produce the return
 *      value themselves.
 *
 * Register roles inside the routine:
 *  ecx loop counter of the current phase (leading bytes, whole
 *      quadwords, or trailing bytes)
 *  edx bytes that remain after the current phase
 *  al / r8 data in flight between the load and the store
 *
 * NOTE(review): MCSAFE_TEST_SRC / MCSAFE_TEST_DST come from
 * <asm/mcsafe_test.h>; presumably they are fault-injection hooks that
 * can branch to the label given as their last argument - confirm
 * against that header.
 */
ENTRY(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	/* ecx = 8 - (source & 7): number of leading bytes to copy. */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	/* edx = bytes that remain once the leading bytes are done. */
	subl %ecx, %edx
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
	/* ecx = whole quadwords to copy, edx = trailing bytes (0..7). */
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	/* and-with-self only sets flags: ZF=1 when edx is zero. */
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
	/* Common exit; the fixup code jumps here with eax already set. */
.L_done:
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
T
Tony Luck 已提交
265 266

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 *
	 * The three labels below fall through into one another:
	 *  .E_read_words:     ecx holds remaining quadwords; convert to bytes
	 *  .E_leading_bytes:  add edx, the bytes belonging to later phases
	 *  .E_trailing_bytes: ecx already is the remaining byte count
	 * and the result is returned in eax through .L_done.
	 */
.E_read_words:
	shll	$3, %ecx
.E_leading_bytes:
	addl	%edx, %ecx
.E_trailing_bytes:
	mov	%ecx, %eax
	jmp	.L_done

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 *
	 * edx = remaining quadwords * 8 + trailing bytes, i.e. the number
	 * of bytes not yet copied, handed to mcsafe_handle_tail (defined
	 * outside this file).
	 */
.E_write_words:
	shll	$3, %ecx
	addl	%edx, %ecx
	movl	%ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	/*
	 * Exception table: each entry pairs the label of a load or store
	 * in __memcpy_mcsafe with the fixup label that handles a fault
	 * there.
	 */
	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
T
Tony Luck 已提交
299
#endif