arm64: lib: improve copy_page to deal with 128 bytes at a time

We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: N Will Deacon <will.deacon@arm.com> Tested-by: N Andrew Pinski <apinski@cavium.com> Signed-off-by: N Catalin Marinas <catalin.marinas@arm.com>

arm64: lib: improve copy_page to deal with 128 bytes at a time
We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: N Will Deacon <will.deacon@arm.com> Tested-by: N Andrew Pinski <apinski@cavium.com> Signed-off-by: N Catalin Marinas <catalin.marinas@arm.com>
223e23e8 · Will Deacon · Catalin Marinas · d5370f75 · 223e23e8
隐藏空白更改
内联并排

Showing with 38 addition and 8 deletion

arch/arm64/lib/copy_page.S arch/arm64/lib/copy_page.S +38 -8

未找到文件。
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -27,20 +27,50 @@
 *	x1 - src
 */
 ENTRY(copy_page)
-	/* Assume cache line size is 64 bytes. */
-	prfm	pldl1strm, [x1, #64]
-1:	ldp	x2, x3, [x1]
+	ldp	x2, x3, [x1]
 	ldp	x4, x5, [x1, #16]
 	ldp	x6, x7, [x1, #32]
 	ldp	x8, x9, [x1, #48]
-	add	x1, x1, #64
-	prfm	pldl1strm, [x1, #64]
+	ldp	x10, x11, [x1, #64]
+	ldp	x12, x13, [x1, #80]
+	ldp	x14, x15, [x1, #96]
+	ldp	x16, x17, [x1, #112]
+
+	mov	x18, #(PAGE_SIZE - 128)
+	add	x1, x1, #128
+1:
+	subs	x18, x18, #128
+
 	stnp	x2, x3, [x0]
+	ldp	x2, x3, [x1]
 	stnp	x4, x5, [x0, #16]
+	ldp	x4, x5, [x1, #16]
 	stnp	x6, x7, [x0, #32]
+	ldp	x6, x7, [x1, #32]
 	stnp	x8, x9, [x0, #48]
-	add	x0, x0, #64
-	tst	x1, #(PAGE_SIZE - 1)
-	b.ne	1b
+	ldp	x8, x9, [x1, #48]
+	stnp	x10, x11, [x0, #64]
+	ldp	x10, x11, [x1, #64]
+	stnp	x12, x13, [x0, #80]
+	ldp	x12, x13, [x1, #80]
+	stnp	x14, x15, [x0, #96]
+	ldp	x14, x15, [x1, #96]
+	stnp	x16, x17, [x0, #112]
+	ldp	x16, x17, [x1, #112]
+
+	add	x0, x0, #128
+	add	x1, x1, #128
+
+	b.gt	1b
+
+	stnp	x2, x3, [x0]
+	stnp	x4, x5, [x0, #16]
+	stnp	x6, x7, [x0, #32]
+	stnp	x8, x9, [x0, #48]
+	stnp	x10, x11, [x0, #64]
+	stnp	x12, x13, [x0, #80]
+	stnp	x14, x15, [x0, #96]
+	stnp	x16, x17, [x0, #112]
+
 	ret
 ENDPROC(copy_page)