arm64/lib: improve CRC32 performance for deep pipelines

mainline inclusion from mainline-5.0 commit: efdb25efc7645b326cd5eb82be5feeabe167c24e category: perf bugzilla: 20886 CVE: NA lib/crc32test result: [root@localhost build]# rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [83170.153209] CPU7: use cycles 26243990 [83183.122137] CPU7: use cycles 26151290 [83309.691628] CPU7: use cycles 26122830 [83312.415559] CPU7: use cycles 26232600 [83313.191479] CPU8: use cycles 26082350 rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [ 1023.539931] CPU25: use cycles 12256730 [ 1024.850360] CPU24: use cycles 12249680 [ 1025.463622] CPU25: use cycles 12253330 [ 1025.862925] CPU25: use cycles 12269720 [ 1026.376038] CPU26: use cycles 12222480 Based on 13702: arm64/lib: improve CRC32 performance for deep pipelines crypto: arm64/crc32 - remove PMULL based CRC32 driver arm64/lib: add accelerated crc32 routines arm64: cpufeature: add feature for CRC32 instructions lib/crc32: make core crc32() routines weak so they can be overridden ---------------------------------------------- Improve the performance of the crc32() asm routines by getting rid of most of the branches and small sized loads on the common path. Instead, use a branchless code path involving overlapping 16 byte loads to process the first (length % 32) bytes, and process the remainder using a loop that processes 32 bytes at a time. Tested using the following test program: #include <stdlib.h> extern void crc32_le(unsigned short, char const*, int); int main(void) { static const char buf[4096]; srand(20181126); for (int i = 0; i < 100 * 1000 * 1000; i++) crc32_le(0, buf, rand() % 1024); return 0; } On Cortex-A53 and Cortex-A57, the performance regresses but only very slightly. On Cortex-A72 however, the performance improves from $ time ./crc32 real 0m10.149s user 0m10.149s sys 0m0.000s to $ time ./crc32 real 0m7.915s user 0m7.915s sys 0m0.000s Cc: Rui Sun <sunrui26@huawei.com> Signed-off-by: N Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: N Will Deacon <will.deacon@arm.com> Signed-off-by: N Xie XiuQi <xiexiuqi@huawei.com> Reviewed-by: N Hanjun Guo <guohanjun@huawei.com> Signed-off-by: N Yang Yingliang <yangyingliang@huawei.com>

arm64/lib: improve CRC32 performance for deep pipelines
mainline inclusion from mainline-5.0 commit: efdb25efc7645b326cd5eb82be5feeabe167c24e category: perf bugzilla: 20886 CVE: NA lib/crc32test result: [root@localhost build]# rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [83170.153209] CPU7: use cycles 26243990 [83183.122137] CPU7: use cycles 26151290 [83309.691628] CPU7: use cycles 26122830 [83312.415559] CPU7: use cycles 26232600 [83313.191479] CPU8: use cycles 26082350 rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [ 1023.539931] CPU25: use cycles 12256730 [ 1024.850360] CPU24: use cycles 12249680 [ 1025.463622] CPU25: use cycles 12253330 [ 1025.862925] CPU25: use cycles 12269720 [ 1026.376038] CPU26: use cycles 12222480 Based on 13702: arm64/lib: improve CRC32 performance for deep pipelines crypto: arm64/crc32 - remove PMULL based CRC32 driver arm64/lib: add accelerated crc32 routines arm64: cpufeature: add feature for CRC32 instructions lib/crc32: make core crc32() routines weak so they can be overridden ---------------------------------------------- Improve the performance of the crc32() asm routines by getting rid of most of the branches and small sized loads on the common path. Instead, use a branchless code path involving overlapping 16 byte loads to process the first (length % 32) bytes, and process the remainder using a loop that processes 32 bytes at a time. Tested using the following test program: #include <stdlib.h> extern void crc32_le(unsigned short, char const*, int); int main(void) { static const char buf[4096]; srand(20181126); for (int i = 0; i < 100 * 1000 * 1000; i++) crc32_le(0, buf, rand() % 1024); return 0; } On Cortex-A53 and Cortex-A57, the performance regresses but only very slightly. On Cortex-A72 however, the performance improves from $ time ./crc32 real 0m10.149s user 0m10.149s sys 0m0.000s to $ time ./crc32 real 0m7.915s user 0m7.915s sys 0m0.000s Cc: Rui Sun <sunrui26@huawei.com> Signed-off-by: N Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: N Will Deacon <will.deacon@arm.com> Signed-off-by: N Xie XiuQi <xiexiuqi@huawei.com> Reviewed-by: N Hanjun Guo <guohanjun@huawei.com> Signed-off-by: N Yang Yingliang <yangyingliang@huawei.com>
fce68364 · Ard Biesheuvel · Xie XiuQi · 0066ba62 · fce68364
隐藏空白更改
内联并排

Showing with 49 addition and 5 deletion

arch/arm64/lib/crc32.S arch/arm64/lib/crc32.S +49 -5

未找到文件。
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -15,15 +15,59 @@
 	.cpu		generic+crc

 	.macro		__crc32, c
-0:	subs		x2, x2, #16
-	b.mi		8f
-	ldp		x3, x4, [x1], #16
+	cmp		x2, #16
+	b.lt		8f			// less than 16 bytes
+
+	and		x7, x2, #0x1f
+	and		x2, x2, #~0x1f
+	cbz		x7, 32f			// multiple of 32 bytes
+
+	and		x8, x7, #0xf
+	ldp		x3, x4, [x1]
+	add		x8, x8, x1
+	add		x1, x1, x7
+	ldp		x5, x6, [x8]
 CPU_BE(	rev		x3, x3		)
 CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
+
+	tst		x7, #8
+	crc32\c\()x	w8, w0, x3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #4
+	lsr		x4, x3, #32
+	crc32\c\()w	w8, w0, w3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #2
+	lsr		w4, w3, #16
+	crc32\c\()h	w8, w0, w3
+	csel		w3, w3, w4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #1
+	crc32\c\()b	w8, w0, w3
+	csel		w0, w0, w8, eq
+	tst		x7, #16
+	crc32\c\()x	w8, w0, x5
+	crc32\c\()x	w8, w8, x6
+	csel		w0, w0, w8, eq
+	cbz		x2, 0f
+
+32:	ldp		x3, x4, [x1], #32
+	sub		x2, x2, #32
+	ldp		x5, x6, [x1, #-16]
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
 	crc32\c\()x	w0, w0, x3
 	crc32\c\()x	w0, w0, x4
-	b.ne		0b
-	ret
+	crc32\c\()x	w0, w0, x5
+	crc32\c\()x	w0, w0, x6
+	cbnz		x2, 32b
+0:	ret

 8:	tbz		x2, #3, 4f
 	ldr		x3, [x1], #8