From fce683641b9a21ac35661079993a3ccd323bf3f1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel <ard.biesheuvel@linaro.org> Date: Sat, 31 Aug 2019 09:52:11 +0800 Subject: [PATCH] arm64/lib: improve CRC32 performance for deep pipelines mainline inclusion from mainline-5.0 commit: efdb25efc7645b326cd5eb82be5feeabe167c24e category: perf bugzilla: 20886 CVE: NA lib/crc32test result: [root@localhost build]# rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [83170.153209] CPU7: use cycles 26243990 [83183.122137] CPU7: use cycles 26151290 [83309.691628] CPU7: use cycles 26122830 [83312.415559] CPU7: use cycles 26232600 [83313.191479] CPU8: use cycles 26082350 rmmod crc32test && insmod lib/crc32test.ko && dmesg | grep cycles [ 1023.539931] CPU25: use cycles 12256730 [ 1024.850360] CPU24: use cycles 12249680 [ 1025.463622] CPU25: use cycles 12253330 [ 1025.862925] CPU25: use cycles 12269720 [ 1026.376038] CPU26: use cycles 12222480 Based on 13702: arm64/lib: improve CRC32 performance for deep pipelines crypto: arm64/crc32 - remove PMULL based CRC32 driver arm64/lib: add accelerated crc32 routines arm64: cpufeature: add feature for CRC32 instructions lib/crc32: make core crc32() routines weak so they can be overridden ---------------------------------------------- Improve the performance of the crc32() asm routines by getting rid of most of the branches and small sized loads on the common path. Instead, use a branchless code path involving overlapping 16 byte loads to process the first (length % 32) bytes, and process the remainder using a loop that processes 32 bytes at a time. Tested using the following test program: #include <stdlib.h> extern void crc32_le(unsigned short, char const*, int); int main(void) { static const char buf[4096]; srand(20181126); for (int i = 0; i < 100 * 1000 * 1000; i++) crc32_le(0, buf, rand() % 1024); return 0; } On Cortex-A53 and Cortex-A57, the performance regresses but only very slightly. 
On Cortex-A72 however, the performance improves from $ time ./crc32 real 0m10.149s user 0m10.149s sys 0m0.000s to $ time ./crc32 real 0m7.915s user 0m7.915s sys 0m0.000s Cc: Rui Sun Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Xie XiuQi Reviewed-by: Hanjun Guo Signed-off-by: Yang Yingliang --- arch/arm64/lib/crc32.S | 54 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 5bc1e85b4e1c..f132f2a7522e 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -15,15 +15,59 @@ .cpu generic+crc .macro __crc32, c -0: subs x2, x2, #16 - b.mi 8f - ldp x3, x4, [x1], #16 + cmp x2, #16 + b.lt 8f // less than 16 bytes + + and x7, x2, #0x1f + and x2, x2, #~0x1f + cbz x7, 32f // multiple of 32 bytes + + and x8, x7, #0xf + ldp x3, x4, [x1] + add x8, x8, x1 + add x1, x1, x7 + ldp x5, x6, [x8] CPU_BE( rev x3, x3 ) CPU_BE( rev x4, x4 ) +CPU_BE( rev x5, x5 ) +CPU_BE( rev x6, x6 ) + + tst x7, #8 + crc32\c\()x w8, w0, x3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #4 + lsr x4, x3, #32 + crc32\c\()w w8, w0, w3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #2 + lsr w4, w3, #16 + crc32\c\()h w8, w0, w3 + csel w3, w3, w4, eq + csel w0, w0, w8, eq + tst x7, #1 + crc32\c\()b w8, w0, w3 + csel w0, w0, w8, eq + tst x7, #16 + crc32\c\()x w8, w0, x5 + crc32\c\()x w8, w8, x6 + csel w0, w0, w8, eq + cbz x2, 0f + +32: ldp x3, x4, [x1], #32 + sub x2, x2, #32 + ldp x5, x6, [x1, #-16] +CPU_BE( rev x3, x3 ) +CPU_BE( rev x4, x4 ) +CPU_BE( rev x5, x5 ) +CPU_BE( rev x6, x6 ) crc32\c\()x w0, w0, x3 crc32\c\()x w0, w0, x4 - b.ne 0b - ret + crc32\c\()x w0, w0, x5 + crc32\c\()x w0, w0, x6 + cbnz x2, 32b +0: ret 8: tbz x2, #3, 4f ldr x3, [x1], #8 -- GitLab