From efa29e0a3f1be9336cdaaf5cb91eb7ce9ce1da64 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Wed, 24 Apr 2019 19:36:52 +0800
Subject: [PATCH] arm64: do_csum: implement accelerated scalar version

hulk inclusion
category: feature
feature: checksum performance
bugzilla: 13700
CVE: NA

--------------------------------------------------

It turns out that the IP checksumming code is still exercised often,
even though one might expect that modern NICs with checksum offload
have no use for it. However, as Lingyan points out, there are
combinations of features where the network stack may still fall back
to software checksumming, and so it makes sense to provide an
optimized implementation in software as well.

So provide an implementation of do_csum() in scalar assembler, which,
unlike C, gives direct access to the carry flag, making the code run
substantially faster. The routine uses overlapping 64 byte loads for
all input sizes > 64 bytes, in order to reduce the number of branches
and improve performance on cores with deep pipelines.

On Cortex-A57, this implementation is on par with Lingyan's NEON
implementation, and roughly 7x as fast as the generic C code.

Difference from Ard's original patch: add a validation check for len.

Cc: "huanglingyan (A)"
Signed-off-by: Ard Biesheuvel
Signed-off-by: Chen Zhou
Reviewed-by: Hanjun Guo
Signed-off-by: Yang Yingliang
---
 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   2 +-
 arch/arm64/lib/csum.S             | 135 ++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/csum.S

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..e906b956c1fc 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
 #include <asm-generic/checksum.h>
 
 #endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5df2d611b77d..56c50d82148c 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o	\
 		   copy_to_user.o copy_in_user.o copy_page.o	\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
-		   strchr.o strrchr.o tishift.o
+		   strchr.o strrchr.o tishift.o csum.o
 
 # Tell the compiler to treat all general purpose registers (with the
 # exception of the IP registers, which are already handled by the caller
diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
new file mode 100644
index 000000000000..8c93c39f7c76
--- /dev/null
+++ b/arch/arm64/lib/csum.S
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Linaro, Ltd.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+	// len is zero or negative
+	cmp	w1, wzr
+	b.le	out
+
+	adds	x2, xzr, xzr		// clear x2 and C flag
+
+	// 64 bytes at a time
+	lsr	x3, x1, #6
+	and	x1, x1, #63
+	cbz	x3, 1f
+
+	// Eight 64-bit adds per iteration
+0:	ldp	x4, x5, [x0], #64
+	ldp	x6, x7, [x0, #-48]
+	ldp	x8, x9, [x0, #-32]
+	ldp	x10, x11, [x0, #-16]
+	adcs	x2, x2, x4
+	sub	x3, x3, #1
+	adcs	x2, x2, x5
+	adcs	x2, x2, x6
+	adcs	x2, x2, x7
+	adcs	x2, x2, x8
+	adcs	x2, x2, x9
+	adcs	x2, x2, x10
+	adcs	x2, x2, x11
+	cbnz	x3, 0b
+	adc	x2, x2, xzr
+
+	cbz	x1, 7f
+	bic	x3, x1, #1
+	add	x12, x0, x1
+	add	x0, x0, x3
+	neg	x3, x3
+	add	x3, x3, #64
+	lsl	x3, x3, #3
+
+	// Handle remaining 63 bytes or less using an overlapping 64-byte load
+	// and a branchless code path to complete the calculation
+	ldp	x4, x5, [x0, #-64]
+	ldp	x6, x7, [x0, #-48]
+	ldp	x8, x9, [x0, #-32]
+	ldp	x10, x11, [x0, #-16]
+	ldrb	w12, [x12, #-1]
+
+	.irp	reg, x4, x5, x6, x7, x8, x9, x10, x11
+	cmp	x3, #64
+	csel	\reg, \reg, xzr, lt
+	ccmp	x3, xzr, #0, lt
+	csel	x13, x3, xzr, gt
+	sub	x3, x3, #64
+CPU_LE(	lsr	\reg, \reg, x13		)
+CPU_BE(	lsl	\reg, \reg, x13		)
+	.endr
+
+	adds	x2, x2, x4
+	adcs	x2, x2, x5
+	adcs	x2, x2, x6
+	adcs	x2, x2, x7
+	adcs	x2, x2, x8
+	adcs	x2, x2, x9
+	adcs	x2, x2, x10
+	adcs	x2, x2, x11
+	adc	x2, x2, xzr
+
+CPU_LE(	adds	x12, x2, x12		)
+CPU_BE(	adds	x12, x2, x12, lsl #8	)
+	adc	x12, x12, xzr
+	tst	x1, #1
+	csel	x2, x2, x12, eq
+
+7:	lsr	x1, x2, #32
+	adds	w2, w2, w1
+	adc	w2, w2, wzr
+
+	lsr	w1, w2, #16
+	uxth	w2, w2
+	add	w2, w2, w1
+
+	lsr	w1, w2, #16		// handle the carry by hand
+	add	w2, w2, w1
+
+	uxth	w0, w2
+	ret
+
+	// Handle 63 bytes or less
+1:	tbz	x1, #5, 2f
+	ldp	x4, x5, [x0], #32
+	ldp	x6, x7, [x0, #-16]
+	adds	x2, x2, x4
+	adcs	x2, x2, x5
+	adcs	x2, x2, x6
+	adcs	x2, x2, x7
+	adc	x2, x2, xzr
+
+2:	tbz	x1, #4, 3f
+	ldp	x4, x5, [x0], #16
+	adds	x2, x2, x4
+	adcs	x2, x2, x5
+	adc	x2, x2, xzr
+
+3:	tbz	x1, #3, 4f
+	ldr	x4, [x0], #8
+	adds	x2, x2, x4
+	adc	x2, x2, xzr
+
+4:	tbz	x1, #2, 5f
+	ldr	w4, [x0], #4
+	adds	x2, x2, x4
+	adc	x2, x2, xzr
+
+5:	tbz	x1, #1, 6f
+	ldrh	w4, [x0], #2
+	adds	x2, x2, x4
+	adc	x2, x2, xzr
+
+6:	tbz	x1, #0, 7b
+	ldrb	w4, [x0]
+CPU_LE(	adds	x2, x2, x4		)
+CPU_BE(	adds	x2, x2, x4, lsl #8	)
+	adc	x2, x2, xzr
+	b	7b
+
+out:
+	mov	w0, #0
+	ret
+ENDPROC(do_csum)
-- 
GitLab
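
A note for readers, not part of the patch itself: the assembly above computes
the 16-bit ones' complement sum of the buffer, keeping the running total in a
64-bit register and feeding the carry flag back in through the adcs/adc chain,
something plain C cannot express directly. The sketch below is an illustrative
reference model only: the name ref_do_csum() is made up, it is ordinary
userspace C rather than kernel code, it assumes a little-endian view of the
buffer, and it ignores the odd-start-address rotation that the generic
lib/checksum.c version performs. Its folding steps correspond to the epilogue
after label 7 in csum.S.

#include <stdint.h>

/* Illustrative reference model only -- little-endian buffer view assumed. */
unsigned int ref_do_csum(const unsigned char *buff, int len)
{
	uint64_t sum = 0;

	if (len <= 0)		/* mirrors the patch's "len is zero or negative" early out */
		return 0;

	/* Accumulate 16-bit words; the wide accumulator stands in for the
	 * adcs/adc carry chain used by the assembly. */
	while (len > 1) {
		sum += (uint16_t)(buff[0] | (buff[1] << 8));
		buff += 2;
		len -= 2;
	}
	if (len == 1)		/* trailing odd byte, as in the CPU_LE ldrb path */
		sum += buff[0];

	/* Fold 64 -> 32 -> 16 bits, adding the carries back in each time,
	 * which is what the code after label 7 in csum.S does. */
	sum = (sum & 0xffffffffULL) + (sum >> 32);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned int)sum;
}

As in the generic code, do_csum() returns the un-inverted partial sum; the
final ~ is applied later by csum_fold() in its callers.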