| Message ID | 20190218230842.11448-1-ard.biesheuvel@linaro.org (mailing list archive) |
|---|---|
| State | New, archived |
| Series | arm64: do_csum: implement accelerated scalar version |
On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote: > It turns out that the IP checksumming code is still exercised often, > even though one might expect that modern NICs with checksum offload > have no use for it. However, as Lingyan points out, there are > combinations of features where the network stack may still fall back > to software checksumming, and so it makes sense to provide an > optimized implementation in software as well. > > So provide an implementation of do_csum() in scalar assembler, which, > unlike C, gives direct access to the carry flag, making the code run > substantially faster. The routine uses overlapping 64 byte loads for > all input size > 64 bytes, in order to reduce the number of branches > and improve performance on cores with deep pipelines. > > On Cortex-A57, this implementation is on par with Lingyan's NEON > implementation, and roughly 7x as fast as the generic C code. > > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > --- > Test code after the patch. > > arch/arm64/include/asm/checksum.h | 3 + > arch/arm64/lib/Makefile | 2 +- > arch/arm64/lib/csum.S | 127 ++++++++++++++++++++ > 3 files changed, 131 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h > index 0b6f5a7d4027..e906b956c1fc 100644 > --- a/arch/arm64/include/asm/checksum.h > +++ b/arch/arm64/include/asm/checksum.h > @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) > } > #define ip_fast_csum ip_fast_csum > > +extern unsigned int do_csum(const unsigned char *buff, int len); > +#define do_csum do_csum > + > #include <asm-generic/checksum.h> > > #endif /* __ASM_CHECKSUM_H */ > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile > index 5540a1638baf..a7606007a749 100644 > --- a/arch/arm64/lib/Makefile > +++ b/arch/arm64/lib/Makefile > @@ -3,7 +3,7 @@ lib-y := clear_user.o delay.o copy_from_user.o \ > copy_to_user.o copy_in_user.o copy_page.o \ > clear_page.o memchr.o memcpy.o memmove.o memset.o \ > memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ > - strchr.o strrchr.o tishift.o > + strchr.o strrchr.o tishift.o csum.o > > ifeq ($(CONFIG_KERNEL_MODE_NEON), y) > obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o > diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S > new file mode 100644 > index 000000000000..534e2ebdc426 > --- /dev/null > +++ b/arch/arm64/lib/csum.S > @@ -0,0 +1,127 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> > + */ > + > +#include <linux/linkage.h> > +#include <asm/assembler.h> > + > +ENTRY(do_csum) > + adds x2, xzr, xzr // clear x2 and C flag > + > + // 64 bytes at a time > + lsr x3, x1, #6 > + and x1, x1, #63 > + cbz x3, 1f > + > + // Eight 64-bit adds per iteration > +0: ldp x4, x5, [x0], #64 > + ldp x6, x7, [x0, #-48] > + ldp x8, x9, [x0, #-32] > + ldp x10, x11, [x0, #-16] > + adcs x2, x2, x4 > + sub x3, x3, #1 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adcs x2, x2, x8 > + adcs x2, x2, x9 > + adcs x2, x2, x10 > + adcs x2, x2, x11 > + cbnz x3, 0b > + adc x2, x2, xzr > + > + cbz x1, 7f > + bic x3, x1, #1 > + add x12, x0, x1 > + add x0, x0, x3 > + neg x3, x3 > + add x3, x3, #64 > + lsl x3, x3, #3 > + > + // Handle remaining 63 bytes or less using an overlapping 64-byte load > + // and a branchless code path to complete the calculation > + ldp x4, x5, [x0, #-64] > + ldp x6, x7, [x0, #-48] > + ldp x8, x9, [x0, #-32] > + ldp x10, x11, [x0, #-16] > + ldrb w12, [x12, #-1] > + > + .irp reg, x4, x5, x6, x7, x8, x9, x10, x11 > + cmp x3, #64 > + csel \reg, \reg, xzr, lt > + ccmp x3, xzr, #0, lt > + csel x13, x3, xzr, gt > + sub x3, x3, #64 > +CPU_LE( lsr \reg, \reg, x13 ) > +CPU_BE( lsl \reg, \reg, x13 ) > + .endr > + > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adcs x2, x2, x8 > + adcs x2, x2, x9 > + adcs x2, x2, x10 > + adcs x2, x2, x11 > + adc x2, x2, xzr > + > +CPU_LE( adds x12, x2, x12 ) > +CPU_BE( adds x12, x2, x12, lsl #8 ) > + adc x12, x12, xzr > + tst x1, #1 > + csel x2, x2, x12, eq > + > +7: lsr x1, x2, #32 > + adds w2, w2, w1 > + adc w2, w2, wzr > + > + lsr w1, w2, #16 > + uxth w2, w2 > + add w2, w2, w1 > + > + lsr w1, w2, #16 // handle the carry by hand > + add w2, w2, w1 > + > + uxth w0, w2 > + ret > + > + // Handle 63 bytes or less > +1: tbz x1, #5, 2f > + ldp x4, x5, [x0], #32 > + ldp x6, x7, [x0, #-16] > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adc x2, x2, xzr > + > +2: tbz x1, #4, 3f > + ldp x4, x5, [x0], #16 > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adc x2, x2, xzr > + > +3: tbz x1, #3, 4f > + ldr x4, [x0], #8 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +4: tbz x1, #2, 5f > + ldr w4, [x0], #4 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +5: tbz x1, #1, 6f > + ldrh w4, [x0], #2 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +6: tbz x1, #0, 7b > + ldrb w4, [x0] > +CPU_LE( adds x2, x2, x4 ) > +CPU_BE( adds x2, x2, x4, lsl #8 ) > + adc x2, x2, xzr > + b 7b > +ENDPROC(do_csum) > -- > 2.20.1 > > diff --git a/lib/checksum.c b/lib/checksum.c > index d3ec93f9e5f3..7711f1186f71 100644 > --- a/lib/checksum.c > +++ b/lib/checksum.c > @@ -37,7 +37,7 @@ > > #include <asm/byteorder.h> > > -#ifndef do_csum > +#if 1 //ndef do_csum > static inline unsigned short from32to16(unsigned int x) > { > /* add up 16-bit and 16-bit for 16+c bit */ > @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x) > return x; > } > > -static unsigned int do_csum(const unsigned char *buff, int len) > +static unsigned int __do_csum(const unsigned char *buff, int len) > { > int odd; > unsigned int result = 0; > @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, > } > EXPORT_SYMBOL(csum_tcpudp_nofold); > #endif > + > +extern u8 crypto_ft_tab[]; > + > +static int __init do_selftest(void) > +{ > + int i, j; > + u16 c1, c2; > + > + for (i = 0; i < 1024; i++) { > + for (j = i + 1; j <= 1024; j++) { > + c1 = __do_csum(crypto_ft_tab + i, j - i); > + c2 = 
do_csum(crypto_ft_tab + i, j - i); > + > + if (c1 != c2) > + pr_err("######### %d %d %x %x\n", i, j, c1, c2); > + } > + } > + return 0; > +} > +late_initcall(do_selftest); Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
(+ Catalin) On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote: > > On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote: > > It turns out that the IP checksumming code is still exercised often, > > even though one might expect that modern NICs with checksum offload > > have no use for it. However, as Lingyan points out, there are > > combinations of features where the network stack may still fall back > > to software checksumming, and so it makes sense to provide an > > optimized implementation in software as well. > > > > So provide an implementation of do_csum() in scalar assembler, which, > > unlike C, gives direct access to the carry flag, making the code run > > substantially faster. The routine uses overlapping 64 byte loads for > > all input size > 64 bytes, in order to reduce the number of branches > > and improve performance on cores with deep pipelines. > > > > On Cortex-A57, this implementation is on par with Lingyan's NEON > > implementation, and roughly 7x as fast as the generic C code. > > > > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> > > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> ... > > Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Full patch here https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/ This was a follow-up to some discussions about Lingyan's NEON code, CC'ed to netdev@ so people could chime in as to whether we need accelerated checksumming code in the first place.
Hi Ard, On 28/02/2019 14:16, Ard Biesheuvel wrote: > (+ Catalin) > > On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas > <ilias.apalodimas@linaro.org> wrote: >> >> On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote: >>> It turns out that the IP checksumming code is still exercised often, >>> even though one might expect that modern NICs with checksum offload >>> have no use for it. However, as Lingyan points out, there are >>> combinations of features where the network stack may still fall back >>> to software checksumming, and so it makes sense to provide an >>> optimized implementation in software as well. >>> >>> So provide an implementation of do_csum() in scalar assembler, which, >>> unlike C, gives direct access to the carry flag, making the code run >>> substantially faster. The routine uses overlapping 64 byte loads for >>> all input size > 64 bytes, in order to reduce the number of branches >>> and improve performance on cores with deep pipelines. >>> >>> On Cortex-A57, this implementation is on par with Lingyan's NEON >>> implementation, and roughly 7x as fast as the generic C code. >>> >>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> >>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > ... >> >> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> > > Full patch here > > https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/ > > This was a follow-up to some discussions about Lingyan's NEON code, > CC'ed to netdev@ so people could chime in as to whether we need > accelerated checksumming code in the first place. FWIW ever since we did ip_fast_csum() I've been meaning to see how well I can do with a similar tweaked C implementation for this (mostly for fun). Since I've recently dug out my RK3328 box for other reasons I'll give this a test - that's a weedy little quad-A53 whose GbE hardware checksumming is slightly busted and has to be turned off, so the do_csum() overhead under heavy network load is comparatively massive. (plus it's non-EFI so I should be able to try big-endian easily too) The asm looks pretty reasonable to me - instinct says there's *possibly* some value for out-of-order cores in doing the 8-way accumulations in a more pairwise fashion, but I guess either way the carry flag dependency is going to dominate, so it may well be moot. What may be more worthwhile is taking the effort to align the source pointer, at least for larger inputs, so as to be kinder to little cores - according to its optimisation guide, A55 is fairly sensitive to unaligned loads, so I'd assume that's true of its older/smaller friends too. I'll see what I can measure in practice - until proven otherwise I'd have no great objection to merging this patch as-is if the need is real. Improvements can always come later :) Robin.
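To make the "more pairwise fashion" idea concrete, a purely illustrative C sketch (not code from the thread, and not a drop-in replacement for the asm) is shown below: the eight values of a 64-byte block are folded through two independent ones'-complement accumulators, so an out-of-order core can overlap two add chains instead of serialising on a single carry dependency. The helper names are invented for the sketch.

```c
#include <stdint.h>

/* Ones'-complement accumulate: add in 128 bits, then fold the carry word
 * back in. The ones'-complement sum is order-insensitive, which is what
 * makes splitting the chain legal in the first place. */
static inline uint64_t acc64(uint64_t sum, uint64_t data)
{
	unsigned __int128 t = (unsigned __int128)sum + data;

	return (uint64_t)(t + (t >> 64));
}

/* Illustration only: one 64-byte block summed via two independent chains,
 * with the two partial sums combined at the end. */
static uint64_t csum64_block_pairwise(uint64_t sum, const uint64_t p[8])
{
	uint64_t s0 = sum, s1 = 0;

	s0 = acc64(s0, p[0]);  s1 = acc64(s1, p[1]);
	s0 = acc64(s0, p[2]);  s1 = acc64(s1, p[3]);
	s0 = acc64(s0, p[4]);  s1 = acc64(s1, p[5]);
	s0 = acc64(s0, p[6]);  s1 = acc64(s1, p[7]);

	return acc64(s0, s1);
}
```

Whether the extra accumulator pays off in practice is exactly the sort of question the measurements discussed in this thread would have to settle.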
On Thu, 28 Feb 2019 at 16:14, Robin Murphy <robin.murphy@arm.com> wrote: > > Hi Ard, > > On 28/02/2019 14:16, Ard Biesheuvel wrote: > > (+ Catalin) > > > > On Tue, 19 Feb 2019 at 16:08, Ilias Apalodimas > > <ilias.apalodimas@linaro.org> wrote: > >> > >> On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote: > >>> It turns out that the IP checksumming code is still exercised often, > >>> even though one might expect that modern NICs with checksum offload > >>> have no use for it. However, as Lingyan points out, there are > >>> combinations of features where the network stack may still fall back > >>> to software checksumming, and so it makes sense to provide an > >>> optimized implementation in software as well. > >>> > >>> So provide an implementation of do_csum() in scalar assembler, which, > >>> unlike C, gives direct access to the carry flag, making the code run > >>> substantially faster. The routine uses overlapping 64 byte loads for > >>> all input size > 64 bytes, in order to reduce the number of branches > >>> and improve performance on cores with deep pipelines. > >>> > >>> On Cortex-A57, this implementation is on par with Lingyan's NEON > >>> implementation, and roughly 7x as fast as the generic C code. > >>> > >>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> > >>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > > ... > >> > >> Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> > > > > Full patch here > > > > https://lore.kernel.org/linux-arm-kernel/20190218230842.11448-1-ard.biesheuvel@linaro.org/ > > > > This was a follow-up to some discussions about Lingyan's NEON code, > > CC'ed to netdev@ so people could chime in as to whether we need > > accelerated checksumming code in the first place. Thanks for taking a look. > FWIW ever since we did ip_fast_csum() I've been meaning to see how well > I can do with a similar tweaked C implementation for this (mostly for > fun). Since I've recently dug out my RK3328 box for other reasons I'll > give this a test - that's a weedy little quad-A53 whose GbE hardware > checksumming is slightly busted and has to be turned off, so the > do_csum() overhead under heavy network load is comparatively massive. > (plus it's non-EFI so I should be able to try big-endian easily too) > Yes please. I've been meaning to run this on A72 myself, but ever since my MacchiatoBin self-combusted, I've been relying on AWS for this, which is a bit finicky. As for the C implementation, not having access to the carry flag is pretty limiting, so I wonder how you intend to get around that. > The asm looks pretty reasonable to me - instinct says there's *possibly* > some value for out-of-order cores in doing the 8-way accumulations in a > more pairwise fashion, but I guess either way the carry flag dependency > is going to dominate, so it may well be moot. Yes. In fact, I was surprised the speedup is as dramatic as it is despite of this, but I guess they optimize for this rather well at the uarch level. > What may be more > worthwhile is taking the effort to align the source pointer, at least > for larger inputs, so as to be kinder to little cores - according to its > optimisation guide, A55 is fairly sensitive to unaligned loads, so I'd > assume that's true of its older/smaller friends too. I'll see what I can > measure in practice - until proven otherwise I'd have no great objection > to merging this patch as-is if the need is real. Improvements can always > come later :) > Good point re alignment, I didn't consider that at all tbh. 
I'll let the maintainers decide whether/when to merge this. I don't feel strongly either way.
Hi maintainers and Ard, Any update on it? Thanks, Shaokun On 2019/2/19 7:08, Ard Biesheuvel wrote: > It turns out that the IP checksumming code is still exercised often, > even though one might expect that modern NICs with checksum offload > have no use for it. However, as Lingyan points out, there are > combinations of features where the network stack may still fall back > to software checksumming, and so it makes sense to provide an > optimized implementation in software as well. > > So provide an implementation of do_csum() in scalar assembler, which, > unlike C, gives direct access to the carry flag, making the code run > substantially faster. The routine uses overlapping 64 byte loads for > all input size > 64 bytes, in order to reduce the number of branches > and improve performance on cores with deep pipelines. > > On Cortex-A57, this implementation is on par with Lingyan's NEON > implementation, and roughly 7x as fast as the generic C code. > > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > --- > Test code after the patch. > > arch/arm64/include/asm/checksum.h | 3 + > arch/arm64/lib/Makefile | 2 +- > arch/arm64/lib/csum.S | 127 ++++++++++++++++++++ > 3 files changed, 131 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h > index 0b6f5a7d4027..e906b956c1fc 100644 > --- a/arch/arm64/include/asm/checksum.h > +++ b/arch/arm64/include/asm/checksum.h > @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) > } > #define ip_fast_csum ip_fast_csum > > +extern unsigned int do_csum(const unsigned char *buff, int len); > +#define do_csum do_csum > + > #include <asm-generic/checksum.h> > > #endif /* __ASM_CHECKSUM_H */ > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile > index 5540a1638baf..a7606007a749 100644 > --- a/arch/arm64/lib/Makefile > +++ b/arch/arm64/lib/Makefile > @@ -3,7 +3,7 @@ lib-y := clear_user.o delay.o copy_from_user.o \ > copy_to_user.o copy_in_user.o copy_page.o \ > clear_page.o memchr.o memcpy.o memmove.o memset.o \ > memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ > - strchr.o strrchr.o tishift.o > + strchr.o strrchr.o tishift.o csum.o > > ifeq ($(CONFIG_KERNEL_MODE_NEON), y) > obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o > diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S > new file mode 100644 > index 000000000000..534e2ebdc426 > --- /dev/null > +++ b/arch/arm64/lib/csum.S > @@ -0,0 +1,127 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (C) 2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> > + */ > + > +#include <linux/linkage.h> > +#include <asm/assembler.h> > + > +ENTRY(do_csum) > + adds x2, xzr, xzr // clear x2 and C flag > + > + // 64 bytes at a time > + lsr x3, x1, #6 > + and x1, x1, #63 > + cbz x3, 1f > + > + // Eight 64-bit adds per iteration > +0: ldp x4, x5, [x0], #64 > + ldp x6, x7, [x0, #-48] > + ldp x8, x9, [x0, #-32] > + ldp x10, x11, [x0, #-16] > + adcs x2, x2, x4 > + sub x3, x3, #1 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adcs x2, x2, x8 > + adcs x2, x2, x9 > + adcs x2, x2, x10 > + adcs x2, x2, x11 > + cbnz x3, 0b > + adc x2, x2, xzr > + > + cbz x1, 7f > + bic x3, x1, #1 > + add x12, x0, x1 > + add x0, x0, x3 > + neg x3, x3 > + add x3, x3, #64 > + lsl x3, x3, #3 > + > + // Handle remaining 63 bytes or less using an overlapping 64-byte load > + // and a branchless code path to complete the calculation > + ldp x4, x5, [x0, #-64] > + ldp x6, x7, [x0, #-48] > + ldp x8, x9, [x0, #-32] > + ldp x10, x11, [x0, #-16] > + ldrb w12, [x12, #-1] > + > + .irp reg, x4, x5, x6, x7, x8, x9, x10, x11 > + cmp x3, #64 > + csel \reg, \reg, xzr, lt > + ccmp x3, xzr, #0, lt > + csel x13, x3, xzr, gt > + sub x3, x3, #64 > +CPU_LE( lsr \reg, \reg, x13 ) > +CPU_BE( lsl \reg, \reg, x13 ) > + .endr > + > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adcs x2, x2, x8 > + adcs x2, x2, x9 > + adcs x2, x2, x10 > + adcs x2, x2, x11 > + adc x2, x2, xzr > + > +CPU_LE( adds x12, x2, x12 ) > +CPU_BE( adds x12, x2, x12, lsl #8 ) > + adc x12, x12, xzr > + tst x1, #1 > + csel x2, x2, x12, eq > + > +7: lsr x1, x2, #32 > + adds w2, w2, w1 > + adc w2, w2, wzr > + > + lsr w1, w2, #16 > + uxth w2, w2 > + add w2, w2, w1 > + > + lsr w1, w2, #16 // handle the carry by hand > + add w2, w2, w1 > + > + uxth w0, w2 > + ret > + > + // Handle 63 bytes or less > +1: tbz x1, #5, 2f > + ldp x4, x5, [x0], #32 > + ldp x6, x7, [x0, #-16] > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adcs x2, x2, x6 > + adcs x2, x2, x7 > + adc x2, x2, xzr > + > +2: tbz x1, #4, 3f > + ldp x4, x5, [x0], #16 > + adds x2, x2, x4 > + adcs x2, x2, x5 > + adc x2, x2, xzr > + > +3: tbz x1, #3, 4f > + ldr x4, [x0], #8 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +4: tbz x1, #2, 5f > + ldr w4, [x0], #4 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +5: tbz x1, #1, 6f > + ldrh w4, [x0], #2 > + adds x2, x2, x4 > + adc x2, x2, xzr > + > +6: tbz x1, #0, 7b > + ldrb w4, [x0] > +CPU_LE( adds x2, x2, x4 ) > +CPU_BE( adds x2, x2, x4, lsl #8 ) > + adc x2, x2, xzr > + b 7b > +ENDPROC(do_csum) >
On Fri, Apr 12, 2019 at 10:31:16AM +0800, Zhangshaokun wrote: > On 2019/2/19 7:08, Ard Biesheuvel wrote: > > It turns out that the IP checksumming code is still exercised often, > > even though one might expect that modern NICs with checksum offload > > have no use for it. However, as Lingyan points out, there are > > combinations of features where the network stack may still fall back > > to software checksumming, and so it makes sense to provide an > > optimized implementation in software as well. > > > > So provide an implementation of do_csum() in scalar assembler, which, > > unlike C, gives direct access to the carry flag, making the code run > > substantially faster. The routine uses overlapping 64 byte loads for > > all input size > 64 bytes, in order to reduce the number of branches > > and improve performance on cores with deep pipelines. > > > > On Cortex-A57, this implementation is on par with Lingyan's NEON > > implementation, and roughly 7x as fast as the generic C code. > > > > Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> > > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> > > --- > > Test code after the patch. > > Hi maintainers and Ard, > > Any update on it? I'm waiting for Robin to come back with numbers for a C implementation. Robin -- did you get anywhere with that? Will
On 12/04/2019 10:52, Will Deacon wrote: > On Fri, Apr 12, 2019 at 10:31:16AM +0800, Zhangshaokun wrote: >> On 2019/2/19 7:08, Ard Biesheuvel wrote: >>> It turns out that the IP checksumming code is still exercised often, >>> even though one might expect that modern NICs with checksum offload >>> have no use for it. However, as Lingyan points out, there are >>> combinations of features where the network stack may still fall back >>> to software checksumming, and so it makes sense to provide an >>> optimized implementation in software as well. >>> >>> So provide an implementation of do_csum() in scalar assembler, which, >>> unlike C, gives direct access to the carry flag, making the code run >>> substantially faster. The routine uses overlapping 64 byte loads for >>> all input size > 64 bytes, in order to reduce the number of branches >>> and improve performance on cores with deep pipelines. >>> >>> On Cortex-A57, this implementation is on par with Lingyan's NEON >>> implementation, and roughly 7x as fast as the generic C code. >>> >>> Cc: "huanglingyan (A)" <huanglingyan2@huawei.com> >>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> >>> --- >>> Test code after the patch. >> >> Hi maintainers and Ard, >> >> Any update on it? > > I'm waiting for Robin to come back with numbers for a C implementation. > > Robin -- did you get anywhere with that? Still not what I would call finished, but where I've got so far (besides an increasingly elaborate test rig) is as below - it still wants some unrolling in the middle to really fly (and actual testing on BE), but the worst-case performance already equals or just beats this asm version on Cortex-A53 with GCC 7 (by virtue of being alignment-insensitive and branchless except for the loop). Unfortunately, the advantage of C code being instrumentable does also come around to bite me... Robin. ----->8----- /* Looks dumb, but generates nice-ish code */ static u64 accumulate(u64 sum, u64 data) { __uint128_t tmp = (__uint128_t)sum + data; return tmp + (tmp >> 64); } unsigned int do_csum_c(const unsigned char *buff, int len) { unsigned int offset, shift, sum, count; u64 data, *ptr; u64 sum64 = 0; offset = (unsigned long)buff & 0x7; /* * This is to all intents and purposes safe, since rounding down cannot * result in a different page or cache line being accessed, and @buff * should absolutely not be pointing to anything read-sensitive. * It does, however, piss off KASAN... */ ptr = (u64 *)(buff - offset); shift = offset * 8; /* * Head: zero out any excess leading bytes. Shifting back by the same * amount should be at least as fast as any other way of handling the * odd/even alignment, and means we can ignore it until the very end. */ data = *ptr++; #ifdef __LITTLE_ENDIAN data = (data >> shift) << shift; #else data = (data << shift) >> shift; #endif count = 8 - offset; /* Body: straightforward aligned loads from here on... */ //TODO: fancy stuff with larger strides and uint128s? while(len > count) { sum64 = accumulate(sum64, data); data = *ptr++; count += 8; } /* * Tail: zero any over-read bytes similarly to the head, again * preserving odd/even alignment. */ shift = (count - len) * 8; #ifdef __LITTLE_ENDIAN data = (data << shift) >> shift; #else data = (data >> shift) << shift; #endif sum64 = accumulate(sum64, data); /* Finally, folding */ sum64 += (sum64 >> 32) | (sum64 << 32); sum = sum64 >> 32; sum += (sum >> 16) | (sum << 16); if (offset & 1) return (u16)swab32(sum); return sum >> 16; }
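For anyone wanting to sanity-check a candidate such as do_csum_c above, a minimal user-space harness in the spirit of the selftest posted earlier in the thread might look like the sketch below. It assumes a little-endian host, assumes do_csum_c has been pasted in above it (together with the u64 typedef and __LITTLE_ENDIAN definition it expects), and the reference routine and buffer pattern are invented here.

```c
#include <assert.h>
#include <stdio.h>

/* Naive byte-at-a-time reference: on a little-endian host, bytes at even
 * offsets form the low half of each 16-bit word and bytes at odd offsets
 * the high half; the final loop folds the wide sum down to 16 bits. */
static unsigned int ref_csum(const unsigned char *buff, int len)
{
	unsigned long long sum = 0;
	int i;

	for (i = 0; i < len; i++)
		sum += (i & 1) ? (unsigned int)buff[i] << 8 : buff[i];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned int)sum;
}

int main(void)
{
	static unsigned char buf[1024 + 64];
	unsigned int i;
	int off, len;

	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char)(i * 31 + 7);	/* arbitrary pattern */

	/* sweep start alignment and length to exercise the head/tail paths */
	for (off = 0; off < 16; off++)
		for (len = 1; len <= 1024; len++)
			assert(do_csum_c(buf + off, len) ==
			       ref_csum(buf + off, len));

	puts("ok");
	return 0;
}
```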
On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: > On 12/04/2019 10:52, Will Deacon wrote: > > I'm waiting for Robin to come back with numbers for a C implementation. > > > > Robin -- did you get anywhere with that? > > Still not what I would call finished, but where I've got so far (besides an > increasingly elaborate test rig) is as below - it still wants some unrolling > in the middle to really fly (and actual testing on BE), but the worst-case > performance already equals or just beats this asm version on Cortex-A53 with > GCC 7 (by virtue of being alignment-insensitive and branchless except for > the loop). Unfortunately, the advantage of C code being instrumentable does > also come around to bite me... Is there any interest from anybody in spinning a proper patch out of this? Shaokun? Will > /* Looks dumb, but generates nice-ish code */ > static u64 accumulate(u64 sum, u64 data) > { > __uint128_t tmp = (__uint128_t)sum + data; > return tmp + (tmp >> 64); > } > > unsigned int do_csum_c(const unsigned char *buff, int len) > { > unsigned int offset, shift, sum, count; > u64 data, *ptr; > u64 sum64 = 0; > > offset = (unsigned long)buff & 0x7; > /* > * This is to all intents and purposes safe, since rounding down cannot > * result in a different page or cache line being accessed, and @buff > * should absolutely not be pointing to anything read-sensitive. > * It does, however, piss off KASAN... > */ > ptr = (u64 *)(buff - offset); > shift = offset * 8; > > /* > * Head: zero out any excess leading bytes. Shifting back by the same > * amount should be at least as fast as any other way of handling the > * odd/even alignment, and means we can ignore it until the very end. > */ > data = *ptr++; > #ifdef __LITTLE_ENDIAN > data = (data >> shift) << shift; > #else > data = (data << shift) >> shift; > #endif > count = 8 - offset; > > /* Body: straightforward aligned loads from here on... */ > //TODO: fancy stuff with larger strides and uint128s? > while(len > count) { > sum64 = accumulate(sum64, data); > data = *ptr++; > count += 8; > } > /* > * Tail: zero any over-read bytes similarly to the head, again > * preserving odd/even alignment. > */ > shift = (count - len) * 8; > #ifdef __LITTLE_ENDIAN > data = (data << shift) >> shift; > #else > data = (data >> shift) << shift; > #endif > sum64 = accumulate(sum64, data); > > /* Finally, folding */ > sum64 += (sum64 >> 32) | (sum64 << 32); > sum = sum64 >> 32; > sum += (sum >> 16) | (sum << 16); > if (offset & 1) > return (u16)swab32(sum); > > return sum >> 16; > }
... > > ptr = (u64 *)(buff - offset); > > shift = offset * 8; > > > > /* > > * Head: zero out any excess leading bytes. Shifting back by the same > > * amount should be at least as fast as any other way of handling the > > * odd/even alignment, and means we can ignore it until the very end. > > */ > > data = *ptr++; > > #ifdef __LITTLE_ENDIAN > > data = (data >> shift) << shift; > > #else > > data = (data << shift) >> shift; > > #endif I suspect that #ifdef __LITTLE_ENDIAN data &= ~0ull << shift; #else data &= ~0ull >> shift; #endif is likely to be better. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 15/05/2019 11:15, David Laight wrote: > ... >>> ptr = (u64 *)(buff - offset); >>> shift = offset * 8; >>> >>> /* >>> * Head: zero out any excess leading bytes. Shifting back by the same >>> * amount should be at least as fast as any other way of handling the >>> * odd/even alignment, and means we can ignore it until the very end. >>> */ >>> data = *ptr++; >>> #ifdef __LITTLE_ENDIAN >>> data = (data >> shift) << shift; >>> #else >>> data = (data << shift) >> shift; >>> #endif > > I suspect that > #ifdef __LITTLE_ENDIAN > data &= ~0ull << shift; > #else > data &= ~0ull >> shift; > #endif > is likely to be better. Out of interest, better in which respects? For the A64 ISA at least, that would take 3 instructions plus an additional scratch register, e.g.: MOV x2, #~0 LSL x2, x2, x1 AND x0, x0, x1 (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that will typically take as many cycles if not more than just pipelining the two 'simple' ALU instructions) Whereas the original is just two shift instruction in-place. LSR x0, x0, x1 LSL x0, x0, x1 If the operation were repeated, the constant generation could certainly be amortised over multiple subsequent ANDs for a net win, but that isn't the case here. Robin.
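Purely for reference, the two head-zeroing variants under discussion, written out in the style of the C draft from earlier in the thread (the function names are invented here); they are bit-for-bit equivalent for shifts below 64, so the question is only how the instructions schedule on a given core:

```c
#include <assert.h>
#include <stdint.h>

/* little-endian case: zero the low `shift` bits, i.e. the excess leading bytes */
static uint64_t head_by_shifts(uint64_t data, unsigned int shift)
{
	return (data >> shift) << shift;	/* two dependent shifts, in place */
}

static uint64_t head_by_mask(uint64_t data, unsigned int shift)
{
	return data & (~0ull << shift);		/* mask built in a scratch register */
}

int main(void)
{
	const uint64_t data = 0x0123456789abcdefull;
	unsigned int shift;

	for (shift = 0; shift < 64; shift += 8)	/* shift = offset * 8 */
		assert(head_by_shifts(data, shift) == head_by_mask(data, shift));
	return 0;
}
```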
On 15/05/2019 10:47, Will Deacon wrote: > On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: >> On 12/04/2019 10:52, Will Deacon wrote: >>> I'm waiting for Robin to come back with numbers for a C implementation. >>> >>> Robin -- did you get anywhere with that? >> >> Still not what I would call finished, but where I've got so far (besides an >> increasingly elaborate test rig) is as below - it still wants some unrolling >> in the middle to really fly (and actual testing on BE), but the worst-case >> performance already equals or just beats this asm version on Cortex-A53 with >> GCC 7 (by virtue of being alignment-insensitive and branchless except for >> the loop). Unfortunately, the advantage of C code being instrumentable does >> also come around to bite me... > > Is there any interest from anybody in spinning a proper patch out of this? > Shaokun? FWIW I've learned how to fix the KASAN thing now, so I'll try giving this some more love while I've got other outstanding optimisation stuff to look at anyway. Robin.
From: Robin Murphy > Sent: 15 May 2019 11:58 > To: David Laight; 'Will Deacon' > Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org; > ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com > Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version > > On 15/05/2019 11:15, David Laight wrote: > > ... > >>> ptr = (u64 *)(buff - offset); > >>> shift = offset * 8; > >>> > >>> /* > >>> * Head: zero out any excess leading bytes. Shifting back by the same > >>> * amount should be at least as fast as any other way of handling the > >>> * odd/even alignment, and means we can ignore it until the very end. > >>> */ > >>> data = *ptr++; > >>> #ifdef __LITTLE_ENDIAN > >>> data = (data >> shift) << shift; > >>> #else > >>> data = (data << shift) >> shift; > >>> #endif > > > > I suspect that > > #ifdef __LITTLE_ENDIAN > > data &= ~0ull << shift; > > #else > > data &= ~0ull >> shift; > > #endif > > is likely to be better. > > Out of interest, better in which respects? For the A64 ISA at least, > that would take 3 instructions plus an additional scratch register, e.g.: > > MOV x2, #~0 > LSL x2, x2, x1 > AND x0, x0, x1 > > (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that > will typically take as many cycles if not more than just pipelining the > two 'simple' ALU instructions) > > Whereas the original is just two shift instruction in-place. > > LSR x0, x0, x1 > LSL x0, x0, x1 > > If the operation were repeated, the constant generation could certainly > be amortised over multiple subsequent ANDs for a net win, but that isn't > the case here. On a superscaler processor you reduce the register dependency chain by one instruction. The original code is pretty much a single dependency chain so you are likely to be able to generate the mask 'for free'. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On 15/05/2019 12:13, David Laight wrote: > From: Robin Murphy >> Sent: 15 May 2019 11:58 >> To: David Laight; 'Will Deacon' >> Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org; >> ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com >> Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version >> >> On 15/05/2019 11:15, David Laight wrote: >>> ... >>>>> ptr = (u64 *)(buff - offset); >>>>> shift = offset * 8; >>>>> >>>>> /* >>>>> * Head: zero out any excess leading bytes. Shifting back by the same >>>>> * amount should be at least as fast as any other way of handling the >>>>> * odd/even alignment, and means we can ignore it until the very end. >>>>> */ >>>>> data = *ptr++; >>>>> #ifdef __LITTLE_ENDIAN >>>>> data = (data >> shift) << shift; >>>>> #else >>>>> data = (data << shift) >> shift; >>>>> #endif >>> >>> I suspect that >>> #ifdef __LITTLE_ENDIAN >>> data &= ~0ull << shift; >>> #else >>> data &= ~0ull >> shift; >>> #endif >>> is likely to be better. >> >> Out of interest, better in which respects? For the A64 ISA at least, >> that would take 3 instructions plus an additional scratch register, e.g.: >> >> MOV x2, #~0 >> LSL x2, x2, x1 >> AND x0, x0, x1 [That should have been "AND x0, x1, x2", obviously...] >> >> (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that >> will typically take as many cycles if not more than just pipelining the >> two 'simple' ALU instructions) >> >> Whereas the original is just two shift instruction in-place. >> >> LSR x0, x0, x1 >> LSL x0, x0, x1 >> >> If the operation were repeated, the constant generation could certainly >> be amortised over multiple subsequent ANDs for a net win, but that isn't >> the case here. > > On a superscaler processor you reduce the register dependency > chain by one instruction. > The original code is pretty much a single dependency chain so > you are likely to be able to generate the mask 'for free'. Gotcha, although 'free' still means additional I$ and register rename footprint, vs. (typically) just 1 extra cycle to forward an ALU result. It's an interesting consideration, but in our case there are almost certainly far more little in-order cores out in the wild than big OoO ones, and the double-shift will always be objectively better for those. Thanks, Robin.
From: Robin Murphy > Sent: 15 May 2019 13:40 > On 15/05/2019 12:13, David Laight wrote: > > From: Robin Murphy > >> Sent: 15 May 2019 11:58 > >> To: David Laight; 'Will Deacon' > >> Cc: Zhangshaokun; Ard Biesheuvel; linux-arm-kernel@lists.infradead.org; netdev@vger.kernel.org; > >> ilias.apalodimas@linaro.org; huanglingyan (A); steve.capper@arm.com > >> Subject: Re: [PATCH] arm64: do_csum: implement accelerated scalar version > >> > >> On 15/05/2019 11:15, David Laight wrote: > >>> ... > >>>>> ptr = (u64 *)(buff - offset); > >>>>> shift = offset * 8; > >>>>> > >>>>> /* > >>>>> * Head: zero out any excess leading bytes. Shifting back by the same > >>>>> * amount should be at least as fast as any other way of handling the > >>>>> * odd/even alignment, and means we can ignore it until the very end. > >>>>> */ > >>>>> data = *ptr++; > >>>>> #ifdef __LITTLE_ENDIAN > >>>>> data = (data >> shift) << shift; > >>>>> #else > >>>>> data = (data << shift) >> shift; > >>>>> #endif > >>> > >>> I suspect that > >>> #ifdef __LITTLE_ENDIAN > >>> data &= ~0ull << shift; > >>> #else > >>> data &= ~0ull >> shift; > >>> #endif > >>> is likely to be better. > >> > >> Out of interest, better in which respects? For the A64 ISA at least, > >> that would take 3 instructions plus an additional scratch register, e.g.: > >> > >> MOV x2, #~0 > >> LSL x2, x2, x1 > >> AND x0, x0, x1 > > [That should have been "AND x0, x1, x2", obviously...] > > >> > >> (alternatively "AND x0, x0, x1 LSL x2" to save 4 bytes of code, but that > >> will typically take as many cycles if not more than just pipelining the > >> two 'simple' ALU instructions) > >> > >> Whereas the original is just two shift instruction in-place. > >> > >> LSR x0, x0, x1 > >> LSL x0, x0, x1 > >> > >> If the operation were repeated, the constant generation could certainly > >> be amortised over multiple subsequent ANDs for a net win, but that isn't > >> the case here. > > > > On a superscaler processor you reduce the register dependency > > chain by one instruction. > > The original code is pretty much a single dependency chain so > > you are likely to be able to generate the mask 'for free'. > > Gotcha, although 'free' still means additional I$ and register rename > footprint, vs. (typically) just 1 extra cycle to forward an ALU result. > It's an interesting consideration, but in our case there are almost > certainly far more little in-order cores out in the wild than big OoO > ones, and the double-shift will always be objectively better for those. Is there a pipeline delay before the result of the memory read (*ptr) can be used? (Even assuming the data is in the L1 cache??) Even on an in-order cpu that can give you a spare cycle or two that the code may not normally fill. FWIW I've been known to use (++ptr)[-1] (instead of *ptr++) to move the increment into an available delay slot (of an earlier load). Anyway it isn't that obvious that it is the fastest way. David - Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
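As a side note on the (++ptr)[-1] spelling: it reads the same element and leaves the pointer in the same place as *ptr++, so any benefit comes purely from giving the compiler (or an in-order pipeline) a chance to issue the increment before the loaded value is consumed — which, as noted above, is not a given. A trivial sketch of the equivalence:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t buf[4] = { 1, 2, 3, 4 };
	const uint64_t *a = buf, *b = buf;

	/* both forms load buf[0] and advance to &buf[1]; the difference is
	 * only in where the increment sits relative to the use of the value */
	uint64_t va = *a++;
	uint64_t vb = (++b)[-1];

	assert(va == vb && a == b);
	return 0;
}
```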
Hi Will, On 2019/5/15 17:47, Will Deacon wrote: > On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: >> On 12/04/2019 10:52, Will Deacon wrote: >>> I'm waiting for Robin to come back with numbers for a C implementation. >>> >>> Robin -- did you get anywhere with that? >> >> Still not what I would call finished, but where I've got so far (besides an >> increasingly elaborate test rig) is as below - it still wants some unrolling >> in the middle to really fly (and actual testing on BE), but the worst-case >> performance already equals or just beats this asm version on Cortex-A53 with >> GCC 7 (by virtue of being alignment-insensitive and branchless except for >> the loop). Unfortunately, the advantage of C code being instrumentable does >> also come around to bite me... > > Is there any interest from anybody in spinning a proper patch out of this? > Shaokun? HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and Robin are ok, Lingyan or I can try to do it. Of course, if any guy posts the patch, we are happy to test it. Any will be ok. Thanks, Shaokun > > Will > >> /* Looks dumb, but generates nice-ish code */ >> static u64 accumulate(u64 sum, u64 data) >> { >> __uint128_t tmp = (__uint128_t)sum + data; >> return tmp + (tmp >> 64); >> } >> >> unsigned int do_csum_c(const unsigned char *buff, int len) >> { >> unsigned int offset, shift, sum, count; >> u64 data, *ptr; >> u64 sum64 = 0; >> >> offset = (unsigned long)buff & 0x7; >> /* >> * This is to all intents and purposes safe, since rounding down cannot >> * result in a different page or cache line being accessed, and @buff >> * should absolutely not be pointing to anything read-sensitive. >> * It does, however, piss off KASAN... >> */ >> ptr = (u64 *)(buff - offset); >> shift = offset * 8; >> >> /* >> * Head: zero out any excess leading bytes. Shifting back by the same >> * amount should be at least as fast as any other way of handling the >> * odd/even alignment, and means we can ignore it until the very end. >> */ >> data = *ptr++; >> #ifdef __LITTLE_ENDIAN >> data = (data >> shift) << shift; >> #else >> data = (data << shift) >> shift; >> #endif >> count = 8 - offset; >> >> /* Body: straightforward aligned loads from here on... */ >> //TODO: fancy stuff with larger strides and uint128s? >> while(len > count) { >> sum64 = accumulate(sum64, data); >> data = *ptr++; >> count += 8; >> } >> /* >> * Tail: zero any over-read bytes similarly to the head, again >> * preserving odd/even alignment. >> */ >> shift = (count - len) * 8; >> #ifdef __LITTLE_ENDIAN >> data = (data << shift) >> shift; >> #else >> data = (data >> shift) << shift; >> #endif >> sum64 = accumulate(sum64, data); >> >> /* Finally, folding */ >> sum64 += (sum64 >> 32) | (sum64 << 32); >> sum = sum64 >> 32; >> sum += (sum >> 16) | (sum << 16); >> if (offset & 1) >> return (u16)swab32(sum); >> >> return sum >> 16; >> } > > . >
On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote: > On 2019/5/15 17:47, Will Deacon wrote: > > On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: > >> On 12/04/2019 10:52, Will Deacon wrote: > >>> I'm waiting for Robin to come back with numbers for a C implementation. > >>> > >>> Robin -- did you get anywhere with that? > >> > >> Still not what I would call finished, but where I've got so far (besides an > >> increasingly elaborate test rig) is as below - it still wants some unrolling > >> in the middle to really fly (and actual testing on BE), but the worst-case > >> performance already equals or just beats this asm version on Cortex-A53 with > >> GCC 7 (by virtue of being alignment-insensitive and branchless except for > >> the loop). Unfortunately, the advantage of C code being instrumentable does > >> also come around to bite me... > > > > Is there any interest from anybody in spinning a proper patch out of this? > > Shaokun? > > HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and > Robin are ok, Lingyan or I can try to do it. > Of course, if any guy posts the patch, we are happy to test it. > Any will be ok. I don't mind who posts it, but Robin is super busy with SMMU stuff at the moment so it probably makes more sense for you or Lingyan to do it. Will
Hi Will, On 2019/8/16 0:46, Will Deacon wrote: > On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote: >> On 2019/5/15 17:47, Will Deacon wrote: >>> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: >>>> On 12/04/2019 10:52, Will Deacon wrote: >>>>> I'm waiting for Robin to come back with numbers for a C implementation. >>>>> >>>>> Robin -- did you get anywhere with that? >>>> >>>> Still not what I would call finished, but where I've got so far (besides an >>>> increasingly elaborate test rig) is as below - it still wants some unrolling >>>> in the middle to really fly (and actual testing on BE), but the worst-case >>>> performance already equals or just beats this asm version on Cortex-A53 with >>>> GCC 7 (by virtue of being alignment-insensitive and branchless except for >>>> the loop). Unfortunately, the advantage of C code being instrumentable does >>>> also come around to bite me... >>> >>> Is there any interest from anybody in spinning a proper patch out of this? >>> Shaokun? >> >> HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and >> Robin are ok, Lingyan or I can try to do it. >> Of course, if any guy posts the patch, we are happy to test it. >> Any will be ok. > > I don't mind who posts it, but Robin is super busy with SMMU stuff at the > moment so it probably makes more sense for you or Lingyan to do it. Thanks for restarting this topic, I or Lingyan will do it soon. Thanks, Shaokun > > Will > > . >
On 16/08/2019 09:15, Shaokun Zhang wrote: > Hi Will, > > On 2019/8/16 0:46, Will Deacon wrote: >> On Thu, May 16, 2019 at 11:14:35AM +0800, Zhangshaokun wrote: >>> On 2019/5/15 17:47, Will Deacon wrote: >>>> On Mon, Apr 15, 2019 at 07:18:22PM +0100, Robin Murphy wrote: >>>>> On 12/04/2019 10:52, Will Deacon wrote: >>>>>> I'm waiting for Robin to come back with numbers for a C implementation. >>>>>> >>>>>> Robin -- did you get anywhere with that? >>>>> >>>>> Still not what I would call finished, but where I've got so far (besides an >>>>> increasingly elaborate test rig) is as below - it still wants some unrolling >>>>> in the middle to really fly (and actual testing on BE), but the worst-case >>>>> performance already equals or just beats this asm version on Cortex-A53 with >>>>> GCC 7 (by virtue of being alignment-insensitive and branchless except for >>>>> the loop). Unfortunately, the advantage of C code being instrumentable does >>>>> also come around to bite me... >>>> >>>> Is there any interest from anybody in spinning a proper patch out of this? >>>> Shaokun? >>> >>> HiSilicon's Kunpeng920(Hi1620) benefits from do_csum optimization, if Ard and >>> Robin are ok, Lingyan or I can try to do it. >>> Of course, if any guy posts the patch, we are happy to test it. >>> Any will be ok. >> >> I don't mind who posts it, but Robin is super busy with SMMU stuff at the >> moment so it probably makes more sense for you or Lingyan to do it. > > Thanks for restarting this topic, I or Lingyan will do it soon. FWIW, I've rolled up what I had so far and dumped it up into a quick semi-realistic patch here: http://linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=859c5566510c32ae72039aa5072e932a771a3596 So far I'd put most of the effort into the aforementioned benchmarking harness to compare performance and correctness for all the proposed implementations over all reasonable alignment/length combinations - I think that got pretty much finished, but as Will says I'm unlikely to find time to properly look at this again for several weeks. Robin.
diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 0b6f5a7d4027..e906b956c1fc 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } #define ip_fast_csum ip_fast_csum +extern unsigned int do_csum(const unsigned char *buff, int len); +#define do_csum do_csum + #include <asm-generic/checksum.h> #endif /* __ASM_CHECKSUM_H */ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 5540a1638baf..a7606007a749 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -3,7 +3,7 @@ lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ clear_page.o memchr.o memcpy.o memmove.o memset.o \ memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ - strchr.o strrchr.o tishift.o + strchr.o strrchr.o tishift.o csum.o ifeq ($(CONFIG_KERNEL_MODE_NEON), y) obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S new file mode 100644 index 000000000000..534e2ebdc426 --- /dev/null +++ b/arch/arm64/lib/csum.S @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +ENTRY(do_csum) + adds x2, xzr, xzr // clear x2 and C flag + + // 64 bytes at a time + lsr x3, x1, #6 + and x1, x1, #63 + cbz x3, 1f + + // Eight 64-bit adds per iteration +0: ldp x4, x5, [x0], #64 + ldp x6, x7, [x0, #-48] + ldp x8, x9, [x0, #-32] + ldp x10, x11, [x0, #-16] + adcs x2, x2, x4 + sub x3, x3, #1 + adcs x2, x2, x5 + adcs x2, x2, x6 + adcs x2, x2, x7 + adcs x2, x2, x8 + adcs x2, x2, x9 + adcs x2, x2, x10 + adcs x2, x2, x11 + cbnz x3, 0b + adc x2, x2, xzr + + cbz x1, 7f + bic x3, x1, #1 + add x12, x0, x1 + add x0, x0, x3 + neg x3, x3 + add x3, x3, #64 + lsl x3, x3, #3 + + // Handle remaining 63 bytes or less using an overlapping 64-byte load + // and a branchless code path to complete the calculation + ldp x4, x5, [x0, #-64] + ldp x6, x7, [x0, #-48] + ldp x8, x9, [x0, #-32] + ldp x10, x11, [x0, #-16] + ldrb w12, [x12, #-1] + + .irp reg, x4, x5, x6, x7, x8, x9, x10, x11 + cmp x3, #64 + csel \reg, \reg, xzr, lt + ccmp x3, xzr, #0, lt + csel x13, x3, xzr, gt + sub x3, x3, #64 +CPU_LE( lsr \reg, \reg, x13 ) +CPU_BE( lsl \reg, \reg, x13 ) + .endr + + adds x2, x2, x4 + adcs x2, x2, x5 + adcs x2, x2, x6 + adcs x2, x2, x7 + adcs x2, x2, x8 + adcs x2, x2, x9 + adcs x2, x2, x10 + adcs x2, x2, x11 + adc x2, x2, xzr + +CPU_LE( adds x12, x2, x12 ) +CPU_BE( adds x12, x2, x12, lsl #8 ) + adc x12, x12, xzr + tst x1, #1 + csel x2, x2, x12, eq + +7: lsr x1, x2, #32 + adds w2, w2, w1 + adc w2, w2, wzr + + lsr w1, w2, #16 + uxth w2, w2 + add w2, w2, w1 + + lsr w1, w2, #16 // handle the carry by hand + add w2, w2, w1 + + uxth w0, w2 + ret + + // Handle 63 bytes or less +1: tbz x1, #5, 2f + ldp x4, x5, [x0], #32 + ldp x6, x7, [x0, #-16] + adds x2, x2, x4 + adcs x2, x2, x5 + adcs x2, x2, x6 + adcs x2, x2, x7 + adc x2, x2, xzr + +2: tbz x1, #4, 3f + ldp x4, x5, [x0], #16 + adds x2, x2, x4 + adcs x2, x2, x5 + adc x2, x2, xzr + +3: tbz x1, #3, 4f + ldr x4, [x0], #8 + adds x2, x2, x4 + adc x2, x2, xzr + +4: tbz x1, #2, 5f + ldr w4, [x0], #4 + adds x2, x2, x4 + adc x2, x2, xzr + +5: tbz x1, #1, 6f + ldrh w4, [x0], #2 + adds x2, x2, x4 + adc x2, x2, xzr + +6: tbz x1, #0, 7b + ldrb w4, [x0] +CPU_LE( adds x2, x2, x4 ) +CPU_BE( adds x2, x2, x4, lsl #8 ) + adc x2, x2, xzr + b 7b 
+ENDPROC(do_csum)
It turns out that the IP checksumming code is still exercised often, even though one might expect that modern NICs with checksum offload have no use for it. However, as Lingyan points out, there are combinations of features where the network stack may still fall back to software checksumming, and so it makes sense to provide an optimized implementation in software as well.

So provide an implementation of do_csum() in scalar assembler, which, unlike C, gives direct access to the carry flag, making the code run substantially faster. The routine uses overlapping 64-byte loads for all input sizes > 64 bytes, in order to reduce the number of branches and improve performance on cores with deep pipelines.

On Cortex-A57, this implementation is on par with Lingyan's NEON implementation, and roughly 7x as fast as the generic C code.

Cc: "huanglingyan (A)" <huanglingyan2@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Test code after the patch.

 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   2 +-
 arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)
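As a concrete example of the overlapping-load scheme described above: for a 150-byte input, the main loop consumes 150 >> 6 = 2 full blocks (bytes 0-127), leaving 150 & 63 = 22 bytes. Rather than branching on that remainder, the tail path loads the last 64 bytes of the buffer (bytes 86-149) and shifts away the leading 64 - 22 = 42 bytes, which were already covered by the block loop, before folding the rest into the sum; a final odd byte, when there is one, is picked up with a separate single-byte load and selected in without a branch.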