Message ID: 1546518769-114424-1-git-send-email-huanglingyan2@huawei.com (mailing list archive)
State: New, archived
Series: [v2] arm64: lib: accelerate do_csum with NEON instruction
On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and costs a lot of resources. Let's use
> NEON instructions to accelerate the checksum computation for arm64.
>
> ------
> V1 ==> V2:
>     Change NEON assembly code to NEON intrinsic code which is built
>     on top of arm_neon.h to avoid dropping into assembly.
> ------
>
> Here are the comparison results of function ip_compute_csum() between
> the general do_csum() and the NEON do_csum().
>
>     len (1000 cycles)    general (ns)    do_csum_neon (ns)
>     64B:                 58060           59460
>     128B:                82930           83930
>     256B:                132480          73570
>     512B:                230100          86230
>     1024B:               426600          98200
>

Very nice! Which CPU did you test this on?

> ---
>  arch/arm64/lib/Makefile        |   4 ++
>  arch/arm64/lib/checksum.c      | 140 +++++++++++++++++++++++++++++++++++++++++
>  include/asm-generic/checksum.h |   1 +
>  lib/checksum.c                 |   8 ++-
>  4 files changed, 152 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/lib/checksum.c
[...]
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * arch/arm64/lib/checksum.c

Drop this line

[...]
> +        if (unlikely(len <= 0))

len is unsigned so '== 0' is sufficient

> +                return result;
> +
> +        odd = 1 & (unsigned long) buff;

No space after () cast please

[...]
> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>

Surely, we can find a better way to hook up this code than modifying
headers under asm-generic and code in lib/checksum.c

Is arm64 the only arch that has an optimized checksum() function?

[...]
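For readers who want to experiment with the approach outside the kernel, here is a minimal userspace sketch (not the posted patch itself) of the same widening-accumulate idea: vpadalq_u16 pairwise-adds eight 16-bit words into four 32-bit accumulator lanes per step (the patch spreads this over four accumulators, the sketch uses one), and the lanes are folded down at the end, similar to the patch's from64to16(). File and function names are illustrative, and the alignment and endianness assumptions are noted in the comments.

```c
/* csum_sketch.c - illustrative only; build with: gcc -O2 -c csum_sketch.c
 * Assumes a little-endian AArch64 target and a 2-byte aligned buffer.
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t fold64(uint64_t x)
{
        x = (x & 0xffffffff) + (x >> 32);   /* 32-bit halves + carry */
        x = (x & 0xffffffff) + (x >> 32);
        x = (x & 0xffff) + (x >> 16);       /* 16-bit halves + carry */
        x = (x & 0xffff) + (x >> 16);
        return (uint16_t)x;
}

/* do_csum-style partial checksum: folded 16-bit ones'-complement sum,
 * not yet complemented (a caller like ip_compute_csum() applies ~).
 */
uint16_t do_csum_sketch(const unsigned char *buff, size_t len)
{
        uint32x4_t acc = vdupq_n_u32(0);
        uint64_t sum = 0;

        /* Main loop: 8 x u16 per step, widened and accumulated into
         * 4 x u32 lanes.  The lanes can absorb roughly 2^15 steps
         * (~512 KiB) before overflow, plenty for packet-sized buffers;
         * larger inputs would need periodic folding.
         */
        while (len >= 16) {
                acc = vpadalq_u16(acc, vld1q_u16((const uint16_t *)buff));
                buff += 16;
                len -= 16;
        }
        sum += vaddvq_u32(acc);             /* horizontal add of the 4 lanes */

        while (len >= 2) {                  /* scalar tail */
                sum += *(const uint16_t *)buff;
                buff += 2;
                len -= 2;
        }
        if (len)
                sum += *buff;               /* trailing byte (little-endian) */

        return fold64(sum);
}
```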
On 2019/1/4 2:19, Ard Biesheuvel wrote:
> On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and costs a lot of resources. Let's use
>> NEON instructions to accelerate the checksum computation for arm64.
[...]
> Very nice! Which CPU did you test this on?

Thank you for your reply. The test platform is Huawei hip08.

>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * arch/arm64/lib/checksum.c
> Drop this line

OK.

>> +        if (unlikely(len <= 0))
> len is unsigned so '== 0' is sufficient

OK.

>> +        odd = 1 & (unsigned long) buff;
> No space after () cast please

I have fixed all such cases in the V3 patch.

[...]
>> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>>
> Surely, we can find a better way to hook up this code than modifying
> headers under asm-generic and code in lib/checksum.c
>
> Is arm64 the only arch that has an optimized checksum() function?

Arm64 is not the only arch. Your suggestion is reasonable, so I am
looking for a better way that only modifies the arm64 code.

[...]
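Ard's question about a cleaner hook-up could, in principle, be answered with the per-function #ifndef override convention already visible in the quoted files (csum_partial_copy_nocheck and ip_fast_csum are overridden that way). The rough sketch below shows that shape; it assumes the generic do_csum() in lib/checksum.c is, or would be, wrapped in an "#ifndef do_csum" guard, and the names here are illustrative rather than anything proposed in this thread. do_csum_scalar() is a hypothetical arch-local fallback.

```c
/* Hypothetical alternative hook-up (sketch, not the posted patch):
 * keep lib/checksum.c and asm-generic/checksum.h untouched and let the
 * architecture override do_csum(), the same way ip_fast_csum() and
 * csum_partial_copy_nocheck() are overridden via per-function guards.
 */

/* arch/arm64/include/asm/checksum.h (sketch) */
#define do_csum do_csum
unsigned int do_csum(const unsigned char *buff, int len);

/* arch/arm64/lib/checksum.c (sketch) */
unsigned int do_csum(const unsigned char *buff, int len)
{
        if (len >= CSUM_NEON_THRESHOLD && may_use_simd()) {
                unsigned int res;

                kernel_neon_begin();
                res = do_csum_neon(buff, len);
                kernel_neon_end();
                return res;
        }
        /* do_csum_scalar() is a placeholder for an arch-local scalar
         * fallback, so nothing outside arch/arm64 needs to change.
         */
        return do_csum_scalar(buff, len);
}
```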
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a16..ec2fcd3 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
 CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
 CFLAGS_xor-neon.o += -ffreestanding
+
+obj-y += checksum.o
+CFLAGS_REMOVE_checksum.o += -mgeneral-regs-only
+CFLAGS_checksum.o += -ffreestanding
 endif
 
 # Tell the compiler to treat all general purpose registers (with the
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..48f4ead
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/arm64/lib/checksum.c
+ *
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/neon-intrinsics.h>
+
+#define CSUM_NEON_THRESHOLD 128
+#ifdef CONFIG_KERNEL_MODE_NEON
+static inline u32 from64to16(u64 x)
+{
+        /* add up 32-bit and 32-bit for 32+c bit */
+        x = (x & 0xffffffff) + (x >> 32);
+        /* add up carry.. */
+        x = (x & 0xffffffff) + (x >> 32);
+        /* add up 16-bit and 16-bit for 16+c bit */
+        x = ((u32)x & 0xffff) + ((u32)x >> 16);
+        /* add up carry.. */
+        x = ((u32)x & 0xffff) + ((u32)x >> 16);
+        return x;
+}
+
+unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
+{
+        unsigned int odd, count;
+        uint64_t result = 0;
+        unsigned int count64;
+        uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
+
+        register uint32x4_t v0, v1, v2, v3;
+
+        if (unlikely(len <= 0))
+                return result;
+
+        odd = 1 & (unsigned long) buff;
+        if (unlikely(odd)) {
+                result = *buff;
+                len--;
+                buff++;
+        }
+
+        count = len >> 1;
+        if (count) {
+                if (2 & (unsigned long) buff) {
+                        result += *(unsigned short *)buff;
+                        count--;
+                        len -= 2;
+                        buff += 2;
+                }
+                count >>= 1;            /* nr of 32-bit words.. */
+                if (count) {
+                        if (4 & (unsigned long) buff) {
+                                result += *(unsigned int *) buff;
+                                count--;
+                                len -= 4;
+                                buff += 4;
+                        }
+                        count >>= 1;    /* nr of 64-bit words.. */
+
+                        v0 = vzero;
+                        v1 = vzero;
+                        v2 = vzero;
+                        v3 = vzero;
+
+                        count64 = count >> 3;  /* compute 64 Byte circle */
+                        while (count64) {
+                                v0 = vpadalq_u16(v0,
+                                        vld1q_u16((uint16_t *)buff + 0));
+                                v1 = vpadalq_u16(v1,
+                                        vld1q_u16((uint16_t *)buff + 8));
+                                v2 = vpadalq_u16(v2,
+                                        vld1q_u16((uint16_t *)buff + 16));
+                                v3 = vpadalq_u16(v3,
+                                        vld1q_u16((uint16_t *)buff + 24));
+                                buff += 64;
+                                count64--;
+                        }
+                        v0 = vaddq_u32(v0, v1);
+                        v2 = vaddq_u32(v2, v3);
+                        v0 = vaddq_u32(v0, v2);
+
+                        count %= 8;
+                        while (count >= 2) {  /* compute 16 byte circle */
+                                v0 = vpadalq_u16(v0,
+                                        vld1q_u16((uint16_t *)buff + 0));
+                                buff += 16;
+                                count -= 2;
+                        }
+
+                        result += vgetq_lane_u32(v0, 0);
+                        result += vgetq_lane_u32(v0, 1);
+                        result += vgetq_lane_u32(v0, 2);
+                        result += vgetq_lane_u32(v0, 3);
+                        if (count & 1) {
+                                result += *(unsigned long long *) buff;
+                                buff += 8;
+                        }
+                        if (len & 4) {
+                                result += *(unsigned int *) buff;
+                                buff += 4;
+                        }
+                }
+                if (len & 2) {
+                        result += *(unsigned short *) buff;
+                        buff += 2;
+                }
+        }
+        if (len & 1)
+                result += *buff;
+        result = from64to16(result);
+        if (odd)
+                result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+        return result;
+}
+#endif
+
+
+unsigned int do_csum(const unsigned char *buff, unsigned int len)
+{
+        if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
+                unsigned int res;
+
+                kernel_neon_begin();
+                res = do_csum_neon(buff, len);
+                kernel_neon_end();
+                return res;
+        } else
+                return do_csum_generic(buff, len);
+}
diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
index 34785c0..041063c 100644
--- a/include/asm-generic/checksum.h
+++ b/include/asm-generic/checksum.h
@@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
  */
 extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
                int len, __wsum sum, int *csum_err);
+extern unsigned int do_csum_generic(const unsigned char *buff, int len);
 
 #ifndef csum_partial_copy_nocheck
 #define csum_partial_copy_nocheck(src, dst, len, sum) \
diff --git a/lib/checksum.c b/lib/checksum.c
index d3ec93f..83392db 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
         return x;
 }
 
-static unsigned int do_csum(const unsigned char *buff, int len)
+unsigned int do_csum_generic(const unsigned char *buff, int len)
 {
         int odd;
         unsigned int result = 0;
@@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
 out:
         return result;
 }
+
+static unsigned int do_csum(const unsigned char *buff, int len)
+{
+        return do_csum_generic(buff, len);
+}
+
 #endif
 
 #ifndef ip_fast_csum
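Checksum rewrites like this one classically break on odd lengths, odd start addresses (the byte-swap in the `if (odd)` path), and the scalar tail handling, so they are usually cross-checked against a trivial reference over many random lengths. Below is a self-contained userspace harness one might use for that; it exercises the do_csum_sketch() from the earlier example (not the kernel code itself), restricts offsets to even values to match that sketch's alignment assumption, and compares against a byte-at-a-time RFC 1071-style reference. All names are illustrative.

```c
/* check_csum.c - illustrative test harness, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>

uint16_t do_csum_sketch(const unsigned char *buff, size_t len);

/* Reference: sum 16-bit little-endian words, fold carries at the end. */
static uint16_t csum_ref(const unsigned char *p, size_t len)
{
        uint64_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint16_t)(p[i] | (p[i + 1] << 8));
        if (len & 1)
                sum += p[len - 1];
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

int main(void)
{
        static unsigned char buf[2048];
        int i;

        srand(1);
        for (i = 0; i < 100000; i++) {
                size_t off = (rand() % 4) * 2;  /* even offsets only */
                size_t len = rand() % 1500;     /* packet-sized lengths */
                size_t j;

                for (j = 0; j < off + len; j++)
                        buf[j] = rand() & 0xff;

                if (do_csum_sketch(buf + off, len) != csum_ref(buf + off, len)) {
                        printf("MISMATCH at off=%zu len=%zu\n", off, len);
                        return 1;
                }
        }
        printf("all checks passed\n");
        return 0;
}
```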