Message ID | 1515757996-4675-1-git-send-email-chenzhou10@huawei.com (mailing list archive)
---|---
State | New, archived
On 12/01/18 11:53, Chen Zhou wrote:
> On arm64 little endian such as the Cortex-A57, the
> neon based implementation performance increases by
> about 70% when len is greater than 512.

Um, I don't see the kernel-mode NEON infrastructure being used anywhere
here. Blindly destroying someone else's register context is never going
to end well, regardless of how fast you can do it...

Robin.

> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> ---
>  arch/arm64/include/asm/checksum.h |   3 +
>  arch/arm64/lib/Makefile           |   1 +
>  arch/arm64/lib/do_csum.S          | 177 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 181 insertions(+)
>  create mode 100644 arch/arm64/lib/do_csum.S
> [...]
Hi Robin,

-----Original Message-----
From: Robin Murphy [mailto:robin.murphy@arm.com]
Sent: Friday, January 12, 2018 8:23 PM
To: chenzhou; catalin.marinas@arm.com; will.deacon@arm.com
Cc: linux-arm-kernel@lists.infradead.org
Subject: Re: [PATCH] arm64: support do_csum with neon

On 12/01/18 11:53, Chen Zhou wrote:
> On arm64 little endian such as the Cortex-A57, the neon based
> implementation performance increases by about 70% when len is greater
> than 512.

Um, I don't see the kernel-mode NEON infrastructure being used anywhere
here. Blindly destroying someone else's register context is never going
to end well, regardless of how fast you can do it...

Robin.

Thank you very much for your review. You're right: I didn't consider
whether the system supports NEON, and I didn't put kernel_neon_begin()
and kernel_neon_end() calls around the NEON code. I will fix this up
later.

Thanks
Chen Zhou

> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> ---
>  arch/arm64/include/asm/checksum.h |   3 +
>  arch/arm64/lib/Makefile           |   1 +
>  arch/arm64/lib/do_csum.S          | 177 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 181 insertions(+)
>  create mode 100644 arch/arm64/lib/do_csum.S
> [...]
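The fix Chen describes is the usual kernel-mode NEON bracketing; below
is a minimal sketch, assuming thread context. do_csum_neon() and
do_csum_simd() are illustrative names, not symbols from the posted
patch (which exports the assembly routine as do_csum directly).

#include <linux/types.h>

#include <asm/neon.h>	/* kernel_neon_begin()/kernel_neon_end() */

/* the .S routine, hypothetically renamed so C glue can own do_csum */
unsigned int do_csum_neon(const unsigned char *buf, size_t len);

static unsigned int do_csum_simd(const unsigned char *buf, size_t len)
{
	unsigned int sum;

	kernel_neon_begin();	/* save the current FP/SIMD register context */
	sum = do_csum_neon(buf, len);
	kernel_neon_end();	/* restore it */

	return sum;
}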
On Tue, Jan 16, 2018 at 01:07:01AM +0000, chenzhou wrote:
> Hi Robin,
>
> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy@arm.com]
> Sent: Friday, January 12, 2018 8:23 PM
> To: chenzhou; catalin.marinas@arm.com; will.deacon@arm.com
> Cc: linux-arm-kernel@lists.infradead.org
> Subject: Re: [PATCH] arm64: support do_csum with neon
>
> On 12/01/18 11:53, Chen Zhou wrote:
> > On arm64 little endian such as the Cortex-A57, the neon based
> > implementation performance increases by about 70% when len is greater
> > than 512.
>
> Um, I don't see the kernel-mode NEON infrastructure being used
> anywhere here. Blindly destroying someone else's register context is
> never going to end well, regardless of how fast you can do it...
>
> Robin.
>
> Thank you very much for your review. You're right: I didn't consider
> whether the system supports NEON, and I didn't put kernel_neon_begin()
> and kernel_neon_end() calls around the NEON code. I will fix this up
> later.

Can do_csum() be called from any context? Kernel-mode NEON cannot be
used from all contexts, and cannot be nested:

 * thread context: yes, if the hardware supports it
   (elf_hwcap & HWCAP_ASIMD);

 * softirq context: yes, if the hardware supports it AND kernel-mode
   NEON is not currently in use in the interrupted thread: check
   (elf_hwcap & HWCAP_ASIMD) && may_use_simd();

 * other contexts (irq, nmi etc.): no; may_use_simd() returns false
   in this case.

You will likely need to write some wrapper C code that selects between
the NEON-optimised and C implementations and does the appropriate
runtime checks. See arch/arm64/crypto/sha256-glue.c for an example.

Feel free to ask if you're confused, and Cc me and/or Ard Biesheuvel on
the patches.

Possibly we should write some documentation on kernel-mode NEON...

Cheers
---Dave

> > Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> > ---
> >  arch/arm64/include/asm/checksum.h |   3 +
> >  arch/arm64/lib/Makefile           |   1 +
> >  arch/arm64/lib/do_csum.S          | 177 ++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 181 insertions(+)
> >  create mode 100644 arch/arm64/lib/do_csum.S
> > [...]
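Putting Dave's checklist together, the glue might look like the
following sketch, loosely patterned on arch/arm64/crypto/sha256-glue.c
and building on the bracketing shown earlier. do_csum_neon() and
do_csum_generic() are illustrative names for the assembly routine and a
C fallback; neither appears in the posted patch.

#include <linux/types.h>

#include <asm/hwcap.h>	/* elf_hwcap, HWCAP_ASIMD */
#include <asm/neon.h>	/* kernel_neon_begin()/kernel_neon_end() */
#include <asm/simd.h>	/* may_use_simd() */

unsigned int do_csum_neon(const unsigned char *buf, size_t len);
unsigned int do_csum_generic(const unsigned char *buf, size_t len);

unsigned int do_csum(const unsigned char *buf, size_t len)
{
	unsigned int sum;

	/*
	 * Use the C fallback unless the CPU implements Advanced SIMD
	 * and we are in a context where kernel-mode NEON is allowed
	 * (thread context, or softirq that did not interrupt other
	 * kernel-mode NEON work).
	 */
	if (!(elf_hwcap & HWCAP_ASIMD) || !may_use_simd())
		return do_csum_generic(buf, len);

	kernel_neon_begin();
	sum = do_csum_neon(buf, len);
	kernel_neon_end();

	return sum;
}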
diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 09f6533..e300782 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
 }
 #define csum_fold csum_fold
 
+#define do_csum do_csum
+extern unsigned int do_csum(const unsigned char *, size_t);
+
 static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 {
 	__uint128_t tmp;
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 1a811ec..5b6aa34 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,6 +3,7 @@ lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 		   strchr.o strrchr.o
+lib-y		+= do_csum.o
 
 # Tell the compiler to treat all general purpose registers as
 # callee-saved, which allows for efficient runtime patching of the bl
diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
new file mode 100644
index 0000000..8e7b486
--- /dev/null
+++ b/arch/arm64/lib/do_csum.S
@@ -0,0 +1,177 @@
+/*
+ * Optimized version of the standard do_csum() function
+ *
+ * Parameters:
+ *	x0 - address of buffer to checksum (const unsigned char *)
+ *	x1 - length of the buffer (int)
+ * Returns:
+ *	x0 - the return checksum of the buffer
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+	ldr	x13, =0xffff
+	eor	x4, x4, x4
+	eor	x5, x5, x5
+	eor	v0.16b, v0.16b, v0.16b
+
+	//len is zero or negative
+	and	x6, x1, #0x80000000
+	cmp	x6, #0
+	b.gt	out
+	cbz	w1, out
+
+	tst	x0, #1
+	b.eq	addr_not_odd
+
+	//addr is odd
+	mov	x4, #1
+	ldr	x6, [x0], #1
+#ifdef __AARCH64EB__
+	and	x6, x6, #0xff
+#else
+	lsl	x6, x6, #8
+	and	x6, x6, x13
+#endif
+	add	x5, x5, x6
+	sub	x1, x1, #1
+
+addr_not_odd:
+	cmp	x1, #32
+	b.lt	len_4
+	cmp	x1, #192
+	b.ge	len_than_192
+	b	do_loop_16
+
+len_than_192:
+	ldp	q1, q0, [x0], #32
+	ldp	q3, q2, [x0], #32
+	ldp	q5, q4, [x0], #32
+	sub	x1, x1, #96
+
+do_loop_96:
+	ldp	q7, q6, [x0], #32
+	ldp	q9, q8, [x0], #32
+	ldp	q11, q10, [x0], #32
+
+	uaddl	v12.4s, v0.4h, v6.4h
+	uaddl2	v13.4s, v0.8h, v6.8h
+
+	uaddl	v14.4s, v1.4h, v7.4h
+	uaddl2	v15.4s, v1.8h, v7.8h
+
+	uaddl	v16.4s, v2.4h, v8.4h
+	uaddl2	v17.4s, v2.8h, v8.8h
+
+	uaddl	v18.4s, v3.4h, v9.4h
+	uaddl2	v19.4s, v3.8h, v9.8h
+
+	uaddl	v20.4s, v4.4h, v10.4h
+	uaddl2	v21.4s, v4.8h, v10.8h
+	uaddl	v22.4s, v5.4h, v11.4h
+	uaddl2	v23.4s, v5.8h, v11.8h
+
+	add	v0.4s, v12.4s, v13.4s
+	add	v1.4s, v14.4s, v15.4s
+	add	v2.4s, v16.4s, v17.4s
+	add	v3.4s, v18.4s, v19.4s
+	add	v4.4s, v20.4s, v21.4s
+	add	v5.4s, v22.4s, v23.4s
+
+	sub	x1, x1, #96
+	cmp	x1, #96
+	b.ge	do_loop_96
+
+	add	v0.4s, v0.4s, v1.4s
+	add	v2.4s, v2.4s, v3.4s
+	add	v4.4s, v4.4s, v5.4s
+	add	v0.4s, v0.4s, v2.4s
+	add	v0.4s, v0.4s, v4.4s	//get result
+
+	cmp	x1, #16
+	b.lt	get_64
+
+do_loop_16:
+	ldr	q6, [x0], #16
+	uaddl	v24.4s, v0.4h, v6.4h
+	uaddl2	v25.4s, v0.8h, v6.8h
+	add	v0.4s, v24.4s, v25.4s
+	sub	x1, x1, #16
+	cmp	x1, #16
+	b.ge	do_loop_16
+
+get_64:
+	mov	x6, v0.d[0]
+	add	x5, x5, x6
+	mov	x6, v0.d[1]
+
+	add	x5, x5, x6
+	cmp	x5, x6
+	b.ge	len_4
+	add	x5, x5, #1
+
+len_4:
+	cmp	x1, #4
+	b.lt	len_2
+
+	sub	x1, x1, #4
+	ldr	w6, [x0], #4
+	and	x6, x6, #0xffffffff
+	add	x5, x5, x6
+	b	len_4
+
+len_2:
+	cmp	x1, #2
+	b.lt	len_1
+	sub	x1, x1, #2
+	ldrh	w6, [x0], #2
+	and	x6, x6, x13
+	add	x5, x5, x6
+
+len_1:
+	cmp	x1, #1
+	b.lt	fold_32
+	ldr	x6, [x0], #1
+#ifdef __AARCH64EB__
+	lsl	x6, x6, #8
+	and	x6, x6, x13
+#else
+	and	x6, x6, #0xff
+#endif
+	add	x5, x5, x6
+
+fold_32:
+	and	x9, x5, x13		//[15:0]
+	and	x10, x13, x5, lsr #16	//[31:16]
+	and	x11, x13, x5, lsr #32	//[47:32]
+	and	x12, x13, x5, lsr #48	//[63:48]
+
+	add	x9, x9, x10
+	add	x11, x11, x12
+
+	add	x9, x9, x11
+
+	and	x10, x9, x13
+	and	x11, x13, x9, lsr #16
+
+	add	x5, x10, x11
+
+	and	x9, x5, x13		//add carry
+	and	x10, x13, x5, lsr #16
+	add	x5, x9, x10
+
+	cbz	x4, out			//addr isn't odd
+
+	lsr	x6, x5, #8
+	and	x6, x6, #0xff
+	and	x7, x5, #0xff
+	lsl	x7, x7, #8
+
+	orr	x5, x6, x7
+
+out:
+	mov	x0, x5
+	ret
+ENDPROC(do_csum)
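For readers following the assembly, the tail of the routine (fold_32
through out) reduces the 64-bit accumulator to a 16-bit ones'
complement sum and byte-swaps the result when the buffer started on an
odd address. A C rendering of that logic, for illustration only (the
function name is made up):

static unsigned int fold_to_16(unsigned long long sum, int addr_was_odd)
{
	/* fold 64 -> 32 -> 16 bits, feeding the carries back in */
	sum = (sum & 0xffff) + ((sum >> 16) & 0xffff) +
	      ((sum >> 32) & 0xffff) + ((sum >> 48) & 0xffff);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	/* undo the byte shift introduced by the odd start address */
	if (addr_was_odd)
		sum = ((sum >> 8) & 0xff) | ((sum & 0xff) << 8);

	return (unsigned int)sum;
}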
On arm64 little endian such as the Cortex-A57, the neon based
implementation performance increases by about 70% when len is
greater than 512.

Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
---
 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   1 +
 arch/arm64/lib/do_csum.S          | 177 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 181 insertions(+)
 create mode 100644 arch/arm64/lib/do_csum.S
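The 70% figure concerns throughput; correctness of such a routine can
be sanity-checked against a trivial byte-pair reference. A userspace
sketch (not from the thread; ref_do_csum() is a made-up name, and it
skips the odd-start-address handling the assembly performs):

#include <stdio.h>
#include <stddef.h>

/* little-endian 16-bit ones' complement sum over a buffer */
static unsigned int ref_do_csum(const unsigned char *buf, size_t len)
{
	unsigned long sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += buf[i] | ((unsigned long)buf[i + 1] << 8);
	if (len & 1)			/* trailing odd byte */
		sum += buf[len - 1];
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned int)sum;
}

int main(void)
{
	unsigned char pkt[] = { 0x45, 0x00, 0x00, 0x54, 0xa6, 0xf2 };

	printf("csum = 0x%04x\n", ref_do_csum(pkt, sizeof(pkt)));
	return 0;
}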