
[v2] arm64: lib: accelerate do_csum with NEON instruction

Message ID 1546518769-114424-1-git-send-email-huanglingyan2@huawei.com (mailing list archive)
State New, archived
Series [v2] arm64: lib: accelerate do_csum with NEON instruction

Commit Message

huanglingyan (A) Jan. 3, 2019, 12:32 p.m. UTC
Function do_csum() in lib/checksum.c is used to compute the checksum,
which turns out to be slow and to cost a lot of CPU time.
Let's use NEON instructions to accelerate the checksum computation
for arm64.

------
V1 ==> V2:
    Change NEON assembly code to NEON intrinsic code which is built
    on top of arm_neon.h to avoid dropping into assembly.
------

Here are the comparison results for ip_compute_csum() with the generic
do_csum() and the NEON do_csum(). Times are the total for 1000 calls at
each length.

        len (1000 calls)      general (ns)       do_csum_neon (ns)
          64B:                58060                 59460
         128B:                82930                 83930
         256B:               132480                 73570
         512B:               230100                 86230
        1024B:               426600                 98200
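
How such numbers can be gathered is sketched below; this is a hypothetical
measurement helper (csum_bench() is not part of the patch), built on the
existing ip_compute_csum() and ktime_get_ns() kernel interfaces.

/* Time 1000 back-to-back ip_compute_csum() calls for one buffer length. */
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <net/checksum.h>

static void csum_bench(const void *buf, int len)
{
        u32 acc = 0;
        u64 t0, t1;
        int i;

        t0 = ktime_get_ns();
        for (i = 0; i < 1000; i++)
                acc += ip_compute_csum(buf, len);  /* keep the result live */
        t1 = ktime_get_ns();

        pr_info("len %4dB: %llu ns (acc %x)\n", len, t1 - t0, acc);
}

Calling csum_bench() from a throwaway module init, once per length, would
print one line per row of the table above.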


---
 arch/arm64/lib/Makefile        |   4 ++
 arch/arm64/lib/checksum.c      | 140 +++++++++++++++++++++++++++++++++++++++++
 include/asm-generic/checksum.h |   1 +
 lib/checksum.c                 |   8 ++-
 4 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/checksum.c

Comments

Ard Biesheuvel Jan. 3, 2019, 6:19 p.m. UTC | #1
On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>
> Function do_csum() in lib/checksum.c is used to compute the checksum,
> which turns out to be slow and to cost a lot of CPU time.
> Let's use NEON instructions to accelerate the checksum computation
> for arm64.
>
> ------
> V1 ==> V2:
>     Change NEON assembly code to NEON intrinsic code which is built
>     on top of arm_neon.h to avoid dropping into assembly.
> ------
>
> Here are the comparison results for ip_compute_csum() with the generic
> do_csum() and the NEON do_csum(). Times are the total for 1000 calls at
> each length.
>
>         len (1000 calls)      general (ns)       do_csum_neon (ns)
>           64B:                58060                 59460
>          128B:                82930                 83930
>          256B:               132480                 73570
>          512B:               230100                 86230
>         1024B:               426600                 98200
>

Very nice! Which CPU did you test this on?

>
> ---
>  arch/arm64/lib/Makefile        |   4 ++
>  arch/arm64/lib/checksum.c      | 140 +++++++++++++++++++++++++++++++++++++++++
>  include/asm-generic/checksum.h |   1 +
>  lib/checksum.c                 |   8 ++-
>  4 files changed, 152 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/lib/checksum.c
>
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a16..ec2fcd3 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>  obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
>  CFLAGS_REMOVE_xor-neon.o       += -mgeneral-regs-only
>  CFLAGS_xor-neon.o              += -ffreestanding
> +
> +obj-y    += checksum.o
> +CFLAGS_REMOVE_checksum.o       += -mgeneral-regs-only
> +CFLAGS_checksum.o              += -ffreestanding
>  endif
>
>  # Tell the compiler to treat all general purpose registers (with the
> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
> new file mode 100644
> index 0000000..48f4ead
> --- /dev/null
> +++ b/arch/arm64/lib/checksum.c
> @@ -0,0 +1,140 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * arch/arm64/lib/checksum.c

Drop this line

> + *
> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
> + *
> + * Generic C or neon implementation of do_csum operations.
> + * Choose faster neon instructions when NEON is supported.
> + *
> + */
> +
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +#include <asm/checksum.h>
> +#include <asm/byteorder.h>
> +#include <asm/neon-intrinsics.h>
> +
> +#define CSUM_NEON_THRESHOLD 128
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +static inline u32 from64to16(u64 x)
> +{
> +       /* add up 32-bit and 32-bit for 32+c bit */
> +       x = (x & 0xffffffff) + (x >> 32);
> +       /* add up carry.. */
> +       x = (x & 0xffffffff) + (x >> 32);
> +       /* add up 16-bit and 16-bit for 16+c bit */
> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
> +       /* add up carry.. */
> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
> +       return x;
> +}
> +
> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
> +{
> +       unsigned int odd, count;
> +       uint64_t result = 0;
> +       unsigned int count64;
> +       uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
> +
> +       register uint32x4_t v0, v1, v2, v3;
> +
> +       if (unlikely(len <= 0))

len is unsigned so '== 0' is sufficient

> +               return result;
> +
> +       odd = 1 & (unsigned long) buff;

No space after () cast please

> +       if (unlikely(odd)) {
> +               result = *buff;
> +               len--;
> +               buff++;
> +       }
> +
> +       count = len >> 1;
> +       if (count) {
> +               if (2 & (unsigned long) buff) {
> +                       result += *(unsigned short *)buff;
> +                       count--;
> +                       len -= 2;
> +                       buff += 2;
> +               }
> +               count >>= 1;            /* nr of 32-bit words.. */
> +               if (count) {
> +                       if (4 & (unsigned long) buff) {
> +                               result += *(unsigned int *) buff;
> +                               count--;
> +                               len -= 4;
> +                               buff += 4;
> +                       }
> +                       count >>= 1;    /* nr of 64-bit words.. */
> +
> +                       v0 = vzero;
> +                       v1 = vzero;
> +                       v2 = vzero;
> +                       v3 = vzero;
> +
> +                       count64 = count >> 3;  /* compute 64 Byte circle */
> +                       while (count64) {
> +                               v0 = vpadalq_u16(v0,
> +                                       vld1q_u16((uint16_t *)buff + 0));
> +                               v1 = vpadalq_u16(v1,
> +                                       vld1q_u16((uint16_t *)buff + 8));
> +                               v2 = vpadalq_u16(v2,
> +                                       vld1q_u16((uint16_t *)buff + 16));
> +                               v3 = vpadalq_u16(v3,
> +                                       vld1q_u16((uint16_t *)buff + 24));
> +                               buff += 64;
> +                               count64--;
> +                       }
> +                       v0 = vaddq_u32(v0, v1);
> +                       v2 = vaddq_u32(v2, v3);
> +                       v0 = vaddq_u32(v0, v2);
> +
> +                       count %= 8;
> +                       while (count >= 2) { /* compute 16 byte circle */
> +                               v0 = vpadalq_u16(v0,
> +                                       vld1q_u16((uint16_t *)buff + 0));
> +                               buff += 16;
> +                               count -= 2;
> +                       }
> +
> +                       result += vgetq_lane_u32(v0, 0);
> +                       result += vgetq_lane_u32(v0, 1);
> +                       result += vgetq_lane_u32(v0, 2);
> +                       result += vgetq_lane_u32(v0, 3);
> +                       if (count & 1) {
> +                               result += *(unsigned long long *) buff;
> +                               buff += 8;
> +                       }
> +                       if (len & 4) {
> +                               result += *(unsigned int *) buff;
> +                               buff += 4;
> +                       }
> +               }
> +               if (len & 2) {
> +                       result += *(unsigned short *) buff;
> +                       buff += 2;
> +               }
> +       }
> +       if (len & 1)
> +               result += *buff;
> +       result = from64to16(result);
> +       if (odd)
> +               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> +       return result;
> +}
> +#endif
> +
> +
> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
> +{
> +       if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
> +               unsigned int res;
> +
> +               kernel_neon_begin();
> +               res = do_csum_neon(buff, len);
> +               kernel_neon_end();
> +               return res;
> +       } else
> +               return do_csum_generic(buff, len);
> +}
> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
> index 34785c0..041063c 100644
> --- a/include/asm-generic/checksum.h
> +++ b/include/asm-generic/checksum.h
> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
>   */
>  extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
>                                         int len, __wsum sum, int *csum_err);
> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>

Surely, we can find a better way to hook up this code than modifying
headers under asm-generic and code in lib/checksum.c

Is arm64 the only arch that has an optimized checksum() function?

>  #ifndef csum_partial_copy_nocheck
>  #define csum_partial_copy_nocheck(src, dst, len, sum)  \
> diff --git a/lib/checksum.c b/lib/checksum.c
> index d3ec93f..83392db 100644
> --- a/lib/checksum.c
> +++ b/lib/checksum.c
> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>         return x;
>  }
>
> -static unsigned int do_csum(const unsigned char *buff, int len)
> +unsigned int do_csum_generic(const unsigned char *buff, int len)
>  {
>         int odd;
>         unsigned int result = 0;
> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
>  out:
>         return result;
>  }
> +
> +static unsigned int do_csum(const unsigned char *buff, int len)
> +{
> +       return do_csum_generic(buff, len);
> +}
> +
>  #endif
>
>  #ifndef ip_fast_csum
> --
> 2.7.4
>
>
huanglingyan (A) Jan. 5, 2019, 2:20 a.m. UTC | #2
On 2019/1/4 2:19, Ard Biesheuvel wrote:
> On Thu, 3 Jan 2019 at 13:32, Lingyan Huang <huanglingyan2@huawei.com> wrote:
>> Function do_csum() in lib/checksum.c is used to compute the checksum,
>> which turns out to be slow and to cost a lot of CPU time.
>> Let's use NEON instructions to accelerate the checksum computation
>> for arm64.
>>
>> ------
>> V1 ==> V2:
>>     Change NEON assembly code to NEON intrinsic code which is built
>>     on top of arm_neon.h to avoid dropping into assembly.
>> ------
>>
>> Here are the comparison results for ip_compute_csum() with the generic
>> do_csum() and the NEON do_csum(). Times are the total for 1000 calls at
>> each length.
>>
>>         len (1000 calls)      general (ns)       do_csum_neon (ns)
>>           64B:                58060                 59460
>>          128B:                82930                 83930
>>          256B:               132480                 73570
>>          512B:               230100                 86230
>>         1024B:               426600                 98200
>>
> Very nice! Which CPU did you test this on?


Thank you for your reply. The test platform is Huawei hip08.


>> ---
>>  arch/arm64/lib/Makefile        |   4 ++
>>  arch/arm64/lib/checksum.c      | 140 +++++++++++++++++++++++++++++++++++++++++
>>  include/asm-generic/checksum.h |   1 +
>>  lib/checksum.c                 |   8 ++-
>>  4 files changed, 152 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/arm64/lib/checksum.c
>>
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index 5540a16..ec2fcd3 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -9,6 +9,10 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>>  obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
>>  CFLAGS_REMOVE_xor-neon.o       += -mgeneral-regs-only
>>  CFLAGS_xor-neon.o              += -ffreestanding
>> +
>> +obj-y    += checksum.o
>> +CFLAGS_REMOVE_checksum.o       += -mgeneral-regs-only
>> +CFLAGS_checksum.o              += -ffreestanding
>>  endif
>>
>>  # Tell the compiler to treat all general purpose registers (with the
>> diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
>> new file mode 100644
>> index 0000000..48f4ead
>> --- /dev/null
>> +++ b/arch/arm64/lib/checksum.c
>> @@ -0,0 +1,140 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * arch/arm64/lib/checksum.c
> Drop this line


OK.


>> + *
>> + * Authors: Lingyan Huang <huanglingyan2@huawei.com>
>> + * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
>> + *
>> + * Generic C or neon implementation of do_csum operations.
>> + * Choose faster neon instructions when NEON is supported.
>> + *
>> + */
>> +
>> +#include <asm/neon.h>
>> +#include <asm/simd.h>
>> +#include <asm/checksum.h>
>> +#include <asm/byteorder.h>
>> +#include <asm/neon-intrinsics.h>
>> +
>> +#define CSUM_NEON_THRESHOLD 128
>> +#ifdef CONFIG_KERNEL_MODE_NEON
>> +static inline u32 from64to16(u64 x)
>> +{
>> +       /* add up 32-bit and 32-bit for 32+c bit */
>> +       x = (x & 0xffffffff) + (x >> 32);
>> +       /* add up carry.. */
>> +       x = (x & 0xffffffff) + (x >> 32);
>> +       /* add up 16-bit and 16-bit for 16+c bit */
>> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> +       /* add up carry.. */
>> +       x = ((u32)x & 0xffff) + ((u32)x >> 16);
>> +       return x;
>> +}
>> +
>> +unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
>> +{
>> +       unsigned int odd, count;
>> +       uint64_t result = 0;
>> +       unsigned int count64;
>> +       uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
>> +
>> +       register uint32x4_t v0, v1, v2, v3;
>> +
>> +       if (unlikely(len <= 0))
> len is unsigned so '== 0' is sufficient


OK


>> +               return result;
>> +
>> +       odd = 1 & (unsigned long) buff;
> No space after () cast please


I have fixed all such occurrences in the V3 patch.


>> +       if (unlikely(odd)) {
>> +               result = *buff;
>> +               len--;
>> +               buff++;
>> +       }
>> +
>> +       count = len >> 1;
>> +       if (count) {
>> +               if (2 & (unsigned long) buff) {
>> +                       result += *(unsigned short *)buff;
>> +                       count--;
>> +                       len -= 2;
>> +                       buff += 2;
>> +               }
>> +               count >>= 1;            /* nr of 32-bit words.. */
>> +               if (count) {
>> +                       if (4 & (unsigned long) buff) {
>> +                               result += *(unsigned int *) buff;
>> +                               count--;
>> +                               len -= 4;
>> +                               buff += 4;
>> +                       }
>> +                       count >>= 1;    /* nr of 64-bit words.. */
>> +
>> +                       v0 = vzero;
>> +                       v1 = vzero;
>> +                       v2 = vzero;
>> +                       v3 = vzero;
>> +
>> +                       count64 = count >> 3;  /* compute 64 Byte circle */
>> +                       while (count64) {
>> +                               v0 = vpadalq_u16(v0,
>> +                                       vld1q_u16((uint16_t *)buff + 0));
>> +                               v1 = vpadalq_u16(v1,
>> +                                       vld1q_u16((uint16_t *)buff + 8));
>> +                               v2 = vpadalq_u16(v2,
>> +                                       vld1q_u16((uint16_t *)buff + 16));
>> +                               v3 = vpadalq_u16(v3,
>> +                                       vld1q_u16((uint16_t *)buff + 24));
>> +                               buff += 64;
>> +                               count64--;
>> +                       }
>> +                       v0 = vaddq_u32(v0, v1);
>> +                       v2 = vaddq_u32(v2, v3);
>> +                       v0 = vaddq_u32(v0, v2);
>> +
>> +                       count %= 8;
>> +                       while (count >= 2) { /* compute 16 byte circle */
>> +                               v0 = vpadalq_u16(v0,
>> +                                       vld1q_u16((uint16_t *)buff + 0));
>> +                               buff += 16;
>> +                               count -= 2;
>> +                       }
>> +
>> +                       result += vgetq_lane_u32(v0, 0);
>> +                       result += vgetq_lane_u32(v0, 1);
>> +                       result += vgetq_lane_u32(v0, 2);
>> +                       result += vgetq_lane_u32(v0, 3);
>> +                       if (count & 1) {
>> +                               result += *(unsigned long long *) buff;
>> +                               buff += 8;
>> +                       }
>> +                       if (len & 4) {
>> +                               result += *(unsigned int *) buff;
>> +                               buff += 4;
>> +                       }
>> +               }
>> +               if (len & 2) {
>> +                       result += *(unsigned short *) buff;
>> +                       buff += 2;
>> +               }
>> +       }
>> +       if (len & 1)
>> +               result += *buff;
>> +       result = from64to16(result);
>> +       if (odd)
>> +               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
>> +       return result;
>> +}
>> +#endif
>> +
>> +
>> +unsigned int do_csum(const unsigned char *buff, unsigned int len)
>> +{
>> +       if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
>> +               unsigned int res;
>> +
>> +               kernel_neon_begin();
>> +               res = do_csum_neon(buff, len);
>> +               kernel_neon_end();
>> +               return res;
>> +       } else
>> +               return do_csum_generic(buff, len);
>> +}
>> diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
>> index 34785c0..041063c 100644
>> --- a/include/asm-generic/checksum.h
>> +++ b/include/asm-generic/checksum.h
>> @@ -33,6 +33,7 @@ extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
>>   */
>>  extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
>>                                         int len, __wsum sum, int *csum_err);
>> +extern unsigned int do_csum_generic(const unsigned char *buff, int len);
>>
> Surely, we can find a better way to hook up this code than modifying
> headers under asm-generic and code in lib/checksum.c
>
> Is arm64 the only arch that has an optimized checksum() function?


Arm64 is not the only arch. Your suggestion is reasonable, so I am looking
for a better way that only touches the arm64 code.
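
A minimal sketch of such a self-contained hook, assuming lib/checksum.c keeps
its existing "#ifndef do_csum" guard around the generic implementation: the
arm64 header claims do_csum, so nothing under asm-generic/ or lib/ needs to
change. do_csum_scalar() is a hypothetical name for an arch-local copy of the
generic byte/word loop used as the small-buffer fallback; CSUM_NEON_THRESHOLD
and do_csum_neon() are the ones from this patch.

/* arch/arm64/include/asm/checksum.h */
unsigned int do_csum(const unsigned char *buff, int len);
#define do_csum do_csum

/* arch/arm64/lib/checksum.c */
#include <asm/checksum.h>
#include <asm/neon.h>
#include <asm/simd.h>

unsigned int do_csum(const unsigned char *buff, int len)
{
        unsigned int res;

        if (len < CSUM_NEON_THRESHOLD || !may_use_simd())
                return do_csum_scalar(buff, len);

        kernel_neon_begin();
        res = do_csum_neon(buff, len);
        kernel_neon_end();
        return res;
}

With this shape, csum_partial() in lib/checksum.c picks up the arm64
do_csum() through the define, and the generic do_csum() simply is not
compiled.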


>>  #ifndef csum_partial_copy_nocheck
>>  #define csum_partial_copy_nocheck(src, dst, len, sum)  \
>> diff --git a/lib/checksum.c b/lib/checksum.c
>> index d3ec93f..83392db 100644
>> --- a/lib/checksum.c
>> +++ b/lib/checksum.c
>> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>>         return x;
>>  }
>>
>> -static unsigned int do_csum(const unsigned char *buff, int len)
>> +unsigned int do_csum_generic(const unsigned char *buff, int len)
>>  {
>>         int odd;
>>         unsigned int result = 0;
>> @@ -100,6 +100,12 @@ static unsigned int do_csum(const unsigned char *buff, int len)
>>  out:
>>         return result;
>>  }
>> +
>> +static unsigned int do_csum(const unsigned char *buff, int len)
>> +{
>> +       return do_csum_generic(buff, len);
>> +}
>> +
>>  #endif
>>
>>  #ifndef ip_fast_csum
>> --
>> 2.7.4
>>
>>
> .
>

Patch

diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a16..ec2fcd3 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -9,6 +9,10 @@  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
 CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
 CFLAGS_xor-neon.o		+= -ffreestanding
+
+obj-y    += checksum.o
+CFLAGS_REMOVE_checksum.o	+= -mgeneral-regs-only
+CFLAGS_checksum.o		+= -ffreestanding
 endif
 
 # Tell the compiler to treat all general purpose registers (with the
diff --git a/arch/arm64/lib/checksum.c b/arch/arm64/lib/checksum.c
new file mode 100644
index 0000000..48f4ead
--- /dev/null
+++ b/arch/arm64/lib/checksum.c
@@ -0,0 +1,140 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/arm64/lib/checksum.c
+ *
+ * Authors: Lingyan Huang <huanglingyan2@huawei.com>
+ * Copyright (C) 2018 Hisilicon, Inc. All Rights Reserved.
+ *
+ * Generic C or neon implementation of do_csum operations.
+ * Choose faster neon instructions when NEON is supported.
+ *
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/checksum.h>
+#include <asm/byteorder.h>
+#include <asm/neon-intrinsics.h>
+
+#define CSUM_NEON_THRESHOLD 128
+#ifdef CONFIG_KERNEL_MODE_NEON
+static inline u32 from64to16(u64 x)
+{
+	/* add up 32-bit and 32-bit for 32+c bit */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* add up carry.. */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* add up 16-bit and 16-bit for 16+c bit */
+	x = ((u32)x & 0xffff) + ((u32)x >> 16);
+	/* add up carry.. */
+	x = ((u32)x & 0xffff) + ((u32)x >> 16);
+	return x;
+}
+
+unsigned int do_csum_neon(const unsigned char *buff, unsigned int len)
+{
+	unsigned int odd, count;
+	uint64_t result = 0;
+	unsigned int count64;
+	uint32x4_t vzero = (uint32x4_t){0, 0, 0, 0};
+
+	register uint32x4_t v0, v1, v2, v3;
+
+	if (unlikely(len <= 0))
+		return result;
+
+	odd = 1 & (unsigned long) buff;
+	if (unlikely(odd)) {
+		result = *buff;
+		len--;
+		buff++;
+	}
+
+	count = len >> 1;
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;            /* nr of 32-bit words.. */
+		if (count) {
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;    /* nr of 64-bit words.. */
+
+			v0 = vzero;
+			v1 = vzero;
+			v2 = vzero;
+			v3 = vzero;
+
+			count64 = count >> 3;  /* compute 64 Byte circle */
+			while (count64) {
+				v0 = vpadalq_u16(v0,
+					vld1q_u16((uint16_t *)buff + 0));
+				v1 = vpadalq_u16(v1,
+					vld1q_u16((uint16_t *)buff + 8));
+				v2 = vpadalq_u16(v2,
+					vld1q_u16((uint16_t *)buff + 16));
+				v3 = vpadalq_u16(v3,
+					vld1q_u16((uint16_t *)buff + 24));
+				buff += 64;
+				count64--;
+			}
+			v0 = vaddq_u32(v0, v1);
+			v2 = vaddq_u32(v2, v3);
+			v0 = vaddq_u32(v0, v2);
+
+			count %= 8;
+			while (count >= 2) { /* compute 16 byte circle */
+				v0 = vpadalq_u16(v0,
+					vld1q_u16((uint16_t *)buff + 0));
+				buff += 16;
+				count -= 2;
+			}
+
+			result += vgetq_lane_u32(v0, 0);
+			result += vgetq_lane_u32(v0, 1);
+			result += vgetq_lane_u32(v0, 2);
+			result += vgetq_lane_u32(v0, 3);
+			if (count & 1) {
+				result += *(unsigned long long *) buff;
+				buff += 8;
+			}
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+	result = from64to16(result);
+	if (odd)
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	return result;
+}
+#endif
+
+
+unsigned int do_csum(const unsigned char *buff, unsigned int len)
+{
+	if ((len >= CSUM_NEON_THRESHOLD) && may_use_simd()) {
+		unsigned int res;
+
+		kernel_neon_begin();
+		res = do_csum_neon(buff, len);
+		kernel_neon_end();
+		return res;
+	} else
+		return do_csum_generic(buff, len);
+}
diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h
index 34785c0..041063c 100644
--- a/include/asm-generic/checksum.h
+++ b/include/asm-generic/checksum.h
@@ -33,6 +33,7 @@  extern __wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
  */
 extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
 					int len, __wsum sum, int *csum_err);
+extern unsigned int do_csum_generic(const unsigned char *buff, int len);
 
 #ifndef csum_partial_copy_nocheck
 #define csum_partial_copy_nocheck(src, dst, len, sum)	\
diff --git a/lib/checksum.c b/lib/checksum.c
index d3ec93f..83392db 100644
--- a/lib/checksum.c
+++ b/lib/checksum.c
@@ -47,7 +47,7 @@  static inline unsigned short from32to16(unsigned int x)
 	return x;
 }
 
-static unsigned int do_csum(const unsigned char *buff, int len)
+unsigned int do_csum_generic(const unsigned char *buff, int len)
 {
 	int odd;
 	unsigned int result = 0;
@@ -100,6 +100,12 @@  static unsigned int do_csum(const unsigned char *buff, int len)
 out:
 	return result;
 }
+
+static unsigned int do_csum(const unsigned char *buff, int len)
+{
+	return do_csum_generic(buff, len);
+}
+
 #endif
 
 #ifndef ip_fast_csum