diff mbox series

arm64: csum: Optimise IPv6 header checksum

Message ID 3a723a4b08938154c37febe2504f029c4480e53c.1579546194.git.robin.murphy@arm.com (mailing list archive)
State Mainlined
Commit e9c7ddbf8b4b6a291bf3b5bfa7c883235164d9be
Headers show
Series arm64: csum: Optimise IPv6 header checksum | expand

Commit Message

Robin Murphy Jan. 20, 2020, 6:52 p.m. UTC
Throwing our __uint128_t idioms at csum_ipv6_magic() makes it
about 1.3x-2x faster across a range of microarchitecture/compiler
combinations. Not much in absolute terms, but every little helps.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

Before I move on, this seemed like it might be worth touching as well,
comparing what other architectures do.

 arch/arm64/include/asm/checksum.h |  7 ++++++-
 arch/arm64/lib/csum.c             | 27 +++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

Comments

Will Deacon Jan. 21, 2020, 10:34 a.m. UTC | #1
[+ Shaokun and Lingyan for review and testing feedback]

On Mon, Jan 20, 2020 at 06:52:29PM +0000, Robin Murphy wrote:
> Throwing our __uint128_t idioms at csum_ipv6_magic() makes it
> about 1.3x-2x faster across a range of microarchitecture/compiler
> combinations. Not much in absolute terms, but every little helps.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> Before I move on, this seemed like it might be worth touching as well,
> comparing what other architectures do.
> 
>  arch/arm64/include/asm/checksum.h |  7 ++++++-
>  arch/arm64/lib/csum.c             | 27 +++++++++++++++++++++++++++
>  2 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 8d2a7de39744..b6f7bc6da5fb 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -5,7 +5,12 @@
>  #ifndef __ASM_CHECKSUM_H
>  #define __ASM_CHECKSUM_H
>  
> -#include <linux/types.h>
> +#include <linux/in6.h>
> +
> +#define _HAVE_ARCH_IPV6_CSUM
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +			const struct in6_addr *daddr,
> +			__u32 len, __u8 proto, __wsum sum);
>  
>  static inline __sum16 csum_fold(__wsum csum)
>  {
> diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
> index 847eb725ce09..4a522e45f23b 100644
> --- a/arch/arm64/lib/csum.c
> +++ b/arch/arm64/lib/csum.c
> @@ -121,3 +121,30 @@ unsigned int do_csum(const unsigned char *buff, int len)
>  
>  	return sum >> 16;
>  }
> +
> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
> +			const struct in6_addr *daddr,
> +			__u32 len, __u8 proto, __wsum csum)
> +{
> +	__uint128_t src, dst;
> +	u64 sum = (__force u64)csum;
> +
> +	src = *(const __uint128_t *)saddr->s6_addr;
> +	dst = *(const __uint128_t *)daddr->s6_addr;
> +
> +	sum += (__force u32)htonl(len);
> +#ifdef __LITTLE_ENDIAN
> +	sum += (u32)proto << 24;
> +#else
> +	sum += proto;
> +#endif
> +	src += (src >> 64) | (src << 64);
> +	dst += (dst >> 64) | (dst << 64);
> +
> +	sum = accumulate(sum, src >> 64);
> +	sum = accumulate(sum, dst >> 64);
> +
> +	sum += ((sum >> 32) | (sum << 32));
> +	return csum_fold((__force __wsum)(sum >> 32));
> +}
> +EXPORT_SYMBOL(csum_ipv6_magic);
> -- 
> 2.23.0.dirty
>
Shaokun Zhang Feb. 3, 2020, 9:29 a.m. UTC | #2
Hi Will/Robin,

My apologies for the slow reply because of the Spring Festival in China.

Robin's idea sounds nice. We will test it later, because our machine
has broken down.

Thanks,
Shaokun

On 2020/1/21 18:34, Will Deacon wrote:
> [+ Shaokun and Lingyan for review and testing feedback]
> 
> On Mon, Jan 20, 2020 at 06:52:29PM +0000, Robin Murphy wrote:
>> Throwing our __uint128_t idioms at csum_ipv6_magic() makes it
>> about 1.3x-2x faster across a range of microarchitecture/compiler
>> combinations. Not much in absolute terms, but every little helps.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>
>> Before I move on, this seemed like it might be worth touching as well,
>> comparing what other architectures do.
>>
>>  arch/arm64/include/asm/checksum.h |  7 ++++++-
>>  arch/arm64/lib/csum.c             | 27 +++++++++++++++++++++++++++
>>  2 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
>> index 8d2a7de39744..b6f7bc6da5fb 100644
>> --- a/arch/arm64/include/asm/checksum.h
>> +++ b/arch/arm64/include/asm/checksum.h
>> @@ -5,7 +5,12 @@
>>  #ifndef __ASM_CHECKSUM_H
>>  #define __ASM_CHECKSUM_H
>>  
>> -#include <linux/types.h>
>> +#include <linux/in6.h>
>> +
>> +#define _HAVE_ARCH_IPV6_CSUM
>> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>> +			const struct in6_addr *daddr,
>> +			__u32 len, __u8 proto, __wsum sum);
>>  
>>  static inline __sum16 csum_fold(__wsum csum)
>>  {
>> diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
>> index 847eb725ce09..4a522e45f23b 100644
>> --- a/arch/arm64/lib/csum.c
>> +++ b/arch/arm64/lib/csum.c
>> @@ -121,3 +121,30 @@ unsigned int do_csum(const unsigned char *buff, int len)
>>  
>>  	return sum >> 16;
>>  }
>> +
>> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>> +			const struct in6_addr *daddr,
>> +			__u32 len, __u8 proto, __wsum csum)
>> +{
>> +	__uint128_t src, dst;
>> +	u64 sum = (__force u64)csum;
>> +
>> +	src = *(const __uint128_t *)saddr->s6_addr;
>> +	dst = *(const __uint128_t *)daddr->s6_addr;
>> +
>> +	sum += (__force u32)htonl(len);
>> +#ifdef __LITTLE_ENDIAN
>> +	sum += (u32)proto << 24;
>> +#else
>> +	sum += proto;
>> +#endif
>> +	src += (src >> 64) | (src << 64);
>> +	dst += (dst >> 64) | (dst << 64);
>> +
>> +	sum = accumulate(sum, src >> 64);
>> +	sum = accumulate(sum, dst >> 64);
>> +
>> +	sum += ((sum >> 32) | (sum << 32));
>> +	return csum_fold((__force __wsum)(sum >> 32));
>> +}
>> +EXPORT_SYMBOL(csum_ipv6_magic);
>> -- 
>> 2.23.0.dirty
>>
> 
> .
>
chenzhou Feb. 11, 2020, 8:35 a.m. UTC | #3
Hi Will/Robin/Shaokun,

Shaokun's machine broke down, so I tested it.

On a KunPeng920 board, the optimised IPv6 header checksum gives about
a 1.2x performance gain; my gcc version is 7.3.0.

Thanks,
Chen Zhou

On 2020/2/3 17:29, Shaokun Zhang wrote:
> Hi Will/Robin,
> 
> My apologies for the slow reply because of the Spring Festival in China. 
> 
> Robin's idea sounds nice, We will test it later because our machine
> broke down.
> 
> Thanks,
> Shaokun
> 
> On 2020/1/21 18:34, Will Deacon wrote:
>> [+ Shaokun and Lingyan for review and testing feedback]
>>
>> On Mon, Jan 20, 2020 at 06:52:29PM +0000, Robin Murphy wrote:
>>> Throwing our __uint128_t idioms at csum_ipv6_magic() makes it
>>> about 1.3x-2x faster across a range of microarchitecture/compiler
>>> combinations. Not much in absolute terms, but every little helps.
>>>
>>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>>> ---
>>>
>>> Before I move on, this seemed like it might be worth touching as well,
>>> comparing what other architectures do.
>>>
>>>  arch/arm64/include/asm/checksum.h |  7 ++++++-
>>>  arch/arm64/lib/csum.c             | 27 +++++++++++++++++++++++++++
>>>  2 files changed, 33 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
>>> index 8d2a7de39744..b6f7bc6da5fb 100644
>>> --- a/arch/arm64/include/asm/checksum.h
>>> +++ b/arch/arm64/include/asm/checksum.h
>>> @@ -5,7 +5,12 @@
>>>  #ifndef __ASM_CHECKSUM_H
>>>  #define __ASM_CHECKSUM_H
>>>  
>>> -#include <linux/types.h>
>>> +#include <linux/in6.h>
>>> +
>>> +#define _HAVE_ARCH_IPV6_CSUM
>>> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>>> +			const struct in6_addr *daddr,
>>> +			__u32 len, __u8 proto, __wsum sum);
>>>  
>>>  static inline __sum16 csum_fold(__wsum csum)
>>>  {
>>> diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
>>> index 847eb725ce09..4a522e45f23b 100644
>>> --- a/arch/arm64/lib/csum.c
>>> +++ b/arch/arm64/lib/csum.c
>>> @@ -121,3 +121,30 @@ unsigned int do_csum(const unsigned char *buff, int len)
>>>  
>>>  	return sum >> 16;
>>>  }
>>> +
>>> +__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>>> +			const struct in6_addr *daddr,
>>> +			__u32 len, __u8 proto, __wsum csum)
>>> +{
>>> +	__uint128_t src, dst;
>>> +	u64 sum = (__force u64)csum;
>>> +
>>> +	src = *(const __uint128_t *)saddr->s6_addr;
>>> +	dst = *(const __uint128_t *)daddr->s6_addr;
>>> +
>>> +	sum += (__force u32)htonl(len);
>>> +#ifdef __LITTLE_ENDIAN
>>> +	sum += (u32)proto << 24;
>>> +#else
>>> +	sum += proto;
>>> +#endif
>>> +	src += (src >> 64) | (src << 64);
>>> +	dst += (dst >> 64) | (dst << 64);
>>> +
>>> +	sum = accumulate(sum, src >> 64);
>>> +	sum = accumulate(sum, dst >> 64);
>>> +
>>> +	sum += ((sum >> 32) | (sum << 32));
>>> +	return csum_fold((__force __wsum)(sum >> 32));
>>> +}
>>> +EXPORT_SYMBOL(csum_ipv6_magic);
>>> -- 
>>> 2.23.0.dirty
>>>
>>
>> .
>>
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
> 
> .
>
Catalin Marinas March 9, 2020, 6:09 p.m. UTC | #4
On Mon, Jan 20, 2020 at 06:52:29PM +0000, Robin Murphy wrote:
> Throwing our __uint128_t idioms at csum_ipv6_magic() makes it
> about 1.3x-2x faster across a range of microarchitecture/compiler
> combinations. Not much in absolute terms, but every little helps.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

Queued for 5.7. Thanks.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 8d2a7de39744..b6f7bc6da5fb 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -5,7 +5,12 @@ 
 #ifndef __ASM_CHECKSUM_H
 #define __ASM_CHECKSUM_H
 
-#include <linux/types.h>
+#include <linux/in6.h>
+
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
 
 static inline __sum16 csum_fold(__wsum csum)
 {
diff --git a/arch/arm64/lib/csum.c b/arch/arm64/lib/csum.c
index 847eb725ce09..4a522e45f23b 100644
--- a/arch/arm64/lib/csum.c
+++ b/arch/arm64/lib/csum.c
@@ -121,3 +121,30 @@  unsigned int do_csum(const unsigned char *buff, int len)
 
 	return sum >> 16;
 }
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t src, dst;
+	u64 sum = (__force u64)csum;
+
+	src = *(const __uint128_t *)saddr->s6_addr;
+	dst = *(const __uint128_t *)daddr->s6_addr;
+
+	sum += (__force u32)htonl(len);
+#ifdef __LITTLE_ENDIAN
+	sum += (u32)proto << 24;
+#else
+	sum += proto;
+#endif
+	src += (src >> 64) | (src << 64);
+	dst += (dst >> 64) | (dst << 64);
+
+	sum = accumulate(sum, src >> 64);
+	sum = accumulate(sum, dst >> 64);
+
+	sum += ((sum >> 32) | (sum << 32));
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);