[v2,1/5] riscv: Checksum header

Message ID 20230905-optimize_checksum-v2-1-ccd658db743b@rivosinc.com (mailing list archive)
State Superseded
Series riscv: Add fine-tuned checksum functions

Checks

Context Check Description
conchuod/cover_letter success Series has a cover letter
conchuod/tree_selection success Guessed tree name to be for-next at HEAD 9e2864968816
conchuod/fixes_present success Fixes tag not required for -next series
conchuod/maintainers_pattern success MAINTAINERS pattern errors before the patch: 2 and now 2
conchuod/verify_signedoff success Signed-off-by tag matches author and committer
conchuod/kdoc success Errors and warnings before: 0 this patch: 0
conchuod/build_rv64_clang_allmodconfig fail Errors and warnings before: 9 this patch: 1506
conchuod/module_param success Was 0 now: 0
conchuod/build_rv64_gcc_allmodconfig fail Errors and warnings before: 9 this patch: 231
conchuod/build_rv32_defconfig success Build OK
conchuod/dtb_warn_rv64 success Errors and warnings before: 39 this patch: 39
conchuod/header_inline success No static functions without inline keyword in header files
conchuod/checkpatch warning WARNING: Avoid line continuations in quoted strings; WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?; WARNING: do not add new typedefs; WARNING: unnecessary whitespace before a quoted newline
conchuod/build_rv64_nommu_k210_defconfig success Build OK
conchuod/verify_fixes success No Fixes tag
conchuod/build_rv64_nommu_virt_defconfig success Build OK

Commit Message

Charlie Jenkins Sept. 6, 2023, 4:46 a.m. UTC
Provide checksum algorithms that have been designed to leverage riscv
instructions such as rotate. On 64-bit, the wider registers can be used
to avoid some overflow checking.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
---
 arch/riscv/include/asm/checksum.h | 96 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

Comments

Conor Dooley Sept. 7, 2023, 9:40 a.m. UTC | #1
Hey Charlie,

Just a passing thought that I can't really test since I am meant to be
on holidays...

On Tue, Sep 05, 2023 at 09:46:50PM -0700, Charlie Jenkins wrote:
> Provide checksum algorithms that have been designed to leverage riscv
> instructions such as rotate. On 64-bit, the wider registers can be used
> to avoid some overflow checking.
> 
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> ---
>  arch/riscv/include/asm/checksum.h | 96 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 96 insertions(+)
> 
> diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
> new file mode 100644
> index 000000000000..573714b9ea15
> --- /dev/null
> +++ b/arch/riscv/include/asm/checksum.h
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * IP checksum routines
> + *
> + * Copyright (C) 2023 Rivos Inc.
> + */
> +#ifndef __ASM_RISCV_CHECKSUM_H
> +#define __ASM_RISCV_CHECKSUM_H
> +
> +#include <linux/in6.h>
> +#include <linux/uaccess.h>
> +
> +#ifdef CONFIG_32BIT
> +typedef unsigned int csum_t;
> +#else
> +typedef unsigned long csum_t;
> +#endif
> +
> +/*
> + *	Fold a partial checksum without adding pseudo headers
> + */
> +static inline __sum16 csum_fold(__wsum sum)
> +{
> +	sum += (sum >> 16) | (sum << 16);
> +	return (__force __sum16)(~(sum >> 16));
> +}
> +
> +#define csum_fold csum_fold
> +
> +/*
> + * Quickly compute an IP checksum with the assumption that IPv4 headers will
> + * always be in multiples of 32-bits, and have an ihl of at least 5.
> + * @ihl is the number of 32 bit segments and must be greater than or equal to 5.
> + * @iph is assumed to be word aligned.
> + */
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> +	csum_t csum = 0;
> +	int pos = 0;
> +
> +	do {
> +		csum += ((const unsigned int *)iph)[pos];
> +#ifdef CONFIG_32BIT
> +		csum += csum < ((const unsigned int *)iph)[pos];
> +#endif // CONFIG_32BIT

Some of this ifdeffery really should become IS_ENABLED(), there's
nothing in some of them that can't just get removed by the compiler.

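For example (untested), the 32-bit carry check could become:

	csum += ((const unsigned int *)iph)[pos];
	if (IS_ENABLED(CONFIG_32BIT))
		csum += csum < ((const unsigned int *)iph)[pos];

and the compiler should then just delete the compare for 64-bit builds.
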
> +	} while (++pos < ihl);
> +
> +#ifdef CONFIG_RISCV_ISA_ZBB

This here I can't test since I'm supposed to be AFK, but can this also
be an IS_ENABLED()? I know it is guarding code that the toolchain may
not support, but does it get removed before that matters?

I mainly ask because there's a host of ifdeffery here & the code would
be a lot easier to understand if we could cut it down to a minimum.

> +	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
> +		csum_t fold_temp;
> +
> +		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
> +					      RISCV_ISA_EXT_ZBB, 1)
> +		    :
> +		    :
> +		    :
> +		    : no_zbb);
> +#ifdef CONFIG_32BIT
> +		asm(".option push				\n\
> +		.option arch,+zbb				\n\
> +			rori %[fold_temp],%[csum],16		\n\
> +			add %[csum],%[fold_temp],%[csum]	\n\
> +		.option pop"
> +		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
> +#else // !CONFIG_32BIT
> +		asm(".option push				\n\
> +		.option arch,+zbb				\n\
> +		    rori %[fold_temp], %[csum], 32		\n\
> +		    add %[csum], %[fold_temp], %[csum]		\n\
> +		    srli %[csum], %[csum], 32			\n\
> +		    roriw %[fold_temp], %[csum], 16		\n\
> +		    addw %[csum], %[fold_temp], %[csum]		\n\
> +		.option pop"
> +		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
> +#endif // !CONFIG_32BIT

These 3 I think are pretty easy to follow immediately, but...

> +		return ~(csum >> 16);
> +	}
> +	/*
> +	 * ZBB only saves three instructions on 32-bit and five on 64-bit so not
> +	 * worth checking if supported without Alternatives.
> +	 */
> +no_zbb:
> +#endif // CONFIG_RISCV_ISA_ZBB
> +#ifdef CONFIG_32BIT
> +#else // !CONFIG_32BIT
> +	csum += (csum >> 32) | (csum << 32);
> +	csum >>= 16;
> +#endif // !CONFIG_32BIT

...these ones here could be converted too, afaict.

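i.e. something like (again untested, and note csum_t is only 32 bits
wide when CONFIG_32BIT=y, so the otherwise-dead shifts would still have
to compile cleanly there):

	if (!IS_ENABLED(CONFIG_32BIT)) {
		csum += (csum >> 32) | (csum << 32);
		csum >>= 16;
	}
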
Thanks,
Conor.

> +	return csum_fold((__force __wsum)csum);
> +}
> +
> +#define ip_fast_csum ip_fast_csum
> +
> +#include <asm-generic/checksum.h>
> +
> +#endif // __ASM_RISCV_CHECKSUM_H
> 
> -- 
> 2.42.0
>
Charlie Jenkins Sept. 7, 2023, 5:44 p.m. UTC | #2
On Thu, Sep 07, 2023 at 10:40:54AM +0100, Conor Dooley wrote:
> Hey Charlie,
> 
> Just a passing thought that I can't really test since I am meant to be
> on holidays...
> 
> On Tue, Sep 05, 2023 at 09:46:50PM -0700, Charlie Jenkins wrote:
> 
> > ...
> 
> > +	do {
> > +		csum += ((const unsigned int *)iph)[pos];
> > +#ifdef CONFIG_32BIT
> > +		csum += csum < ((const unsigned int *)iph)[pos];
> > +#endif // CONFIG_32BIT
> 
> Some of this ifdeffery really should become IS_ENABLED(), there's
> nothing in some of them that can't just get removed by the compiler.
> 
> > +	} while (++pos < ihl);
> > +
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> 
> This here I can't test since I'm supposed to be AFK, but can this also
> be an IS_ENABLED()? I know it is guarding code that the toolchain may
> not support, but does it get removed before that matters?
> 
> I mainly ask because there's a host of ifdeffery here & the code would
> be a lot easier to understand if we could cut it down to a minimum.
> 
> > +	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
> > ...
> 
> These 3 I think are pretty easy to follow immediately, but...
> 
> > ...
> > +#ifdef CONFIG_32BIT
> > +#else // !CONFIG_32BIT
> > +	csum += (csum >> 32) | (csum << 32);
> > +	csum >>= 16;
> > +#endif // !CONFIG_32BIT
> 
> ...these ones here could be converted too, afaict.
> 
> Thanks,
> Conor.
> 
That should make it look much cleaner; I will switch over to IS_ENABLED.

- Charlie
David Laight Sept. 10, 2023, 9:20 p.m. UTC | #3
...
> > > +/*
> > > + *	Fold a partial checksum without adding pseudo headers
> > > + */
> > > +static inline __sum16 csum_fold(__wsum sum)
> > > +{
> > > +	sum += (sum >> 16) | (sum << 16);
> > > +	return (__force __sum16)(~(sum >> 16));
> > > +}

I'm intrigued, gcc normally compiles that quite well.
The very similar (from arch/arc):
	return (~sum - rol32(sum, 16)) >> 16;
is slightly better on most architectures.
(Especially if the ~sum and rol() can be executed together.)

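For riscv that'd be something like (untested sketch, rol32 as in
<linux/bitops.h>):

	static inline __sum16 csum_fold(__wsum sum)
	{
		u32 tmp = (__force u32)sum;

		/* invert first, subtract the rotate, take the high half */
		return (__force __sum16)((~tmp - rol32(tmp, 16)) >> 16);
	}
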
The only odd archs I saw were sparc32 (carry flag but no rotate)
and arm (barrel shifter on all instructions).

It is better than the current asm for a lot of archs including x64.

	David

Charlie Jenkins Sept. 11, 2023, 6:16 p.m. UTC | #4
On Sun, Sep 10, 2023 at 09:20:33PM +0000, David Laight wrote:
> ...
> > > > +/*
> > > > + *	Fold a partial checksum without adding pseudo headers
> > > > + */
> > > > +static inline __sum16 csum_fold(__wsum sum)
> > > > +{
> > > > +	sum += (sum >> 16) | (sum << 16);
> > > > +	return (__force __sum16)(~(sum >> 16));
> > > > +}
> 
> I'm intrigued, gcc normally compiles that quite well.
> The very similar (from arch/arc):
> 	return (~sum - rol32(sum, 16)) >> 16;
> is slightly better on most architectures.
> (Especially if the ~sum and rol() can be executed together.)
Oh wow, that solves the problem of needing to zero-extend the result
after taking the not. Taking the shift after the not both zero-extends
and places the result in the correct spot. It is a very interesting
property that the order of the operations can be swapped if the
addition becomes a subtraction. I will make the change. Thank you!

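(For the record, the identity is just two's complement: since
~x == -x - 1,

	~(sum + rol32(sum, 16)) == ~sum - rol32(sum, 16)

so doing the not first and turning the add into a subtract yields
exactly the same value, and the trailing ">> 16" then needs no extra
masking.)
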
- Charlie

Patch

diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
new file mode 100644
index 000000000000..573714b9ea15
--- /dev/null
+++ b/arch/riscv/include/asm/checksum.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * IP checksum routines
+ *
+ * Copyright (C) 2023 Rivos Inc.
+ */
+#ifndef __ASM_RISCV_CHECKSUM_H
+#define __ASM_RISCV_CHECKSUM_H
+
+#include <linux/in6.h>
+#include <linux/uaccess.h>
+
+#ifdef CONFIG_32BIT
+typedef unsigned int csum_t;
+#else
+typedef unsigned long csum_t;
+#endif
+
+/*
+ *	Fold a partial checksum without adding pseudo headers
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	sum += (sum >> 16) | (sum << 16);
+	return (__force __sum16)(~(sum >> 16));
+}
+
+#define csum_fold csum_fold
+
+/*
+ * Quickly compute an IP checksum with the assumption that IPv4 headers will
+ * always be in multiples of 32-bits, and have an ihl of at least 5.
+ * @ihl is the number of 32 bit segments and must be greater than or equal to 5.
+ * @iph is assumed to be word aligned.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	csum_t csum = 0;
+	int pos = 0;
+
+	do {
+		csum += ((const unsigned int *)iph)[pos];
+#ifdef CONFIG_32BIT
+		csum += csum < ((const unsigned int *)iph)[pos];
+#endif // CONFIG_32BIT
+	} while (++pos < ihl);
+
+#ifdef CONFIG_RISCV_ISA_ZBB
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+		csum_t fold_temp;
+
+		asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0,
+					      RISCV_ISA_EXT_ZBB, 1)
+		    :
+		    :
+		    :
+		    : no_zbb);
+#ifdef CONFIG_32BIT
+		asm(".option push				\n\
+		.option arch,+zbb				\n\
+			rori %[fold_temp],%[csum],16		\n\
+			add %[csum],%[fold_temp],%[csum]	\n\
+		.option pop"
+		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+#else // !CONFIG_32BIT
+		asm(".option push				\n\
+		.option arch,+zbb				\n\
+		    rori %[fold_temp], %[csum], 32		\n\
+		    add %[csum], %[fold_temp], %[csum]		\n\
+		    srli %[csum], %[csum], 32			\n\
+		    roriw %[fold_temp], %[csum], 16		\n\
+		    addw %[csum], %[fold_temp], %[csum]		\n\
+		.option pop"
+		: [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp));
+#endif // !CONFIG_32BIT
+		return ~(csum >> 16);
+	}
+	/*
+	 * ZBB only saves three instructions on 32-bit and five on 64-bit so not
+	 * worth checking if supported without Alternatives.
+	 */
+no_zbb:
+#endif // CONFIG_RISCV_ISA_ZBB
+#ifdef CONFIG_32BIT
+#else // !CONFIG_32BIT
+	csum += (csum >> 32) | (csum << 32);
+	csum >>= 16;
+#endif // !CONFIG_32BIT
+	return csum_fold((__force __wsum)csum);
+}
+
+#define ip_fast_csum ip_fast_csum
+
+#include <asm-generic/checksum.h>
+
+#endif // __ASM_RISCV_CHECKSUM_H