
[3/3] riscv: optimized memset

Message ID 20240128111013.2450-4-jszhang@kernel.org
State Changes Requested
Series riscv: optimize memcpy/memmove/memset

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-3-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-3-test-2 fail .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-3-test-3 fail .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-3-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-3-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-3-test-6 warning .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-3-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-3-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-3-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-3-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-3-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-3-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Jisheng Zhang Jan. 28, 2024, 11:10 a.m. UTC
From: Matteo Croce <mcroce@microsoft.com>

The generic memset is defined as a byte-at-a-time write. This is always
safe, but it's slower than a 4-byte or even 8-byte write.

Write a generic memset which fills the data one byte at a time until the
destination is aligned, then fills using the largest size allowed,
and finally fills the remaining data one byte at a time.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 arch/riscv/include/asm/string.h |   4 +-
 arch/riscv/kernel/riscv_ksyms.c |   2 -
 arch/riscv/lib/Makefile         |   1 -
 arch/riscv/lib/memset.S         | 113 --------------------------------
 arch/riscv/lib/string.c         |  41 ++++++++++++
 arch/riscv/purgatory/Makefile   |   5 +-
 6 files changed, 44 insertions(+), 122 deletions(-)
 delete mode 100644 arch/riscv/lib/memset.S

Comments

Nick Kossifidis Jan. 30, 2024, 12:07 p.m. UTC | #1
On 1/28/24 13:10, Jisheng Zhang wrote:
> diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
> index 20677c8067da..022edda68f1c 100644
> --- a/arch/riscv/lib/string.c
> +++ b/arch/riscv/lib/string.c
> @@ -144,3 +144,44 @@ void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmov
>   EXPORT_SYMBOL(memmove);
>   void *__pi_memmove(void *dest, const void *src, size_t count) __alias(__memmove);
>   void *__pi___memmove(void *dest, const void *src, size_t count) __alias(__memmove);
> +
> +void *__memset(void *s, int c, size_t count)
> +{
> +	union types dest = { .as_u8 = s };
> +
> +	if (count >= MIN_THRESHOLD) {
> +		unsigned long cu = (unsigned long)c;
> +
> +		/* Compose an ulong with 'c' repeated 4/8 times */
> +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
> +		cu *= 0x0101010101010101UL;
> +#else
> +		cu |= cu << 8;
> +		cu |= cu << 16;
> +		/* Suppress warning on 32 bit machines */
> +		cu |= (cu << 16) << 16;
> +#endif

I guess you could check against __SIZEOF_LONG__ here.
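
For example, a sketch of that check, mirroring the mask composition in the
memchr example further down:

		cu |= cu << 8;
		cu |= cu << 16;
#if __SIZEOF_LONG__ == 8
		cu |= cu << 32;
#endif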

> +		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
> +			/*
> +			 * Fill the buffer one byte at time until
> +			 * the destination is word aligned.
> +			 */
> +			for (; count && dest.as_uptr & WORD_MASK; count--)
> +				*dest.as_u8++ = c;
> +		}
> +
> +		/* Copy using the largest size allowed */
> +		for (; count >= BYTES_LONG; count -= BYTES_LONG)
> +			*dest.as_ulong++ = cu;
> +	}
> +
> +	/* copy the remainder */
> +	while (count--)
> +		*dest.as_u8++ = c;
> +
> +	return s;
> +}
> +EXPORT_SYMBOL(__memset);

BTW a similar approach could be used for memchr, e.g.:

#if __SIZEOF_LONG__ == 8
#define HAS_ZERO(_x) (((_x) - 0x0101010101010101ULL) & ~(_x) & 0x8080808080808080ULL)
#else
#define HAS_ZERO(_x) (((_x) - 0x01010101UL) & ~(_x) & 0x80808080UL)
#endif

void *
memchr(const void *src_ptr, int c, size_t len)
{
	union const_data src = { .as_bytes = src_ptr };
	unsigned char byte = (unsigned char) c;
	unsigned long mask = (unsigned long) c;
	size_t remaining = len;

	/* Nothing to do */
	if (!src_ptr || !len)
		return NULL;

	if (len < 2 * WORD_SIZE)
		goto trailing;

	mask |= mask << 8;
	mask |= mask << 16;
#if __SIZEOF_LONG__ == 8
	mask |= mask << 32;
#endif

	/* Search by byte up to the src's alignment boundary */
	for(; src.as_uptr & WORD_MASK; remaining--, src.as_bytes++) {
		if (*src.as_bytes == byte)
			return (void*) src.as_bytes;
	}

	/* Search word by word using the mask */
	for(; remaining >= WORD_SIZE; remaining -= WORD_SIZE, src.as_ulong++) {
		unsigned long check = *src.as_ulong ^ mask;
		if(HAS_ZERO(check))
			break;
	}

  trailing:
	for(; remaining > 0; remaining--, src.as_bytes++) {
		if (*src.as_bytes == byte)
			return (void*) src.as_bytes;
	}

	return NULL;
}

Regards,
Nick
Jisheng Zhang Jan. 30, 2024, 1:25 p.m. UTC | #2
On Tue, Jan 30, 2024 at 02:07:37PM +0200, Nick Kossifidis wrote:
> On 1/28/24 13:10, Jisheng Zhang wrote:
> > diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
> > index 20677c8067da..022edda68f1c 100644
> > --- a/arch/riscv/lib/string.c
> > +++ b/arch/riscv/lib/string.c
> > @@ -144,3 +144,44 @@ void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmov
> >   EXPORT_SYMBOL(memmove);
> >   void *__pi_memmove(void *dest, const void *src, size_t count) __alias(__memmove);
> >   void *__pi___memmove(void *dest, const void *src, size_t count) __alias(__memmove);
> > +
> > +void *__memset(void *s, int c, size_t count)
> > +{
> > +	union types dest = { .as_u8 = s };
> > +
> > +	if (count >= MIN_THRESHOLD) {
> > +		unsigned long cu = (unsigned long)c;
> > +
> > +		/* Compose an ulong with 'c' repeated 4/8 times */
> > +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
> > +		cu *= 0x0101010101010101UL;

Here we need to check BITS_PER_LONG, use 0x01010101UL for rv32
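
For example, a sketch with the multiply constant chosen per BITS_PER_LONG:

#if BITS_PER_LONG == 64
		cu *= 0x0101010101010101UL;
#else
		cu *= 0x01010101UL;
#endif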

> > +#else
> > +		cu |= cu << 8;
> > +		cu |= cu << 16;
> > +		/* Suppress warning on 32 bit machines */
> > +		cu |= (cu << 16) << 16;
> > +#endif
> 
> I guess you could check against __SIZEOF_LONG__ here.

Hmm I believe we can remove the | and shift totally, and fall
back to ARCH_HAS_FAST_MULTIPLIER, see
https://lore.kernel.org/linux-riscv/20240125145703.913-1-jszhang@kernel.org/

> 
> > +		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
> > +			/*
> > +			 * Fill the buffer one byte at time until
> > +			 * the destination is word aligned.
> > +			 */
> > +			for (; count && dest.as_uptr & WORD_MASK; count--)
> > +				*dest.as_u8++ = c;
> > +		}
> > +
> > +		/* Copy using the largest size allowed */
> > +		for (; count >= BYTES_LONG; count -= BYTES_LONG)
> > +			*dest.as_ulong++ = cu;
> > +	}
> > +
> > +	/* copy the remainder */
> > +	while (count--)
> > +		*dest.as_u8++ = c;
> > +
> > +	return s;
> > +}
> > +EXPORT_SYMBOL(__memset);
> 
> BTW a similar approach could be used for memchr, e.g.:
> 
> #if __SIZEOF_LONG__ == 8
> #define HAS_ZERO(_x) (((_x) - 0x0101010101010101ULL) & ~(_x) & 0x8080808080808080ULL)
> #else
> #define HAS_ZERO(_x) (((_x) - 0x01010101UL) & ~(_x) & 0x80808080UL)
> #endif
> 
> void *
> memchr(const void *src_ptr, int c, size_t len)
> {
> 	union const_data src = { .as_bytes = src_ptr };
> 	unsigned char byte = (unsigned char) c;
> 	unsigned long mask = (unsigned long) c;
> 	size_t remaining = len;
> 
> 	/* Nothing to do */
> 	if (!src_ptr || !len)
> 		return NULL;
> 
> 	if (len < 2 * WORD_SIZE)
> 		goto trailing;
> 
> 	mask |= mask << 8;
> 	mask |= mask << 16;
> #if __SIZEOF_LONG__ == 8
> 	mask |= mask << 32;
> #endif
> 
> 	/* Search by byte up to the src's alignment boundary */
> 	for(; src.as_uptr & WORD_MASK; remaining--, src.as_bytes++) {
> 		if (*src.as_bytes == byte)
> 			return (void*) src.as_bytes;
> 	}
> 
> 	/* Search word by word using the mask */
> 	for(; remaining >= WORD_SIZE; remaining -= WORD_SIZE, src.as_ulong++) {
> 		unsigned long check = *src.as_ulong ^ mask;
> 		if(HAS_ZERO(check))
> 			break;
> 	}
> 
>  trailing:
> 	for(; remaining > 0; remaining--, src.as_bytes++) {
> 		if (*src.as_bytes == byte)
> 			return (void*) src.as_bytes;
> 	}
> 
> 	return NULL;
> }
> 
> Regards,
> Nick
David Laight Feb. 1, 2024, 11:04 p.m. UTC | #3
...
> > +		/* Compose an ulong with 'c' repeated 4/8 times */
> > +#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
> > +		cu *= 0x0101010101010101UL;

That is likely to generate a compile error on 32bit.
Maybe:
		cu *= (unsigned long)0x0101010101010101ULL;
> > +#else
> > +		cu |= cu << 8;
> > +		cu |= cu << 16;
> > +		/* Suppress warning on 32 bit machines */
> > +		cu |= (cu << 16) << 16;
> > +#endif
> 
> I guess you could check against __SIZEOF_LONG__ here.

Or even sizeof (cu), possibly as:
		cu |= cu << (sizeof (cu) == 8 ? 32 : 0);
which I'm pretty sure a modern compiler will throw away for 32bit.
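
That would make the whole composition #ifdef-free, e.g. (sketch only):

		cu |= cu << 8;
		cu |= cu << 16;
		cu |= cu << (sizeof (cu) == 8 ? 32 : 0);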

I do wonder whether CONFIG_ARCH_HAS_FAST_MULTIPLIER is worth
testing - you'd really want to know of a risc-v cpu whose multiply
is slower than the shift-and-or version, and I doubt there is one.
Multiply is used so often (all array indexing) that you
really do need something better than a '1 bit per clock' loop.

It is worth remembering that you can implement an n*n multiply
with n*n 'full adders' (3 input bits, 2 output bits) with a
latency of 2*n adders.
So the latency is only twice that of the corresponding add.
For a modern chip that is not much logic at all.

	David


Patch

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 17c3b40382e1..3d45d61ddab9 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -10,8 +10,8 @@ 
 #include <linux/linkage.h>
 
 #define __HAVE_ARCH_MEMSET
-extern asmlinkage void *memset(void *, int, size_t);
-extern asmlinkage void *__memset(void *, int, size_t);
+extern void *memset(void *s, int c, size_t count);
+extern void *__memset(void *s, int c, size_t count);
 
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *dest, const void *src, size_t count);
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 76849d0906ef..9a07cbf67095 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -9,8 +9,6 @@ 
 /*
  * Assembly functions that may be used (directly or indirectly) by modules
  */
-EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strncmp);
-EXPORT_SYMBOL(__memset);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 5fa88c5a601c..b20cadccff78 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,6 +1,5 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
-lib-y			+= memset.o
 lib-y			+= strcmp.o
 lib-y			+= strlen.o
 lib-y			+= string.o
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
deleted file mode 100644
index 35f358e70bdb..000000000000
--- a/arch/riscv/lib/memset.S
+++ /dev/null
@@ -1,113 +0,0 @@ 
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memset(void *, int, size_t) */
-SYM_FUNC_START(__memset)
-	move t0, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented fill for small sizes */
-	sltiu a3, a2, 16
-	bnez a3, 4f
-
-	/*
-	 * Round to nearest XLEN-aligned address
-	 * greater than or equal to start address
-	 */
-	addi a3, t0, SZREG-1
-	andi a3, a3, ~(SZREG-1)
-	beq a3, t0, 2f  /* Skip if already aligned */
-	/* Handle initial misalignment */
-	sub a4, a3, t0
-1:
-	sb a1, 0(t0)
-	addi t0, t0, 1
-	bltu t0, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2: /* Duff's device with 32 XLEN stores per iteration */
-	/* Broadcast value into all bytes */
-	andi a1, a1, 0xff
-	slli a3, a1, 8
-	or a1, a3, a1
-	slli a3, a1, 16
-	or a1, a3, a1
-#ifdef CONFIG_64BIT
-	slli a3, a1, 32
-	or a1, a3, a1
-#endif
-
-	/* Calculate end address */
-	andi a4, a2, ~(SZREG-1)
-	add a3, t0, a4
-
-	andi a4, a4, 31*SZREG  /* Calculate remainder */
-	beqz a4, 3f            /* Shortcut if no remainder */
-	neg a4, a4
-	addi a4, a4, 32*SZREG  /* Calculate initial offset */
-
-	/* Adjust start address with offset */
-	sub t0, t0, a4
-
-	/* Jump into loop body */
-	/* Assumes 32-bit instruction lengths */
-	la a5, 3f
-#ifdef CONFIG_64BIT
-	srli a4, a4, 1
-#endif
-	add a5, a5, a4
-	jr a5
-3:
-	REG_S a1,        0(t0)
-	REG_S a1,    SZREG(t0)
-	REG_S a1,  2*SZREG(t0)
-	REG_S a1,  3*SZREG(t0)
-	REG_S a1,  4*SZREG(t0)
-	REG_S a1,  5*SZREG(t0)
-	REG_S a1,  6*SZREG(t0)
-	REG_S a1,  7*SZREG(t0)
-	REG_S a1,  8*SZREG(t0)
-	REG_S a1,  9*SZREG(t0)
-	REG_S a1, 10*SZREG(t0)
-	REG_S a1, 11*SZREG(t0)
-	REG_S a1, 12*SZREG(t0)
-	REG_S a1, 13*SZREG(t0)
-	REG_S a1, 14*SZREG(t0)
-	REG_S a1, 15*SZREG(t0)
-	REG_S a1, 16*SZREG(t0)
-	REG_S a1, 17*SZREG(t0)
-	REG_S a1, 18*SZREG(t0)
-	REG_S a1, 19*SZREG(t0)
-	REG_S a1, 20*SZREG(t0)
-	REG_S a1, 21*SZREG(t0)
-	REG_S a1, 22*SZREG(t0)
-	REG_S a1, 23*SZREG(t0)
-	REG_S a1, 24*SZREG(t0)
-	REG_S a1, 25*SZREG(t0)
-	REG_S a1, 26*SZREG(t0)
-	REG_S a1, 27*SZREG(t0)
-	REG_S a1, 28*SZREG(t0)
-	REG_S a1, 29*SZREG(t0)
-	REG_S a1, 30*SZREG(t0)
-	REG_S a1, 31*SZREG(t0)
-	addi t0, t0, 32*SZREG
-	bltu t0, a3, 3b
-	andi a2, a2, SZREG-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, t0, a2
-5:
-	sb a1, 0(t0)
-	addi t0, t0, 1
-	bltu t0, a3, 5b
-6:
-	ret
-SYM_FUNC_END(__memset)
-SYM_FUNC_ALIAS_WEAK(memset, __memset)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
index 20677c8067da..022edda68f1c 100644
--- a/arch/riscv/lib/string.c
+++ b/arch/riscv/lib/string.c
@@ -144,3 +144,44 @@  void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmov
 EXPORT_SYMBOL(memmove);
 void *__pi_memmove(void *dest, const void *src, size_t count) __alias(__memmove);
 void *__pi___memmove(void *dest, const void *src, size_t count) __alias(__memmove);
+
+void *__memset(void *s, int c, size_t count)
+{
+	union types dest = { .as_u8 = s };
+
+	if (count >= MIN_THRESHOLD) {
+		unsigned long cu = (unsigned long)c;
+
+		/* Compose an ulong with 'c' repeated 4/8 times */
+#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
+		cu *= 0x0101010101010101UL;
+#else
+		cu |= cu << 8;
+		cu |= cu << 16;
+		/* Suppress warning on 32 bit machines */
+		cu |= (cu << 16) << 16;
+#endif
+		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+			/*
+			 * Fill the buffer one byte at time until
+			 * the destination is word aligned.
+			 */
+			for (; count && dest.as_uptr & WORD_MASK; count--)
+				*dest.as_u8++ = c;
+		}
+
+		/* Copy using the largest size allowed */
+		for (; count >= BYTES_LONG; count -= BYTES_LONG)
+			*dest.as_ulong++ = cu;
+	}
+
+	/* copy the remainder */
+	while (count--)
+		*dest.as_u8++ = c;
+
+	return s;
+}
+EXPORT_SYMBOL(__memset);
+
+void *memset(void *s, int c, size_t count) __weak __alias(__memset);
+EXPORT_SYMBOL(memset);
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index 8b940ff04895..a31513346951 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -1,7 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0
 OBJECT_FILES_NON_STANDARD := y
 
-purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memset.o
+purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o
 purgatory-y += strcmp.o strlen.o strncmp.o riscv_string.o
 
 targets += $(purgatory-y)
@@ -16,9 +16,6 @@  $(obj)/string.o: $(srctree)/lib/string.c FORCE
 $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE
 	$(call if_changed_rule,cc_o_c)
 
-$(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
-	$(call if_changed_rule,as_o_S)
-
 $(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
 	$(call if_changed_rule,as_o_S)