diff mbox series

[v4,4/5] RISC-V: add infrastructure to allow different str* implementations

Message ID 20230109181755.2383085-5-heiko@sntech.de (mailing list archive)
State Superseded
Delegated to: Palmer Dabbelt
Headers show
Series Zbb string optimizations and call support in alternatives | expand

Commit Message

Heiko Stuebner Jan. 9, 2023, 6:17 p.m. UTC
From: Heiko Stuebner <heiko.stuebner@vrull.eu>

Depending on supported extensions on specific RISC-V cores,
optimized str* functions might make sense.

This adds basic infrastructure to allow patching the function calls
via alternatives later on.

The Linux kernel provides standard implementations for string functions
but when architectures want to extend them, they need to provide their
own.

The added generic string functions are done in assembler (taken from
disassembling the main-kernel functions for now) to allow us to control
the used registers and extend them with optimized variants.

Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
---
 arch/riscv/include/asm/string.h | 10 +++++++++
 arch/riscv/kernel/riscv_ksyms.c |  3 +++
 arch/riscv/lib/Makefile         |  3 +++
 arch/riscv/lib/strcmp.S         | 37 ++++++++++++++++++++++++++++++
 arch/riscv/lib/strlen.S         | 28 +++++++++++++++++++++++
 arch/riscv/lib/strncmp.S        | 40 +++++++++++++++++++++++++++++++++
 arch/riscv/purgatory/Makefile   | 13 +++++++++++
 7 files changed, 134 insertions(+)
 create mode 100644 arch/riscv/lib/strcmp.S
 create mode 100644 arch/riscv/lib/strlen.S
 create mode 100644 arch/riscv/lib/strncmp.S

Comments

Conor Dooley Jan. 9, 2023, 10:37 p.m. UTC | #1
On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Depending on supported extensions on specific RISC-V cores,
> optimized str* functions might make sense.
> 
> This adds basic infrastructure to allow patching the function calls
> via alternatives later on.
> 
> The Linux kernel provides standard implementations for string functions
> but when architectures want to extend them, they need to provide their
> own.
> 
> The added generic string functions are done in assembler (taken from
> disassembling the main-kernel functions for now) to allow us to control
> the used registers and extend them with optimized variants.
> 
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>

asm looks the same as what I previously provided a review for, but I
assume you dropped cos of the EFI stub stuff? You can have it back I
suppose..
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>

> ---
>  arch/riscv/include/asm/string.h | 10 +++++++++
>  arch/riscv/kernel/riscv_ksyms.c |  3 +++
>  arch/riscv/lib/Makefile         |  3 +++
>  arch/riscv/lib/strcmp.S         | 37 ++++++++++++++++++++++++++++++
>  arch/riscv/lib/strlen.S         | 28 +++++++++++++++++++++++
>  arch/riscv/lib/strncmp.S        | 40 +++++++++++++++++++++++++++++++++
>  arch/riscv/purgatory/Makefile   | 13 +++++++++++
>  7 files changed, 134 insertions(+)
>  create mode 100644 arch/riscv/lib/strcmp.S
>  create mode 100644 arch/riscv/lib/strlen.S
>  create mode 100644 arch/riscv/lib/strncmp.S
> 
> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> index 909049366555..a96b1fea24fe 100644
> --- a/arch/riscv/include/asm/string.h
> +++ b/arch/riscv/include/asm/string.h
> @@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>  #define __HAVE_ARCH_MEMMOVE
>  extern asmlinkage void *memmove(void *, const void *, size_t);
>  extern asmlinkage void *__memmove(void *, const void *, size_t);
> +
> +#define __HAVE_ARCH_STRCMP
> +extern asmlinkage int strcmp(const char *cs, const char *ct);
> +
> +#define __HAVE_ARCH_STRLEN
> +extern asmlinkage __kernel_size_t strlen(const char *);
> +
> +#define __HAVE_ARCH_STRNCMP
> +extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
> +
>  /* For those files which don't want to check by kasan. */
>  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>  #define memcpy(dst, src, len) __memcpy(dst, src, len)
> diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
> index 5ab1c7e1a6ed..a72879b4249a 100644
> --- a/arch/riscv/kernel/riscv_ksyms.c
> +++ b/arch/riscv/kernel/riscv_ksyms.c
> @@ -12,6 +12,9 @@
>  EXPORT_SYMBOL(memset);
>  EXPORT_SYMBOL(memcpy);
>  EXPORT_SYMBOL(memmove);
> +EXPORT_SYMBOL(strcmp);
> +EXPORT_SYMBOL(strlen);
> +EXPORT_SYMBOL(strncmp);
>  EXPORT_SYMBOL(__memset);
>  EXPORT_SYMBOL(__memcpy);
>  EXPORT_SYMBOL(__memmove);
> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> index 25d5c9664e57..6c74b0bedd60 100644
> --- a/arch/riscv/lib/Makefile
> +++ b/arch/riscv/lib/Makefile
> @@ -3,6 +3,9 @@ lib-y			+= delay.o
>  lib-y			+= memcpy.o
>  lib-y			+= memset.o
>  lib-y			+= memmove.o
> +lib-y			+= strcmp.o
> +lib-y			+= strlen.o
> +lib-y			+= strncmp.o
>  lib-$(CONFIG_MMU)	+= uaccess.o
>  lib-$(CONFIG_64BIT)	+= tishift.o
>  
> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> new file mode 100644
> index 000000000000..94440fb8390c
> --- /dev/null
> +++ b/arch/riscv/lib/strcmp.S
> @@ -0,0 +1,37 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strcmp(const char *cs, const char *ct) */
> +SYM_FUNC_START(strcmp)
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, value like strcmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2
> +	 */
> +	mv	t2, a1
> +1:
> +	lbu	t1, 0(a0)
> +	lbu	t0, 0(a1)
> +	addi	a0, a0, 1
> +	addi	a1, a1, 1
> +	beq	t1, t0, 3f
> +	li	a0, 1
> +	bgeu	t1, t0, 2f
> +	li	a0, -1
> +2:
> +	mv	a1, t2
> +	ret
> +3:
> +	bnez	t1, 1b
> +	li	a0, 0
> +	j	2b
> +SYM_FUNC_END(strcmp)
> diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> new file mode 100644
> index 000000000000..09a7aaff26c8
> --- /dev/null
> +++ b/arch/riscv/lib/strlen.S
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strlen(const char *s) */
> +SYM_FUNC_START(strlen)
> +	/*
> +	 * Returns
> +	 *   a0 - string length
> +	 *
> +	 * Parameters
> +	 *   a0 - String to measure
> +	 *
> +	 * Clobbers:
> +	 *   t0, t1
> +	 */
> +	mv	t1, a0
> +1:
> +	lbu	t0, 0(t1)
> +	bnez	t0, 2f
> +	sub	a0, t1, a0
> +	ret
> +2:
> +	addi	t1, t1, 1
> +	j	1b
> +SYM_FUNC_END(strlen)
> diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> new file mode 100644
> index 000000000000..493ab6febcb2
> --- /dev/null
> +++ b/arch/riscv/lib/strncmp.S
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strncmp(const char *cs, const char *ct, size_t count) */
> +SYM_FUNC_START(strncmp)
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, value like strncmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *   a2 - number of characters to compare
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2
> +	 */
> +	li	t0, 0
> +1:
> +	beq	a2, t0, 4f
> +	add	t1, a0, t0
> +	add	t2, a1, t0
> +	lbu	t1, 0(t1)
> +	lbu	t2, 0(t2)
> +	beq	t1, t2, 3f
> +	li	a0, 1
> +	bgeu	t1, t2, 2f
> +	li	a0, -1
> +2:
> +	ret
> +3:
> +	addi	t0, t0, 1
> +	bnez	t1, 1b
> +4:
> +	li	a0, 0
> +	j	2b
> +SYM_FUNC_END(strncmp)
> diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
> index dd58e1d99397..d16bf715a586 100644
> --- a/arch/riscv/purgatory/Makefile
> +++ b/arch/riscv/purgatory/Makefile
> @@ -2,6 +2,7 @@
>  OBJECT_FILES_NON_STANDARD := y
>  
>  purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
> +purgatory-y += strcmp.o strlen.o strncmp.o
>  
>  targets += $(purgatory-y)
>  PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
> @@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
>  $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
>  	$(call if_changed_rule,as_o_S)
>  
> +$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
> +$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
> +$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
>  $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
>  	$(call if_changed_rule,cc_o_c)
>  
> @@ -77,6 +87,9 @@ CFLAGS_ctype.o			+= $(PURGATORY_CFLAGS)
>  AFLAGS_REMOVE_entry.o		+= -Wa,-gdwarf-2
>  AFLAGS_REMOVE_memcpy.o		+= -Wa,-gdwarf-2
>  AFLAGS_REMOVE_memset.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strcmp.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strlen.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strncmp.o		+= -Wa,-gdwarf-2
>  
>  $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
>  		$(call if_changed,ld)
> -- 
> 2.35.1
>
Heiko Stuebner Jan. 9, 2023, 11:31 p.m. UTC | #2
Am Montag, 9. Januar 2023, 23:37:45 CET schrieb Conor Dooley:
> On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > 
> > Depending on supported extensions on specific RISC-V cores,
> > optimized str* functions might make sense.
> > 
> > This adds basic infrastructure to allow patching the function calls
> > via alternatives later on.
> > 
> > The Linux kernel provides standard implementations for string functions
> > but when architectures want to extend them, they need to provide their
> > own.
> > 
> > The added generic string functions are done in assembler (taken from
> > disassembling the main-kernel functions for now) to allow us to control
> > the used registers and extend them with optimized variants.
> > 
> > Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> asm looks the same as what I previously provided a review for, but I
> assume you dropped cos of the EFI stub stuff? You can have it back I
> suppose..
> Reviewed-by: Conor Dooley <conor.dooley@microchip.com>

That was the

| Technically patch4 got a review from Andrew, but that was still with
| the inline approach, so I didn't bring it over to v4.

part in the cover-letter, thanks for bringing the rb back :-)


Heiko

> > ---
> >  arch/riscv/include/asm/string.h | 10 +++++++++
> >  arch/riscv/kernel/riscv_ksyms.c |  3 +++
> >  arch/riscv/lib/Makefile         |  3 +++
> >  arch/riscv/lib/strcmp.S         | 37 ++++++++++++++++++++++++++++++
> >  arch/riscv/lib/strlen.S         | 28 +++++++++++++++++++++++
> >  arch/riscv/lib/strncmp.S        | 40 +++++++++++++++++++++++++++++++++
> >  arch/riscv/purgatory/Makefile   | 13 +++++++++++
> >  7 files changed, 134 insertions(+)
> >  create mode 100644 arch/riscv/lib/strcmp.S
> >  create mode 100644 arch/riscv/lib/strlen.S
> >  create mode 100644 arch/riscv/lib/strncmp.S
> > 
> > diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> > index 909049366555..a96b1fea24fe 100644
> > --- a/arch/riscv/include/asm/string.h
> > +++ b/arch/riscv/include/asm/string.h
> > @@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
> >  #define __HAVE_ARCH_MEMMOVE
> >  extern asmlinkage void *memmove(void *, const void *, size_t);
> >  extern asmlinkage void *__memmove(void *, const void *, size_t);
> > +
> > +#define __HAVE_ARCH_STRCMP
> > +extern asmlinkage int strcmp(const char *cs, const char *ct);
> > +
> > +#define __HAVE_ARCH_STRLEN
> > +extern asmlinkage __kernel_size_t strlen(const char *);
> > +
> > +#define __HAVE_ARCH_STRNCMP
> > +extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
> > +
> >  /* For those files which don't want to check by kasan. */
> >  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
> >  #define memcpy(dst, src, len) __memcpy(dst, src, len)
> > diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
> > index 5ab1c7e1a6ed..a72879b4249a 100644
> > --- a/arch/riscv/kernel/riscv_ksyms.c
> > +++ b/arch/riscv/kernel/riscv_ksyms.c
> > @@ -12,6 +12,9 @@
> >  EXPORT_SYMBOL(memset);
> >  EXPORT_SYMBOL(memcpy);
> >  EXPORT_SYMBOL(memmove);
> > +EXPORT_SYMBOL(strcmp);
> > +EXPORT_SYMBOL(strlen);
> > +EXPORT_SYMBOL(strncmp);
> >  EXPORT_SYMBOL(__memset);
> >  EXPORT_SYMBOL(__memcpy);
> >  EXPORT_SYMBOL(__memmove);
> > diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> > index 25d5c9664e57..6c74b0bedd60 100644
> > --- a/arch/riscv/lib/Makefile
> > +++ b/arch/riscv/lib/Makefile
> > @@ -3,6 +3,9 @@ lib-y			+= delay.o
> >  lib-y			+= memcpy.o
> >  lib-y			+= memset.o
> >  lib-y			+= memmove.o
> > +lib-y			+= strcmp.o
> > +lib-y			+= strlen.o
> > +lib-y			+= strncmp.o
> >  lib-$(CONFIG_MMU)	+= uaccess.o
> >  lib-$(CONFIG_64BIT)	+= tishift.o
> >  
> > diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> > new file mode 100644
> > index 000000000000..94440fb8390c
> > --- /dev/null
> > +++ b/arch/riscv/lib/strcmp.S
> > @@ -0,0 +1,37 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strcmp(const char *cs, const char *ct) */
> > +SYM_FUNC_START(strcmp)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - comparison result, value like strcmp
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - string1
> > +	 *   a1 - string2
> > +	 *
> > +	 * Clobbers
> > +	 *   t0, t1, t2
> > +	 */
> > +	mv	t2, a1
> > +1:
> > +	lbu	t1, 0(a0)
> > +	lbu	t0, 0(a1)
> > +	addi	a0, a0, 1
> > +	addi	a1, a1, 1
> > +	beq	t1, t0, 3f
> > +	li	a0, 1
> > +	bgeu	t1, t0, 2f
> > +	li	a0, -1
> > +2:
> > +	mv	a1, t2
> > +	ret
> > +3:
> > +	bnez	t1, 1b
> > +	li	a0, 0
> > +	j	2b
> > +SYM_FUNC_END(strcmp)
> > diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> > new file mode 100644
> > index 000000000000..09a7aaff26c8
> > --- /dev/null
> > +++ b/arch/riscv/lib/strlen.S
> > @@ -0,0 +1,28 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strlen(const char *s) */
> > +SYM_FUNC_START(strlen)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - string length
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - String to measure
> > +	 *
> > +	 * Clobbers:
> > +	 *   t0, t1
> > +	 */
> > +	mv	t1, a0
> > +1:
> > +	lbu	t0, 0(t1)
> > +	bnez	t0, 2f
> > +	sub	a0, t1, a0
> > +	ret
> > +2:
> > +	addi	t1, t1, 1
> > +	j	1b
> > +SYM_FUNC_END(strlen)
> > diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> > new file mode 100644
> > index 000000000000..493ab6febcb2
> > --- /dev/null
> > +++ b/arch/riscv/lib/strncmp.S
> > @@ -0,0 +1,40 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strncmp(const char *cs, const char *ct, size_t count) */
> > +SYM_FUNC_START(strncmp)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - comparison result, value like strncmp
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - string1
> > +	 *   a1 - string2
> > +	 *   a2 - number of characters to compare
> > +	 *
> > +	 * Clobbers
> > +	 *   t0, t1, t2
> > +	 */
> > +	li	t0, 0
> > +1:
> > +	beq	a2, t0, 4f
> > +	add	t1, a0, t0
> > +	add	t2, a1, t0
> > +	lbu	t1, 0(t1)
> > +	lbu	t2, 0(t2)
> > +	beq	t1, t2, 3f
> > +	li	a0, 1
> > +	bgeu	t1, t2, 2f
> > +	li	a0, -1
> > +2:
> > +	ret
> > +3:
> > +	addi	t0, t0, 1
> > +	bnez	t1, 1b
> > +4:
> > +	li	a0, 0
> > +	j	2b
> > +SYM_FUNC_END(strncmp)
> > diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
> > index dd58e1d99397..d16bf715a586 100644
> > --- a/arch/riscv/purgatory/Makefile
> > +++ b/arch/riscv/purgatory/Makefile
> > @@ -2,6 +2,7 @@
> >  OBJECT_FILES_NON_STANDARD := y
> >  
> >  purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
> > +purgatory-y += strcmp.o strlen.o strncmp.o
> >  
> >  targets += $(purgatory-y)
> >  PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
> > @@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
> >  $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
> >  	$(call if_changed_rule,as_o_S)
> >  
> > +$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> > +$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> > +$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> >  $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
> >  	$(call if_changed_rule,cc_o_c)
> >  
> > @@ -77,6 +87,9 @@ CFLAGS_ctype.o			+= $(PURGATORY_CFLAGS)
> >  AFLAGS_REMOVE_entry.o		+= -Wa,-gdwarf-2
> >  AFLAGS_REMOVE_memcpy.o		+= -Wa,-gdwarf-2
> >  AFLAGS_REMOVE_memset.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strcmp.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strlen.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strncmp.o		+= -Wa,-gdwarf-2
> >  
> >  $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
> >  		$(call if_changed,ld)
>
Andrew Jones Jan. 10, 2023, 9:39 a.m. UTC | #3
On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Depending on supported extensions on specific RISC-V cores,
> optimized str* functions might make sense.
> 
> This adds basic infrastructure to allow patching the function calls
> via alternatives later on.
> 
> The Linux kernel provides standard implementations for string functions
> but when architectures want to extend them, they need to provide their
> own.

And the compiler provides builtins. In the previous series it appeared
to be a bad idea to compile the kernel with the compiler's builtins
disabled. How will the optimized string functions which will be based
on this patch be selected?

Thanks,
drew
Heiko Stuebner Jan. 10, 2023, 10:46 a.m. UTC | #4
Hi Andrew,

Am Dienstag, 10. Januar 2023, 10:39:36 CET schrieb Andrew Jones:
> On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > 
> > Depending on supported extensions on specific RISC-V cores,
> > optimized str* functions might make sense.
> > 
> > This adds basic infrastructure to allow patching the function calls
> > via alternatives later on.
> > 
> > The Linux kernel provides standard implementations for string functions
> > but when architectures want to extend them, they need to provide their
> > own.
> 
> And the compiler provides builtins. In the previous series it appeared
> to be a bad idea to compile the kernel with the compiler's builtins
> disabled. How will the optimized string functions which will be based
> on this patch be selected?

yep, the consensus seemingly was that the compiler knows best when
to use builtins for some cases (which is probably correct), hence the move
away from the inline bases.

So I guess the first decision is the compiler's wether to use a builtin or
the kernel string function (same as for mem*) .

In my tests, I did see both getting used - so it's definitly not lost work :-) .

After that when landing in these here, we want to select the best variant
for the host-system the kernel runs on.

I.e. this one as baseline or for example using zbb as an "alternative".

As for the "more" variants, I currently have more patches on top, that
then use an ALTERNATIVE_2

ALTERNATIVE_2("nop",
      "j variant_zbb_unaligned", 0, CPUFEATURE_ZBB | CPUFEATURE_FAST_UNALIGNED, 0, CONFIG_RISCV_ISA_ZBB,
      "j variant_zbb", 0, CPUFEATURE_ZBB, CPUFEATURE_FAST_UNALIGNED, CONFIG_RISCV_ISA_ZBB)

with the "errata_id" being used as a bitfield to use extension combinations.
And a "not"-field, so I can do a has-zbb + has-not-fast-unaligned


Heiko
Andrew Jones Jan. 10, 2023, 11:16 a.m. UTC | #5
On Tue, Jan 10, 2023 at 11:46:40AM +0100, Heiko Stübner wrote:
> Hi Andrew,
> 
> Am Dienstag, 10. Januar 2023, 10:39:36 CET schrieb Andrew Jones:
> > On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > > 
> > > Depending on supported extensions on specific RISC-V cores,
> > > optimized str* functions might make sense.
> > > 
> > > This adds basic infrastructure to allow patching the function calls
> > > via alternatives later on.
> > > 
> > > The Linux kernel provides standard implementations for string functions
> > > but when architectures want to extend them, they need to provide their
> > > own.
> > 
> > And the compiler provides builtins. In the previous series it appeared
> > to be a bad idea to compile the kernel with the compiler's builtins
> > disabled. How will the optimized string functions which will be based
> > on this patch be selected?
> 
> yep, the consensus seemingly was that the compiler knows best when
> to use builtins for some cases (which is probably correct), hence the move
> away from the inline bases.
> 
> So I guess the first decision is the compiler's wether to use a builtin or
> the kernel string function (same as for mem*) .
> 
> In my tests, I did see both getting used - so it's definitly not lost work :-) .
> 
> After that when landing in these here, we want to select the best variant
> for the host-system the kernel runs on.
> 
> I.e. this one as baseline or for example using zbb as an "alternative".
> 
> As for the "more" variants, I currently have more patches on top, that
> then use an ALTERNATIVE_2
> 
> ALTERNATIVE_2("nop",
>       "j variant_zbb_unaligned", 0, CPUFEATURE_ZBB | CPUFEATURE_FAST_UNALIGNED, 0, CONFIG_RISCV_ISA_ZBB,
>       "j variant_zbb", 0, CPUFEATURE_ZBB, CPUFEATURE_FAST_UNALIGNED, CONFIG_RISCV_ISA_ZBB)
> 
> with the "errata_id" being used as a bitfield to use extension combinations.
> And a "not"-field, so I can do a has-zbb + has-not-fast-unaligned

Thanks, Heiko. This was the info I needed. It might be nice to put some of
it in the commit message too.

drew
Andrew Jones Jan. 10, 2023, 12:13 p.m. UTC | #6
On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Depending on supported extensions on specific RISC-V cores,
> optimized str* functions might make sense.
> 
> This adds basic infrastructure to allow patching the function calls
> via alternatives later on.
> 
> The Linux kernel provides standard implementations for string functions
> but when architectures want to extend them, they need to provide their
> own.
> 
> The added generic string functions are done in assembler (taken from
> disassembling the main-kernel functions for now) to allow us to control
> the used registers and extend them with optimized variants.
> 
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
> ---
>  arch/riscv/include/asm/string.h | 10 +++++++++
>  arch/riscv/kernel/riscv_ksyms.c |  3 +++
>  arch/riscv/lib/Makefile         |  3 +++
>  arch/riscv/lib/strcmp.S         | 37 ++++++++++++++++++++++++++++++
>  arch/riscv/lib/strlen.S         | 28 +++++++++++++++++++++++
>  arch/riscv/lib/strncmp.S        | 40 +++++++++++++++++++++++++++++++++
>  arch/riscv/purgatory/Makefile   | 13 +++++++++++
>  7 files changed, 134 insertions(+)
>  create mode 100644 arch/riscv/lib/strcmp.S
>  create mode 100644 arch/riscv/lib/strlen.S
>  create mode 100644 arch/riscv/lib/strncmp.S
> 
> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> index 909049366555..a96b1fea24fe 100644
> --- a/arch/riscv/include/asm/string.h
> +++ b/arch/riscv/include/asm/string.h
> @@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>  #define __HAVE_ARCH_MEMMOVE
>  extern asmlinkage void *memmove(void *, const void *, size_t);
>  extern asmlinkage void *__memmove(void *, const void *, size_t);
> +
> +#define __HAVE_ARCH_STRCMP
> +extern asmlinkage int strcmp(const char *cs, const char *ct);
> +
> +#define __HAVE_ARCH_STRLEN
> +extern asmlinkage __kernel_size_t strlen(const char *);
> +
> +#define __HAVE_ARCH_STRNCMP
> +extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
> +
>  /* For those files which don't want to check by kasan. */
>  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>  #define memcpy(dst, src, len) __memcpy(dst, src, len)
> diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
> index 5ab1c7e1a6ed..a72879b4249a 100644
> --- a/arch/riscv/kernel/riscv_ksyms.c
> +++ b/arch/riscv/kernel/riscv_ksyms.c
> @@ -12,6 +12,9 @@
>  EXPORT_SYMBOL(memset);
>  EXPORT_SYMBOL(memcpy);
>  EXPORT_SYMBOL(memmove);
> +EXPORT_SYMBOL(strcmp);
> +EXPORT_SYMBOL(strlen);
> +EXPORT_SYMBOL(strncmp);
>  EXPORT_SYMBOL(__memset);
>  EXPORT_SYMBOL(__memcpy);
>  EXPORT_SYMBOL(__memmove);
> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> index 25d5c9664e57..6c74b0bedd60 100644
> --- a/arch/riscv/lib/Makefile
> +++ b/arch/riscv/lib/Makefile
> @@ -3,6 +3,9 @@ lib-y			+= delay.o
>  lib-y			+= memcpy.o
>  lib-y			+= memset.o
>  lib-y			+= memmove.o
> +lib-y			+= strcmp.o
> +lib-y			+= strlen.o
> +lib-y			+= strncmp.o
>  lib-$(CONFIG_MMU)	+= uaccess.o
>  lib-$(CONFIG_64BIT)	+= tishift.o
>  
> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> new file mode 100644
> index 000000000000..94440fb8390c
> --- /dev/null
> +++ b/arch/riscv/lib/strcmp.S
> @@ -0,0 +1,37 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strcmp(const char *cs, const char *ct) */
> +SYM_FUNC_START(strcmp)
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, value like strcmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2
> +	 */
> +	mv	t2, a1

The above instruction and the 'mv a1, t2' below appear to be attempting
to preserve a1, but that shouldn't be necessary.

> +1:
> +	lbu	t1, 0(a0)
> +	lbu	t0, 0(a1)

I'd rather have t0 be 0(a0) and t1 be 0(a1)

> +	addi	a0, a0, 1
> +	addi	a1, a1, 1
> +	beq	t1, t0, 3f
> +	li	a0, 1
> +	bgeu	t1, t0, 2f
> +	li	a0, -1
> +2:
> +	mv	a1, t2
> +	ret
> +3:
> +	bnez	t1, 1b
> +	li	a0, 0
> +	j	2b

For fun I removed one conditional and one unconditional branch (untested)

1:
     lbu     t0, 0(a0)
     lbu     t1, 0(a1)
     addi    a0, a0, 1
     addi    a1, a1, 1
     bne     t0, t1, 2f
     bnez    t0, 1b
     li      a0, 0
     ret
2:
     slt     a1, t1, t0
     slli    a1, a1, 1
     li      a0, -1
     add     a0, a0, a1
     ret


> +SYM_FUNC_END(strcmp)
> diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> new file mode 100644
> index 000000000000..09a7aaff26c8
> --- /dev/null
> +++ b/arch/riscv/lib/strlen.S
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strlen(const char *s) */
> +SYM_FUNC_START(strlen)
> +	/*
> +	 * Returns
> +	 *   a0 - string length
> +	 *
> +	 * Parameters
> +	 *   a0 - String to measure
> +	 *
> +	 * Clobbers:
> +	 *   t0, t1
> +	 */
> +	mv	t1, a0
> +1:
> +	lbu	t0, 0(t1)
> +	bnez	t0, 2f
> +	sub	a0, t1, a0
> +	ret
> +2:
> +	addi	t1, t1, 1
> +	j	1b

Slightly reorganizing looks better (to me)

   mv    t1, a0
1:
   lbu   t0, 0(t1)
   beqz  t0, 2f
   addi  t1, t1, 1
   j     1b
2:
   sub a0, t1, a0
   ret

> +SYM_FUNC_END(strlen)
> diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> new file mode 100644
> index 000000000000..493ab6febcb2
> --- /dev/null
> +++ b/arch/riscv/lib/strncmp.S
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm-generic/export.h>
> +
> +/* int strncmp(const char *cs, const char *ct, size_t count) */
> +SYM_FUNC_START(strncmp)
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, value like strncmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *   a2 - number of characters to compare
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2
> +	 */
> +	li	t0, 0
> +1:
> +	beq	a2, t0, 4f
> +	add	t1, a0, t0
> +	add	t2, a1, t0
> +	lbu	t1, 0(t1)
> +	lbu	t2, 0(t2)
> +	beq	t1, t2, 3f
> +	li	a0, 1
> +	bgeu	t1, t2, 2f
> +	li	a0, -1
> +2:
> +	ret
> +3:
> +	addi	t0, t0, 1
> +	bnez	t1, 1b
> +4:
> +	li	a0, 0
> +	j	2b

(untested)

     li      t2, 0
1:
     beq     a2, t2, 2f
     lbu     t0, 0(a0)
     lbu     t1, 0(a1)
     addi    a0, a0, 1
     addi    a1, a1, 1
     bne     t0, t1, 3f
     addi    t2, t2, 1
     bnez    t0, 1b
2:
     li      a0, 0
     ret
3:
     slt     a1, t1, t0
     slli    a1, a1, 1
     li      a0, -1
     add     a0, a0, a1
     ret

> +SYM_FUNC_END(strncmp)
> diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
> index dd58e1d99397..d16bf715a586 100644
> --- a/arch/riscv/purgatory/Makefile
> +++ b/arch/riscv/purgatory/Makefile
> @@ -2,6 +2,7 @@
>  OBJECT_FILES_NON_STANDARD := y
>  
>  purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
> +purgatory-y += strcmp.o strlen.o strncmp.o
>  
>  targets += $(purgatory-y)
>  PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
> @@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
>  $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
>  	$(call if_changed_rule,as_o_S)
>  
> +$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
> +$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
> +$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
> +	$(call if_changed_rule,as_o_S)
> +
>  $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
>  	$(call if_changed_rule,cc_o_c)
>  
> @@ -77,6 +87,9 @@ CFLAGS_ctype.o			+= $(PURGATORY_CFLAGS)
>  AFLAGS_REMOVE_entry.o		+= -Wa,-gdwarf-2
>  AFLAGS_REMOVE_memcpy.o		+= -Wa,-gdwarf-2
>  AFLAGS_REMOVE_memset.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strcmp.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strlen.o		+= -Wa,-gdwarf-2
> +AFLAGS_REMOVE_strncmp.o		+= -Wa,-gdwarf-2
>  
>  $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
>  		$(call if_changed,ld)
> -- 
> 2.35.1
>

With at least the removal of the unnecessary preserving of a1 in strcmp,
then it looks correct to me, so

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>

But I think there's room for making it more readable, and maybe even
optimized, as I've tried to do.

Thanks,
drew
Andrew Jones Jan. 11, 2023, 12:30 p.m. UTC | #7
On Tue, Jan 10, 2023 at 01:13:20PM +0100, Andrew Jones wrote:
> On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
...
> > +
> > +/* int strcmp(const char *cs, const char *ct) */
> > +SYM_FUNC_START(strcmp)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - comparison result, value like strcmp
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - string1
> > +	 *   a1 - string2
> > +	 *
> > +	 * Clobbers
> > +	 *   t0, t1, t2
> > +	 */
> > +	mv	t2, a1
> 
> The above instruction and the 'mv a1, t2' below appear to be attempting
> to preserve a1, but that shouldn't be necessary.
> 
> > +1:
> > +	lbu	t1, 0(a0)
> > +	lbu	t0, 0(a1)
> 
> I'd rather have t0 be 0(a0) and t1 be 0(a1)
> 
> > +	addi	a0, a0, 1
> > +	addi	a1, a1, 1
> > +	beq	t1, t0, 3f
> > +	li	a0, 1
> > +	bgeu	t1, t0, 2f
> > +	li	a0, -1
> > +2:
> > +	mv	a1, t2
> > +	ret
> > +3:
> > +	bnez	t1, 1b
> > +	li	a0, 0
> > +	j	2b
> 
> For fun I removed one conditional and one unconditional branch (untested)
> 
> 1:
>      lbu     t0, 0(a0)
>      lbu     t1, 0(a1)
>      addi    a0, a0, 1
>      addi    a1, a1, 1
>      bne     t0, t1, 2f
>      bnez    t0, 1b
>      li      a0, 0
>      ret
> 2:
>      slt     a1, t1, t0
>      slli    a1, a1, 1
>      li      a0, -1
>      add     a0, a0, a1

If we just need < 0 and > 0 then the above four instructions can become

       sub     a0, t0, t1

>      ret
>

Similar comment for strncmp.

Thanks,
drew
Andrew Jones Jan. 11, 2023, 12:34 p.m. UTC | #8
On Tue, Jan 10, 2023 at 12:16:47PM +0100, Andrew Jones wrote:
> On Tue, Jan 10, 2023 at 11:46:40AM +0100, Heiko Stübner wrote:
> > Hi Andrew,
> > 
> > Am Dienstag, 10. Januar 2023, 10:39:36 CET schrieb Andrew Jones:
> > > On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > > > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > > > 
> > > > Depending on supported extensions on specific RISC-V cores,
> > > > optimized str* functions might make sense.
> > > > 
> > > > This adds basic infrastructure to allow patching the function calls
> > > > via alternatives later on.
> > > > 
> > > > The Linux kernel provides standard implementations for string functions
> > > > but when architectures want to extend them, they need to provide their
> > > > own.
> > > 
> > > And the compiler provides builtins. In the previous series it appeared
> > > to be a bad idea to compile the kernel with the compiler's builtins
> > > disabled. How will the optimized string functions which will be based
> > > on this patch be selected?
> > 
> > yep, the consensus seemingly was that the compiler knows best when
> > to use builtins for some cases (which is probably correct), hence the move
> > away from the inline bases.
> > 
> > So I guess the first decision is the compiler's wether to use a builtin or
> > the kernel string function (same as for mem*) .

Hi Heiko,

Thinking about this some more, I'd still like to understand how/when the
compiler makes this choice. Are compiler flags involved? Or do some
heuristics dictate the choice?

Thanks,
drew
Philipp Tomsich Jan. 11, 2023, 1:42 p.m. UTC | #9
Drew,

the reason why alignment is critical for correctness of optimisations
in userspace applications is that exception semantics must be retained
when crossing page boundaries.

Philipp.

On Wed, 11 Jan 2023 at 14:39, Christoph Müllner
<christoph.muellner@vrull.eu> wrote:
>
>
>
> On Wed, Jan 11, 2023 at 1:34 PM Andrew Jones <ajones@ventanamicro.com> wrote:
>>
>> On Tue, Jan 10, 2023 at 12:16:47PM +0100, Andrew Jones wrote:
>> > On Tue, Jan 10, 2023 at 11:46:40AM +0100, Heiko Stübner wrote:
>> > > Hi Andrew,
>> > >
>> > > Am Dienstag, 10. Januar 2023, 10:39:36 CET schrieb Andrew Jones:
>> > > > On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
>> > > > > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
>> > > > >
>> > > > > Depending on supported extensions on specific RISC-V cores,
>> > > > > optimized str* functions might make sense.
>> > > > >
>> > > > > This adds basic infrastructure to allow patching the function calls
>> > > > > via alternatives later on.
>> > > > >
>> > > > > The Linux kernel provides standard implementations for string functions
>> > > > > but when architectures want to extend them, they need to provide their
>> > > > > own.
>> > > >
>> > > > And the compiler provides builtins. In the previous series it appeared
>> > > > to be a bad idea to compile the kernel with the compiler's builtins
>> > > > disabled. How will the optimized string functions which will be based
>> > > > on this patch be selected?
>> > >
>> > > yep, the consensus seemingly was that the compiler knows best when
>> > > to use builtins for some cases (which is probably correct), hence the move
>> > > away from the inline bases.
>> > >
>> > > So I guess the first decision is the compiler's wether to use a builtin or
>> > > the kernel string function (same as for mem*) .
>>
>> Hi Heiko,
>>
>> Thinking about this some more, I'd still like to understand how/when the
>> compiler makes this choice. Are compiler flags involved? Or do some
>> heuristics dictate the choice?
>
>
> 1) Arch-independent expansion is used e.g. for calculating the result for statically known strings.
> 2) Arch-specific expansion (not mainline, but my series can be found here: [1]) can do whatever is reasonable.
> My series emits a bounded unrolled comparison loop for str(n)cmp with a bound that can be adjusted via flag.
> But it only triggers if both strings are known to be aligned.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605993.html
>
> BR
> Christoph
>
>>
>> Thanks,
>> drew
Andrew Jones Jan. 11, 2023, 1:47 p.m. UTC | #10
On Wed, Jan 11, 2023 at 02:39:36PM +0100, Christoph Müllner wrote:
> On Wed, Jan 11, 2023 at 1:34 PM Andrew Jones <ajones@ventanamicro.com>
> wrote:
> 
> > On Tue, Jan 10, 2023 at 12:16:47PM +0100, Andrew Jones wrote:
> > > On Tue, Jan 10, 2023 at 11:46:40AM +0100, Heiko Stübner wrote:
> > > > Hi Andrew,
> > > >
> > > > Am Dienstag, 10. Januar 2023, 10:39:36 CET schrieb Andrew Jones:
> > > > > On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > > > > > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > > > > >
> > > > > > Depending on supported extensions on specific RISC-V cores,
> > > > > > optimized str* functions might make sense.
> > > > > >
> > > > > > This adds basic infrastructure to allow patching the function calls
> > > > > > via alternatives later on.
> > > > > >
> > > > > > The Linux kernel provides standard implementations for string
> > functions
> > > > > > but when architectures want to extend them, they need to provide
> > their
> > > > > > own.
> > > > >
> > > > > And the compiler provides builtins. In the previous series it
> > appeared
> > > > > to be a bad idea to compile the kernel with the compiler's builtins
> > > > > disabled. How will the optimized string functions which will be based
> > > > > on this patch be selected?
> > > >
> > > > yep, the consensus seemingly was that the compiler knows best when
> > > > to use builtins for some cases (which is probably correct), hence the
> > move
> > > > away from the inline bases.
> > > >
> > > > So I guess the first decision is the compiler's wether to use a
> > builtin or
> > > > the kernel string function (same as for mem*) .
> >
> > Hi Heiko,
> >
> > Thinking about this some more, I'd still like to understand how/when the
> > compiler makes this choice. Are compiler flags involved? Or do some
> > heuristics dictate the choice?
> >
> 
> 1) Arch-independent expansion is used e.g. for calculating the result for
> statically known strings.
> 2) Arch-specific expansion (not mainline, but my series can be found here:
> [1]) can do whatever is reasonable.
> My series emits a bounded unrolled comparison loop for str(n)cmp with a
> bound that can be adjusted via flag.
> But it only triggers if both strings are known to be aligned.
> 
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605993.html
> 

Thanks Christoph!

drew
Heiko Stuebner Jan. 12, 2023, 4:05 p.m. UTC | #11
Hi Andrew,

thanks a lot for taking the time to provide these extensive and really
helpful comments.

Am Dienstag, 10. Januar 2023, 13:13:20 CET schrieb Andrew Jones:
> On Mon, Jan 09, 2023 at 07:17:54PM +0100, Heiko Stuebner wrote:
> > From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > 
> > Depending on supported extensions on specific RISC-V cores,
> > optimized str* functions might make sense.
> > 
> > This adds basic infrastructure to allow patching the function calls
> > via alternatives later on.
> > 
> > The Linux kernel provides standard implementations for string functions
> > but when architectures want to extend them, they need to provide their
> > own.
> > 
> > The added generic string functions are done in assembler (taken from
> > disassembling the main-kernel functions for now) to allow us to control
> > the used registers and extend them with optimized variants.
> > 
> > Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
> > ---
> >  arch/riscv/include/asm/string.h | 10 +++++++++
> >  arch/riscv/kernel/riscv_ksyms.c |  3 +++
> >  arch/riscv/lib/Makefile         |  3 +++
> >  arch/riscv/lib/strcmp.S         | 37 ++++++++++++++++++++++++++++++
> >  arch/riscv/lib/strlen.S         | 28 +++++++++++++++++++++++
> >  arch/riscv/lib/strncmp.S        | 40 +++++++++++++++++++++++++++++++++
> >  arch/riscv/purgatory/Makefile   | 13 +++++++++++
> >  7 files changed, 134 insertions(+)
> >  create mode 100644 arch/riscv/lib/strcmp.S
> >  create mode 100644 arch/riscv/lib/strlen.S
> >  create mode 100644 arch/riscv/lib/strncmp.S
> > 
> > diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> > index 909049366555..a96b1fea24fe 100644
> > --- a/arch/riscv/include/asm/string.h
> > +++ b/arch/riscv/include/asm/string.h
> > @@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
> >  #define __HAVE_ARCH_MEMMOVE
> >  extern asmlinkage void *memmove(void *, const void *, size_t);
> >  extern asmlinkage void *__memmove(void *, const void *, size_t);
> > +
> > +#define __HAVE_ARCH_STRCMP
> > +extern asmlinkage int strcmp(const char *cs, const char *ct);
> > +
> > +#define __HAVE_ARCH_STRLEN
> > +extern asmlinkage __kernel_size_t strlen(const char *);
> > +
> > +#define __HAVE_ARCH_STRNCMP
> > +extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
> > +
> >  /* For those files which don't want to check by kasan. */
> >  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
> >  #define memcpy(dst, src, len) __memcpy(dst, src, len)
> > diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
> > index 5ab1c7e1a6ed..a72879b4249a 100644
> > --- a/arch/riscv/kernel/riscv_ksyms.c
> > +++ b/arch/riscv/kernel/riscv_ksyms.c
> > @@ -12,6 +12,9 @@
> >  EXPORT_SYMBOL(memset);
> >  EXPORT_SYMBOL(memcpy);
> >  EXPORT_SYMBOL(memmove);
> > +EXPORT_SYMBOL(strcmp);
> > +EXPORT_SYMBOL(strlen);
> > +EXPORT_SYMBOL(strncmp);
> >  EXPORT_SYMBOL(__memset);
> >  EXPORT_SYMBOL(__memcpy);
> >  EXPORT_SYMBOL(__memmove);
> > diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> > index 25d5c9664e57..6c74b0bedd60 100644
> > --- a/arch/riscv/lib/Makefile
> > +++ b/arch/riscv/lib/Makefile
> > @@ -3,6 +3,9 @@ lib-y			+= delay.o
> >  lib-y			+= memcpy.o
> >  lib-y			+= memset.o
> >  lib-y			+= memmove.o
> > +lib-y			+= strcmp.o
> > +lib-y			+= strlen.o
> > +lib-y			+= strncmp.o
> >  lib-$(CONFIG_MMU)	+= uaccess.o
> >  lib-$(CONFIG_64BIT)	+= tishift.o
> >  
> > diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> > new file mode 100644
> > index 000000000000..94440fb8390c
> > --- /dev/null
> > +++ b/arch/riscv/lib/strcmp.S
> > @@ -0,0 +1,37 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strcmp(const char *cs, const char *ct) */
> > +SYM_FUNC_START(strcmp)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - comparison result, value like strcmp
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - string1
> > +	 *   a1 - string2
> > +	 *
> > +	 * Clobbers
> > +	 *   t0, t1, t2
> > +	 */
> > +	mv	t2, a1
> 
> The above instruction and the 'mv a1, t2' below appear to be attempting
> to preserve a1, but that shouldn't be necessary.

correct and gone now

> > +1:
> > +	lbu	t1, 0(a0)
> > +	lbu	t0, 0(a1)
> 
> I'd rather have t0 be 0(a0) and t1 be 0(a1)

ok

> > +	addi	a0, a0, 1
> > +	addi	a1, a1, 1
> > +	beq	t1, t0, 3f
> > +	li	a0, 1
> > +	bgeu	t1, t0, 2f
> > +	li	a0, -1
> > +2:
> > +	mv	a1, t2
> > +	ret
> > +3:
> > +	bnez	t1, 1b
> > +	li	a0, 0
> > +	j	2b
> 
> For fun I removed one conditional and one unconditional branch (untested)
> 
> 1:
>      lbu     t0, 0(a0)
>      lbu     t1, 0(a1)
>      addi    a0, a0, 1
>      addi    a1, a1, 1
>      bne     t0, t1, 2f
>      bnez    t0, 1b
>      li      a0, 0
>      ret
> 2:
>      slt     a1, t1, t0
>      slli    a1, a1, 1
>      li      a0, -1
>      add     a0, a0, a1
>      ret

yep that
- looks correct
- and also seems to produce correct results

also including your
	  sub     a0, t0, t1
comment from the followup that then produces the same result als the
zbb-variant.

And I've also verified the
- 0, if the s1 and s2 are equal;
- a negative value if s1 is less than s2;
- a positive value if s1 is greater than s2.
return value calling convention with documentation

And added a comment above it, pointing out this fact for the next
person stumbling over this :-)


> > +SYM_FUNC_END(strcmp)
> > diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> > new file mode 100644
> > index 000000000000..09a7aaff26c8
> > --- /dev/null
> > +++ b/arch/riscv/lib/strlen.S
> > @@ -0,0 +1,28 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strlen(const char *s) */
> > +SYM_FUNC_START(strlen)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - string length
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - String to measure
> > +	 *
> > +	 * Clobbers:
> > +	 *   t0, t1
> > +	 */
> > +	mv	t1, a0
> > +1:
> > +	lbu	t0, 0(t1)
> > +	bnez	t0, 2f
> > +	sub	a0, t1, a0
> > +	ret
> > +2:
> > +	addi	t1, t1, 1
> > +	j	1b
> 
> Slightly reorganizing looks better (to me)
> 
>    mv    t1, a0
> 1:
>    lbu   t0, 0(t1)
>    beqz  t0, 2f
>    addi  t1, t1, 1
>    j     1b
> 2:
>    sub a0, t1, a0
>    ret

ok


> > +SYM_FUNC_END(strlen)
> > diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> > new file mode 100644
> > index 000000000000..493ab6febcb2
> > --- /dev/null
> > +++ b/arch/riscv/lib/strncmp.S
> > @@ -0,0 +1,40 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/asm.h>
> > +#include <asm-generic/export.h>
> > +
> > +/* int strncmp(const char *cs, const char *ct, size_t count) */
> > +SYM_FUNC_START(strncmp)
> > +	/*
> > +	 * Returns
> > +	 *   a0 - comparison result, value like strncmp
> > +	 *
> > +	 * Parameters
> > +	 *   a0 - string1
> > +	 *   a1 - string2
> > +	 *   a2 - number of characters to compare
> > +	 *
> > +	 * Clobbers
> > +	 *   t0, t1, t2
> > +	 */
> > +	li	t0, 0
> > +1:
> > +	beq	a2, t0, 4f
> > +	add	t1, a0, t0
> > +	add	t2, a1, t0
> > +	lbu	t1, 0(t1)
> > +	lbu	t2, 0(t2)
> > +	beq	t1, t2, 3f
> > +	li	a0, 1
> > +	bgeu	t1, t2, 2f
> > +	li	a0, -1
> > +2:
> > +	ret
> > +3:
> > +	addi	t0, t0, 1
> > +	bnez	t1, 1b
> > +4:
> > +	li	a0, 0
> > +	j	2b
> 
> (untested)
> 
>      li      t2, 0
> 1:
>      beq     a2, t2, 2f
>      lbu     t0, 0(a0)
>      lbu     t1, 0(a1)
>      addi    a0, a0, 1
>      addi    a1, a1, 1
>      bne     t0, t1, 3f
>      addi    t2, t2, 1
>      bnez    t0, 1b
> 2:
>      li      a0, 0
>      ret
> 3:
>      slt     a1, t1, t0
>      slli    a1, a1, 1
>      li      a0, -1
>      add     a0, a0, a1
>      ret

same here, I did go over the changed assembly and verified that
it produces the same results as the original and then did a second
pass to also add the  
	sub     a0, t0, t1
replacment.


> > +SYM_FUNC_END(strncmp)
> > diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
> > index dd58e1d99397..d16bf715a586 100644
> > --- a/arch/riscv/purgatory/Makefile
> > +++ b/arch/riscv/purgatory/Makefile
> > @@ -2,6 +2,7 @@
> >  OBJECT_FILES_NON_STANDARD := y
> >  
> >  purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
> > +purgatory-y += strcmp.o strlen.o strncmp.o
> >  
> >  targets += $(purgatory-y)
> >  PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
> > @@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
> >  $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
> >  	$(call if_changed_rule,as_o_S)
> >  
> > +$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> > +$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> > +$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
> > +	$(call if_changed_rule,as_o_S)
> > +
> >  $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
> >  	$(call if_changed_rule,cc_o_c)
> >  
> > @@ -77,6 +87,9 @@ CFLAGS_ctype.o			+= $(PURGATORY_CFLAGS)
> >  AFLAGS_REMOVE_entry.o		+= -Wa,-gdwarf-2
> >  AFLAGS_REMOVE_memcpy.o		+= -Wa,-gdwarf-2
> >  AFLAGS_REMOVE_memset.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strcmp.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strlen.o		+= -Wa,-gdwarf-2
> > +AFLAGS_REMOVE_strncmp.o		+= -Wa,-gdwarf-2
> >  
> >  $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
> >  		$(call if_changed,ld)
> >
> 
> With at least the removal of the unnecessary preserving of a1 in strcmp,
> then it looks correct to me, so
> 
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
> 
> But I think there's room for making it more readable, and maybe even
> optimized, as I've tried to do.


Thanks a lot
Heiko
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..a96b1fea24fe 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -18,6 +18,16 @@  extern asmlinkage void *__memcpy(void *, const void *, size_t);
 #define __HAVE_ARCH_MEMMOVE
 extern asmlinkage void *memmove(void *, const void *, size_t);
 extern asmlinkage void *__memmove(void *, const void *, size_t);
+
+#define __HAVE_ARCH_STRCMP
+extern asmlinkage int strcmp(const char *cs, const char *ct);
+
+#define __HAVE_ARCH_STRLEN
+extern asmlinkage __kernel_size_t strlen(const char *);
+
+#define __HAVE_ARCH_STRNCMP
+extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
+
 /* For those files which don't want to check by kasan. */
 #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 5ab1c7e1a6ed..a72879b4249a 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -12,6 +12,9 @@ 
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(__memset);
 EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..6c74b0bedd60 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,6 +3,9 @@  lib-y			+= delay.o
 lib-y			+= memcpy.o
 lib-y			+= memset.o
 lib-y			+= memmove.o
+lib-y			+= strcmp.o
+lib-y			+= strlen.o
+lib-y			+= strncmp.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
new file mode 100644
index 000000000000..94440fb8390c
--- /dev/null
+++ b/arch/riscv/lib/strcmp.S
@@ -0,0 +1,37 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int strcmp(const char *cs, const char *ct) */
+SYM_FUNC_START(strcmp)
+	/*
+	 * Returns
+	 *   a0 - comparison result, value like strcmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *
+	 * Clobbers
+	 *   t0, t1, t2
+	 */
+	mv	t2, a1
+1:
+	lbu	t1, 0(a0)
+	lbu	t0, 0(a1)
+	addi	a0, a0, 1
+	addi	a1, a1, 1
+	beq	t1, t0, 3f
+	li	a0, 1
+	bgeu	t1, t0, 2f
+	li	a0, -1
+2:
+	mv	a1, t2
+	ret
+3:
+	bnez	t1, 1b
+	li	a0, 0
+	j	2b
+SYM_FUNC_END(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
new file mode 100644
index 000000000000..09a7aaff26c8
--- /dev/null
+++ b/arch/riscv/lib/strlen.S
@@ -0,0 +1,28 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int strlen(const char *s) */
+SYM_FUNC_START(strlen)
+	/*
+	 * Returns
+	 *   a0 - string length
+	 *
+	 * Parameters
+	 *   a0 - String to measure
+	 *
+	 * Clobbers:
+	 *   t0, t1
+	 */
+	mv	t1, a0
+1:
+	lbu	t0, 0(t1)
+	bnez	t0, 2f
+	sub	a0, t1, a0
+	ret
+2:
+	addi	t1, t1, 1
+	j	1b
+SYM_FUNC_END(strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
new file mode 100644
index 000000000000..493ab6febcb2
--- /dev/null
+++ b/arch/riscv/lib/strncmp.S
@@ -0,0 +1,40 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+
+/* int strncmp(const char *cs, const char *ct, size_t count) */
+SYM_FUNC_START(strncmp)
+	/*
+	 * Returns
+	 *   a0 - comparison result, value like strncmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *   a2 - number of characters to compare
+	 *
+	 * Clobbers
+	 *   t0, t1, t2
+	 */
+	li	t0, 0
+1:
+	beq	a2, t0, 4f
+	add	t1, a0, t0
+	add	t2, a1, t0
+	lbu	t1, 0(t1)
+	lbu	t2, 0(t2)
+	beq	t1, t2, 3f
+	li	a0, 1
+	bgeu	t1, t2, 2f
+	li	a0, -1
+2:
+	ret
+3:
+	addi	t0, t0, 1
+	bnez	t1, 1b
+4:
+	li	a0, 0
+	j	2b
+SYM_FUNC_END(strncmp)
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index dd58e1d99397..d16bf715a586 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -2,6 +2,7 @@ 
 OBJECT_FILES_NON_STANDARD := y
 
 purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
+purgatory-y += strcmp.o strlen.o strncmp.o
 
 targets += $(purgatory-y)
 PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
@@ -18,6 +19,15 @@  $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
 $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
 	$(call if_changed_rule,as_o_S)
 
+$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
+	$(call if_changed_rule,as_o_S)
+
+$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
+	$(call if_changed_rule,as_o_S)
+
+$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
+	$(call if_changed_rule,as_o_S)
+
 $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
 	$(call if_changed_rule,cc_o_c)
 
@@ -77,6 +87,9 @@  CFLAGS_ctype.o			+= $(PURGATORY_CFLAGS)
 AFLAGS_REMOVE_entry.o		+= -Wa,-gdwarf-2
 AFLAGS_REMOVE_memcpy.o		+= -Wa,-gdwarf-2
 AFLAGS_REMOVE_memset.o		+= -Wa,-gdwarf-2
+AFLAGS_REMOVE_strcmp.o		+= -Wa,-gdwarf-2
+AFLAGS_REMOVE_strlen.o		+= -Wa,-gdwarf-2
+AFLAGS_REMOVE_strncmp.o		+= -Wa,-gdwarf-2
 
 $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
 		$(call if_changed,ld)