
[v5,2/2] RISC-V: add zbb support to string functions

Message ID 20230113212301.3534711-3-heiko@sntech.de (mailing list archive)
State Accepted
Commit b6fcdb191e36f82336f9b5e126d51c02e7323480
Delegated to: Palmer Dabbelt
Series Zbb string optimizations

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes or riscv/for-next

Commit Message

Heiko Stuebner Jan. 13, 2023, 9:23 p.m. UTC
From: Heiko Stuebner <heiko.stuebner@vrull.eu>

Add handling for ZBB extension and add support for using it as a
variant for optimized string functions.

Support for the Zbb-str-variants is limited to the GNU-assembler
for now, as LLVM has not yet acquired the functionality to
selectively change the arch option in assembler code.
This is still under review at
    https://reviews.llvm.org/D123515

Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
---
 arch/riscv/Kconfig                   |  24 ++++++
 arch/riscv/include/asm/errata_list.h |   3 +-
 arch/riscv/include/asm/hwcap.h       |   1 +
 arch/riscv/kernel/cpu.c              |   1 +
 arch/riscv/kernel/cpufeature.c       |  18 +++++
 arch/riscv/lib/strcmp.S              |  85 ++++++++++++++++++++++
 arch/riscv/lib/strlen.S              | 105 +++++++++++++++++++++++++++
 arch/riscv/lib/strncmp.S             |  98 +++++++++++++++++++++++++
 8 files changed, 334 insertions(+), 1 deletion(-)
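
The "selectively change the arch option in assembler code" capability mentioned in the commit message is the assembler feature the string routines below rely on to enable Zbb for just a region of a file; a minimal illustrative sketch of that pattern (not part of the patch itself):

.option push
.option arch,+zbb
	/* Zbb-only instructions such as orc.b are accepted inside this region. */
	orc.b	t1, t0
.option pop

Without this support in the integrated assembler, LLVM builds cannot assemble the Zbb variants, which is why TOOLCHAIN_HAS_ZBB depends on AS_IS_GNU.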

Comments

Conor Dooley Jan. 13, 2023, 10:33 p.m. UTC | #1
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
> 
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
>     https://reviews.llvm.org/D123515
> 
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>

Had a look through, again mostly examining b4 diff...
The changes look to match the suggestions on v4, but I was happy with
the un-optimised version so I have no taste.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>

Thanks,
Conor.
Andrew Jones Jan. 17, 2023, 12:24 p.m. UTC | #2
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
> 
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
>     https://reviews.llvm.org/D123515
> 
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
> ---
>  arch/riscv/Kconfig                   |  24 ++++++
>  arch/riscv/include/asm/errata_list.h |   3 +-
>  arch/riscv/include/asm/hwcap.h       |   1 +
>  arch/riscv/kernel/cpu.c              |   1 +
>  arch/riscv/kernel/cpufeature.c       |  18 +++++
>  arch/riscv/lib/strcmp.S              |  85 ++++++++++++++++++++++
>  arch/riscv/lib/strlen.S              | 105 +++++++++++++++++++++++++++
>  arch/riscv/lib/strncmp.S             |  98 +++++++++++++++++++++++++
>  8 files changed, 334 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e2b656043abf..7c814fbf9527 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -416,6 +416,30 @@ config RISCV_ISA_SVPBMT
>  
>  	   If you don't know what to do here, say Y.
>  
> +config TOOLCHAIN_HAS_ZBB
> +	bool
> +	default y
> +	depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
> +	depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
> +	depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
> +	depends on AS_IS_GNU
> +
> +config RISCV_ISA_ZBB
> +	bool "Zbb extension support for bit manipulation instructions"
> +	depends on TOOLCHAIN_HAS_ZBB
> +	depends on !XIP_KERNEL && MMU
> +	select RISCV_ALTERNATIVE
> +	default y
> +	help
> +	   Adds support to dynamically detect the presence of the ZBB
> +	   extension (basic bit manipulation) and enable its usage.
> +
> +	   The Zbb extension provides instructions to accelerate a number
> +	   of bit-specific operations (count bit population, sign extending,
> +	   bitrotation, etc).
> +
> +	   If you don't know what to do here, say Y.
> +
>  config TOOLCHAIN_HAS_ZICBOM
>  	bool
>  	default y
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index 4180312d2a70..95e626b7281e 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -24,7 +24,8 @@
>  
>  #define	CPUFEATURE_SVPBMT 0
>  #define	CPUFEATURE_ZICBOM 1
> -#define	CPUFEATURE_NUMBER 2
> +#define	CPUFEATURE_ZBB 2
> +#define	CPUFEATURE_NUMBER 3
>  
>  #ifdef __ASSEMBLY__
>  
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 57439da71c77..462d6cde9bac 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -58,6 +58,7 @@ enum riscv_isa_ext_id {
>  	RISCV_ISA_EXT_SSTC,
>  	RISCV_ISA_EXT_SVINVAL,
>  	RISCV_ISA_EXT_SVPBMT,
> +	RISCV_ISA_EXT_ZBB,
>  	RISCV_ISA_EXT_ZICBOM,
>  	RISCV_ISA_EXT_ZIHINTPAUSE,
>  	RISCV_ISA_EXT_ID_MAX
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> index 0bf1c7f663fc..420228e219f7 100644
> --- a/arch/riscv/kernel/cpu.c
> +++ b/arch/riscv/kernel/cpu.c
> @@ -185,6 +185,7 @@ arch_initcall(riscv_cpuinfo_init);
>   * New entries to this struct should follow the ordering rules described above.
>   */
>  static struct riscv_isa_ext_data isa_ext_arr[] = {
> +	__RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),

Zb* comes after Zi* according to 27.11 "Subset Naming Convention". I can hear
you groaning!

The other lists are OK, because we decided to just keep those in
alphabetical order.

>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
>  	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index dde0e91d7668..9899806cef29 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -227,6 +227,7 @@ void __init riscv_fill_hwcap(void)
>  				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>  				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
>  				SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
> +				SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
>  				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
>  				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>  			}
> @@ -302,6 +303,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
>  	return true;
>  }
>  
> +static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
> +{
> +	if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB))
> +		return false;
> +
> +	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
> +		return false;
> +
> +	if (!riscv_isa_extension_available(NULL, ZBB))
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * Probe presence of individual extensions.
>   *
> @@ -320,6 +335,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>  	if (cpufeature_probe_zicbom(stage))
>  		cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
>  
> +	if (cpufeature_probe_zbb(stage))
> +		cpu_req_feature |= BIT(CPUFEATURE_ZBB);
> +
>  	return cpu_req_feature;
>  }
>  
> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> index 8babd712b958..8148b6418f61 100644
> --- a/arch/riscv/lib/strcmp.S
> +++ b/arch/riscv/lib/strcmp.S
> @@ -3,9 +3,14 @@
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
>  #include <asm-generic/export.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/errata_list.h>
>  
>  /* int strcmp(const char *cs, const char *ct) */
>  SYM_FUNC_START(strcmp)
> +
> +	ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> +
>  	/*
>  	 * Returns
>  	 *   a0 - comparison result, value like strcmp
> @@ -33,4 +38,84 @@ SYM_FUNC_START(strcmp)
>  	 */
>  	sub	a0, t0, t1
>  	ret
> +
> +/*
> + * Variant of strcmp using the ZBB extension if available

Could call out the source of this being Appendix A of the bitmanip manual.
Same suggestion for the other variants' header comments as well.
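
For instance, the variant header could read something like this (wording only illustrative):

/*
 * Variant of strcmp using the ZBB extension if available.
 * Based on the example implementation in Appendix A of the RISC-V
 * Bitmanip specification, adapted to kernel conventions.
 */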

> + */
> +#ifdef CONFIG_RISCV_ISA_ZBB
> +strcmp_zbb:
> +
> +.option push
> +.option arch,+zbb
> +
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, value like strcmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2, t3, t4, t5

t5 isn't used

> +	 */
> +
> +	or	t2, a0, a1
> +	li	t4, -1
> +	and	t2, t2, SZREG-1
> +	bnez	t2, 3f
> +
> +	/* Main loop for aligned string.  */
> +	.p2align 3
> +1:
> +	REG_L	t0, 0(a0)
> +	REG_L	t1, 0(a1)
> +	orc.b	t3, t0
> +	bne	t3, t4, 2f
> +	addi	a0, a0, SZREG
> +	addi	a1, a1, SZREG
> +	beq	t0, t1, 1b
> +
> +	/*
> +	 * Words don't match, and no null byte in the first
> +	 * word. Get bytes in big-endian order and compare.
> +	 */
> +#ifndef CONFIG_CPU_BIG_ENDIAN
> +	rev8	t0, t0
> +	rev8	t1, t1
> +#endif
> +
> +	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
> +	sltu	a0, t0, t1
> +	neg	a0, a0
> +	ori	a0, a0, 1
> +	ret
> +
> +2:
> +	/*
> +	 * Found a null byte.
> +	 * If words don't match, fall back to simple loop.
> +	 */
> +	bne	t0, t1, 3f
> +
> +	/* Otherwise, strings are equal. */
> +	li	a0, 0
> +	ret
> +
> +	/* Simple loop for misaligned strings. */
> +	.p2align 3
> +3:
> +	lbu	t0, 0(a0)
> +	lbu	t1, 0(a1)
> +	addi	a0, a0, 1
> +	addi	a1, a1, 1
> +	bne	t0, t1, 4f
> +	bnez	t0, 3b
> +
> +4:
> +	sub	a0, t0, t1
> +	ret
> +
> +.option pop
> +#endif
>  SYM_FUNC_END(strcmp)
> diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> index 0a3b11853efd..0f9dbf93301a 100644
> --- a/arch/riscv/lib/strlen.S
> +++ b/arch/riscv/lib/strlen.S
> @@ -3,9 +3,14 @@
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
>  #include <asm-generic/export.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/errata_list.h>
>  
>  /* int strlen(const char *s) */
>  SYM_FUNC_START(strlen)
> +
> +	ALTERNATIVE("nop", "j strlen_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> +
>  	/*
>  	 * Returns
>  	 *   a0 - string length
> @@ -25,4 +30,104 @@ SYM_FUNC_START(strlen)
>  2:
>  	sub	a0, t1, a0
>  	ret
> +
> +/*
> + * Variant of strlen using the ZBB extension if available
> + */
> +#ifdef CONFIG_RISCV_ISA_ZBB
> +strlen_zbb:
> +
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> +# define CZ	clz
> +# define SHIFT	sll
> +#else
> +# define CZ	ctz
> +# define SHIFT	srl
> +#endif
> +
> +.option push
> +.option arch,+zbb
> +
> +	/*
> +	 * Returns
> +	 *   a0 - string length
> +	 *
> +	 * Parameters
> +	 *   a0 - String to measure
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2, t3
> +	 */
> +
> +	/* Number of irrelevant bytes in the first word. */
> +	andi	t2, a0, SZREG-1
> +
> +	/* Align pointer. */
> +	andi	t0, a0, -SZREG
> +
> +	li	t3, SZREG
> +	sub	t3, t3, t2
> +	slli	t2, t2, 3
> +
> +	/* Get the first word.  */
> +	REG_L	t1, 0(t0)
> +
> +	/*
> +	 * Shift away the partial data we loaded to remove the irrelevant bytes
> +	 * preceding the string with the effect of adding NUL bytes at the
> +	 * end of the string's first word.
> +	 */
> +	SHIFT	t1, t1, t2
> +
> +	/* Convert non-NUL into 0xff and NUL into 0x00. */
> +	orc.b	t1, t1
> +
> +	/* Convert non-NUL into 0x00 and NUL into 0xff. */
> +	not	t1, t1
> +
> +	/*
> +	 * Search for the first set bit (corresponding to a NUL byte in the
> +	 * original chunk).
> +	 */
> +	CZ	t1, t1
> +
> +	/*
> +	 * The first chunk is special: compare against the number
> +	 * of valid bytes in this chunk.
> +	 */
> +	srli	a0, t1, 3
> +	bgtu	t3, a0, 3f
> +
> +	/* Prepare for the word comparison loop. */
> +	addi	t2, t0, SZREG
> +	li	t3, -1
> +
> +	/*
> +	 * Our critical loop is 4 instructions and processes data in
> +	 * 4 byte or 8 byte chunks.
> +	 */
> +	.p2align 3
> +1:
> +	REG_L	t1, SZREG(t0)
> +	addi	t0, t0, SZREG
> +	orc.b	t1, t1
> +	beq	t1, t3, 1b
> +2:

This '2' label is never used, so it can be removed. I see the appendix also
has an unused label here, .Lepilogue, which could also be removed.

> +	not	t1, t1
> +	CZ	t1, t1
> +
> +	/* Get number of processed words.  */
                                   ^ bytes

> +	sub	t2, t0, t2
> +
> +	/* Add number of characters in the first word.  */
> +	add	a0, a0, t2
> +	srli	t1, t1, 3

I'd move this shift up under the CZ a few lines above or down under the
next comment since it's not related to what the comment above it says.
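
Concretely, the tail could then read as follows (sketch only; this also drops the unused '2' label noted above and changes "words" to "bytes" per the earlier comment):

	not	t1, t1
	CZ	t1, t1
	/* Convert the bit position of the NUL into a byte count. */
	srli	t1, t1, 3

	/* Get number of processed bytes.  */
	sub	t2, t0, t2

	/* Add number of characters in the first word.  */
	add	a0, a0, t2

	/* Add number of characters in the last word.  */
	add	a0, a0, t1
3:
	ret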

> +
> +	/* Add number of characters in the last word.  */
> +	add	a0, a0, t1
> +3:
> +	ret
> +
> +.option pop
> +#endif
>  SYM_FUNC_END(strlen)
> diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> index 1f644d0a93f6..7940ddab2d48 100644
> --- a/arch/riscv/lib/strncmp.S
> +++ b/arch/riscv/lib/strncmp.S
> @@ -3,9 +3,14 @@
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
>  #include <asm-generic/export.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/errata_list.h>
>  
>  /* int strncmp(const char *cs, const char *ct, size_t count) */
>  SYM_FUNC_START(strncmp)
> +
> +	ALTERNATIVE("nop", "j strncmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> +
>  	/*
>  	 * Returns
>  	 *   a0 - comparison result, value like strncmp
> @@ -38,4 +43,97 @@ SYM_FUNC_START(strncmp)
>  	 */
>  	sub	a0, t0, t1
>  	ret
> +
> +/*
> + * Variant of strncmp using the ZBB extension if available
> + */
> +#ifdef CONFIG_RISCV_ISA_ZBB
> +strncmp_zbb:
> +
> +.option push
> +.option arch,+zbb
> +
> +	/*
> +	 * Returns
> +	 *   a0 - comparison result, like strncmp
> +	 *
> +	 * Parameters
> +	 *   a0 - string1
> +	 *   a1 - string2
> +	 *   a2 - number of characters to compare
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2, t3, t4, t5, t6
> +	 */
> +
> +	or	t2, a0, a1
> +	li	t5, -1
> +	and	t2, t2, SZREG-1
> +	add	t4, a0, a2
> +	bnez	t2, 4f
> +
> +	/* Adjust limit for fast-path.  */
> +	andi	t6, t4, -SZREG
> +
> +	/* Main loop for aligned string.  */
> +	.p2align 3
> +1:
> +	bgt	a0, t6, 3f
> +	REG_L	t0, 0(a0)
> +	REG_L	t1, 0(a1)
> +	orc.b	t3, t0
> +	bne	t3, t5, 2f
> +	addi	a0, a0, SZREG
> +	addi	a1, a1, SZREG
> +	beq	t0, t1, 1b
> +
> +	/*
> +	 * Words don't match, and no null byte in the first
> +	 * word. Get bytes in big-endian order and compare.
> +	 */
> +#ifndef CONFIG_CPU_BIG_ENDIAN
> +	rev8	t0, t0
> +	rev8	t1, t1
> +#endif
> +
> +	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence.  */
> +	sltu	a0, t0, t1
> +	neg	a0, a0
> +	ori	a0, a0, 1
> +	ret
> +
> +2:
> +	/*
> +	 * Found a null byte.
> +	 * If words don't match, fall back to simple loop.
> +	 */
> +	bne	t0, t1, 3f
> +
> +	/* Otherwise, strings are equal.  */
> +	li	a0, 0
> +	ret
> +
> +	/* Simple loop for misaligned strings.  */
> +3:

This label isn't any different from the next one, label 4, so it should be
removed.


> +	/* Restore limit for slow-path.  */

nit: We're not really "restoring" anything, but rather using the limit,
which is saved in a different register.
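
Together with the label comment above, the slow path could look like this (sketch only; the earlier branches to 3f would then target 4f instead):

	/* Simple loop for misaligned strings, bounded by the limit in t4.  */
	.p2align 3
4:
	bge	a0, t4, 6f
	lbu	t0, 0(a0)
	lbu	t1, 0(a1)
	addi	a0, a0, 1
	addi	a1, a1, 1
	bne	t0, t1, 5f
	bnez	t0, 4b

5:
	sub	a0, t0, t1
	ret

6:
	li	a0, 0
	ret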

> +	.p2align 3
> +4:
> +	bge	a0, t4, 6f
> +	lbu	t0, 0(a0)
> +	lbu	t1, 0(a1)
> +	addi	a0, a0, 1
> +	addi	a1, a1, 1
> +	bne	t0, t1, 5f
> +	bnez	t0, 4b
> +
> +5:
> +	sub	a0, t0, t1
> +	ret
> +
> +6:
> +	li	a0, 0
> +	ret
> +
> +.option pop
> +#endif
>  SYM_FUNC_END(strncmp)
> -- 
> 2.35.1
>

Thanks,
drew
Andrew Jones Jan. 20, 2023, 3:02 p.m. UTC | #3
On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
...
> diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> index 8babd712b958..8148b6418f61 100644
> --- a/arch/riscv/lib/strcmp.S
> +++ b/arch/riscv/lib/strcmp.S
> @@ -3,9 +3,14 @@
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
>  #include <asm-generic/export.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/errata_list.h>
>  
>  /* int strcmp(const char *cs, const char *ct) */
>  SYM_FUNC_START(strcmp)
> +
> +	ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> +

I have something similar for Zicboz (which I've just reread as I'm
preparing v2). The difference is that I opted to penalize the non-
optimized version with the unconditional jump and give the optimized
version the nop. To do that here, it'd just need the label changed to
strcmp_basic or whatever, push the "basic" code down into it the new
label, and then put the new zbb code here.

Thanks,
drew
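
A minimal sketch of the arrangement described above, with the basic version paying the unconditional jump and the Zbb version falling through the patched nop (strcmp_basic is only a placeholder name and the Zbb body is elided; this is not part of the posted patch):

/* int strcmp(const char *cs, const char *ct) */
SYM_FUNC_START(strcmp)

	ALTERNATIVE("j strcmp_basic", "nop", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)

#ifdef CONFIG_RISCV_ISA_ZBB
.option push
.option arch,+zbb
	/* Zbb word-at-a-time variant goes here and ends in a ret. */
.option pop
#endif

strcmp_basic:
	/*
	 * Byte-at-a-time loop, reached via the jump (or directly when
	 * CONFIG_RISCV_ISA_ZBB is disabled).
	 */
1:
	lbu	t0, 0(a0)
	lbu	t1, 0(a1)
	addi	a0, a0, 1
	addi	a1, a1, 1
	bne	t0, t1, 2f
	bnez	t0, 1b
2:
	sub	a0, t0, t1
	ret
SYM_FUNC_END(strcmp)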
Guenter Roeck Feb. 12, 2023, 3:47 p.m. UTC | #4
Hi,

On Fri, Jan 13, 2023 at 10:23:01PM +0100, Heiko Stuebner wrote:
> From: Heiko Stuebner <heiko.stuebner@vrull.eu>
> 
> Add handling for ZBB extension and add support for using it as a
> variant for optimized string functions.
> 
> Support for the Zbb-str-variants is limited to the GNU-assembler
> for now, as LLVM has not yet acquired the functionality to
> selectively change the arch option in assembler code.
> This is still under review at
>     https://reviews.llvm.org/D123515
> 
> Co-developed-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Christoph Muellner <christoph.muellner@vrull.eu>
> Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>

With this patch in linux-next, the qemu "virt" emulation no longer finds
a root filesystem. I don't know how it is related, but the problem is
seen for all types of devices I tried to boot from. Booting from initrd
still works.

There are also repeated

WARNING: CPU: 0 PID: 0 at arch/riscv/kernel/patch.c:63 patch_insn_write+0x222/0x2f6

in the log, but that appears to be unrelated.

Guenter

---
bisect log:

# bad: [6ba8a227fd19d19779005fb66ad7562608e1df83] Add linux-next specific files for 20230210
# good: [4ec5183ec48656cec489c49f989c508b68b518e3] Linux 6.2-rc7
git bisect start 'HEAD' 'v6.2-rc7'
# bad: [94613f0efc69ed41f9229ef5c294db3ec37145da] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
git bisect bad 94613f0efc69ed41f9229ef5c294db3ec37145da
# bad: [8928ece68de4371dc6e1d3336d3029c1e18ae3b4] Merge branch 'for_next' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git
git bisect bad 8928ece68de4371dc6e1d3336d3029c1e18ae3b4
# good: [78a9f460e33d103256f3abb38f02f4759710c7dc] soc: document merges
git bisect good 78a9f460e33d103256f3abb38f02f4759710c7dc
# good: [b35b2472ebafa29d0bbe79fbee1da6ef3c4ec619] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git
git bisect good b35b2472ebafa29d0bbe79fbee1da6ef3c4ec619
# bad: [57a87a64b520c37dd49f5fde84d09a4adb391180] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git
git bisect bad 57a87a64b520c37dd49f5fde84d09a4adb391180
# good: [cfc8ba01cc84b24ec6eb293ec9fba893f7cd4581] Merge branch 'clk-next' of git://git.kernel.org/pub/scm/linux/kernel/git/clk/linux.git
git bisect good cfc8ba01cc84b24ec6eb293ec9fba893f7cd4581
# good: [6acecfa485d3de955c35a18730c106ddf1e7600e] powerpc/kcsan: Add KCSAN Support
git bisect good 6acecfa485d3de955c35a18730c106ddf1e7600e
# good: [8a16dea453dbc3e834b162640028e505882cd11e] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
git bisect good 8a16dea453dbc3e834b162640028e505882cd11e
# good: [6be1ff430dab9fc047762b10b2c9669399ea1f37] riscv: pgtable: Fixup comment for KERN_VIRT_SIZE
git bisect good 6be1ff430dab9fc047762b10b2c9669399ea1f37
# good: [e0c267e03b0c77c9ac79ac08eada41ba8eb1b95f] riscv: module: move find_section to module.h
git bisect good e0c267e03b0c77c9ac79ac08eada41ba8eb1b95f
# good: [e8ad17d2b5f38e595d597a3e2419d6d7cc727b17] riscv: KVM: Switch has_svinval() to riscv_has_extension_unlikely()
git bisect good e8ad17d2b5f38e595d597a3e2419d6d7cc727b17
# bad: [75ab93a244a516d1d3c03c4e27d5d0deff76ebfb] Merge patch series "Zbb string optimizations"
git bisect bad 75ab93a244a516d1d3c03c4e27d5d0deff76ebfb
# bad: [b6fcdb191e36f82336f9b5e126d51c02e7323480] RISC-V: add zbb support to string functions
git bisect bad b6fcdb191e36f82336f9b5e126d51c02e7323480
# good: [56e0790c7f9e59ba6a0f4b59981d1d6fbf43efb0] RISC-V: add infrastructure to allow different str* implementations
git bisect good 56e0790c7f9e59ba6a0f4b59981d1d6fbf43efb0
# first bad commit: [b6fcdb191e36f82336f9b5e126d51c02e7323480] RISC-V: add zbb support to string functions

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e2b656043abf..7c814fbf9527 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -416,6 +416,30 @@  config RISCV_ISA_SVPBMT
 
 	   If you don't know what to do here, say Y.
 
+config TOOLCHAIN_HAS_ZBB
+	bool
+	default y
+	depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
+	depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
+	depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
+	depends on AS_IS_GNU
+
+config RISCV_ISA_ZBB
+	bool "Zbb extension support for bit manipulation instructions"
+	depends on TOOLCHAIN_HAS_ZBB
+	depends on !XIP_KERNEL && MMU
+	select RISCV_ALTERNATIVE
+	default y
+	help
+	   Adds support to dynamically detect the presence of the ZBB
+	   extension (basic bit manipulation) and enable its usage.
+
+	   The Zbb extension provides instructions to accelerate a number
+	   of bit-specific operations (count bit population, sign extending,
+	   bitrotation, etc).
+
+	   If you don't know what to do here, say Y.
+
 config TOOLCHAIN_HAS_ZICBOM
 	bool
 	default y
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index 4180312d2a70..95e626b7281e 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -24,7 +24,8 @@ 
 
 #define	CPUFEATURE_SVPBMT 0
 #define	CPUFEATURE_ZICBOM 1
-#define	CPUFEATURE_NUMBER 2
+#define	CPUFEATURE_ZBB 2
+#define	CPUFEATURE_NUMBER 3
 
 #ifdef __ASSEMBLY__
 
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 57439da71c77..462d6cde9bac 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -58,6 +58,7 @@  enum riscv_isa_ext_id {
 	RISCV_ISA_EXT_SSTC,
 	RISCV_ISA_EXT_SVINVAL,
 	RISCV_ISA_EXT_SVPBMT,
+	RISCV_ISA_EXT_ZBB,
 	RISCV_ISA_EXT_ZICBOM,
 	RISCV_ISA_EXT_ZIHINTPAUSE,
 	RISCV_ISA_EXT_ID_MAX
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 0bf1c7f663fc..420228e219f7 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -185,6 +185,7 @@  arch_initcall(riscv_cpuinfo_init);
  * New entries to this struct should follow the ordering rules described above.
  */
 static struct riscv_isa_ext_data isa_ext_arr[] = {
+	__RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
 	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
 	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
 	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index dde0e91d7668..9899806cef29 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -227,6 +227,7 @@  void __init riscv_fill_hwcap(void)
 				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
 				SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
 				SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
+				SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
 				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
 				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
 			}
@@ -302,6 +303,20 @@  static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
 	return true;
 }
 
+static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB))
+		return false;
+
+	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+		return false;
+
+	if (!riscv_isa_extension_available(NULL, ZBB))
+		return false;
+
+	return true;
+}
+
 /*
  * Probe presence of individual extensions.
  *
@@ -320,6 +335,9 @@  static u32 __init_or_module cpufeature_probe(unsigned int stage)
 	if (cpufeature_probe_zicbom(stage))
 		cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
 
+	if (cpufeature_probe_zbb(stage))
+		cpu_req_feature |= BIT(CPUFEATURE_ZBB);
+
 	return cpu_req_feature;
 }
 
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
index 8babd712b958..8148b6418f61 100644
--- a/arch/riscv/lib/strcmp.S
+++ b/arch/riscv/lib/strcmp.S
@@ -3,9 +3,14 @@ 
 #include <linux/linkage.h>
 #include <asm/asm.h>
 #include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
 
 /* int strcmp(const char *cs, const char *ct) */
 SYM_FUNC_START(strcmp)
+
+	ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
 	/*
 	 * Returns
 	 *   a0 - comparison result, value like strcmp
@@ -33,4 +38,84 @@  SYM_FUNC_START(strcmp)
 	 */
 	sub	a0, t0, t1
 	ret
+
+/*
+ * Variant of strcmp using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strcmp_zbb:
+
+.option push
+.option arch,+zbb
+
+	/*
+	 * Returns
+	 *   a0 - comparison result, value like strcmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3, t4, t5
+	 */
+
+	or	t2, a0, a1
+	li	t4, -1
+	and	t2, t2, SZREG-1
+	bnez	t2, 3f
+
+	/* Main loop for aligned string.  */
+	.p2align 3
+1:
+	REG_L	t0, 0(a0)
+	REG_L	t1, 0(a1)
+	orc.b	t3, t0
+	bne	t3, t4, 2f
+	addi	a0, a0, SZREG
+	addi	a1, a1, SZREG
+	beq	t0, t1, 1b
+
+	/*
+	 * Words don't match, and no null byte in the first
+	 * word. Get bytes in big-endian order and compare.
+	 */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+	rev8	t0, t0
+	rev8	t1, t1
+#endif
+
+	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+	sltu	a0, t0, t1
+	neg	a0, a0
+	ori	a0, a0, 1
+	ret
+
+2:
+	/*
+	 * Found a null byte.
+	 * If words don't match, fall back to simple loop.
+	 */
+	bne	t0, t1, 3f
+
+	/* Otherwise, strings are equal. */
+	li	a0, 0
+	ret
+
+	/* Simple loop for misaligned strings. */
+	.p2align 3
+3:
+	lbu	t0, 0(a0)
+	lbu	t1, 0(a1)
+	addi	a0, a0, 1
+	addi	a1, a1, 1
+	bne	t0, t1, 4f
+	bnez	t0, 3b
+
+4:
+	sub	a0, t0, t1
+	ret
+
+.option pop
+#endif
 SYM_FUNC_END(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
index 0a3b11853efd..0f9dbf93301a 100644
--- a/arch/riscv/lib/strlen.S
+++ b/arch/riscv/lib/strlen.S
@@ -3,9 +3,14 @@ 
 #include <linux/linkage.h>
 #include <asm/asm.h>
 #include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
 
 /* int strlen(const char *s) */
 SYM_FUNC_START(strlen)
+
+	ALTERNATIVE("nop", "j strlen_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
 	/*
 	 * Returns
 	 *   a0 - string length
@@ -25,4 +30,104 @@  SYM_FUNC_START(strlen)
 2:
 	sub	a0, t1, a0
 	ret
+
+/*
+ * Variant of strlen using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strlen_zbb:
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ	clz
+# define SHIFT	sll
+#else
+# define CZ	ctz
+# define SHIFT	srl
+#endif
+
+.option push
+.option arch,+zbb
+
+	/*
+	 * Returns
+	 *   a0 - string length
+	 *
+	 * Parameters
+	 *   a0 - String to measure
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3
+	 */
+
+	/* Number of irrelevant bytes in the first word. */
+	andi	t2, a0, SZREG-1
+
+	/* Align pointer. */
+	andi	t0, a0, -SZREG
+
+	li	t3, SZREG
+	sub	t3, t3, t2
+	slli	t2, t2, 3
+
+	/* Get the first word.  */
+	REG_L	t1, 0(t0)
+
+	/*
+	 * Shift away the partial data we loaded to remove the irrelevant bytes
+	 * preceding the string with the effect of adding NUL bytes at the
+	 * end of the string's first word.
+	 */
+	SHIFT	t1, t1, t2
+
+	/* Convert non-NUL into 0xff and NUL into 0x00. */
+	orc.b	t1, t1
+
+	/* Convert non-NUL into 0x00 and NUL into 0xff. */
+	not	t1, t1
+
+	/*
+	 * Search for the first set bit (corresponding to a NUL byte in the
+	 * original chunk).
+	 */
+	CZ	t1, t1
+
+	/*
+	 * The first chunk is special: compare against the number
+	 * of valid bytes in this chunk.
+	 */
+	srli	a0, t1, 3
+	bgtu	t3, a0, 3f
+
+	/* Prepare for the word comparison loop. */
+	addi	t2, t0, SZREG
+	li	t3, -1
+
+	/*
+	 * Our critical loop is 4 instructions and processes data in
+	 * 4 byte or 8 byte chunks.
+	 */
+	.p2align 3
+1:
+	REG_L	t1, SZREG(t0)
+	addi	t0, t0, SZREG
+	orc.b	t1, t1
+	beq	t1, t3, 1b
+2:
+	not	t1, t1
+	CZ	t1, t1
+
+	/* Get number of processed words.  */
+	sub	t2, t0, t2
+
+	/* Add number of characters in the first word.  */
+	add	a0, a0, t2
+	srli	t1, t1, 3
+
+	/* Add number of characters in the last word.  */
+	add	a0, a0, t1
+3:
+	ret
+
+.option pop
+#endif
 SYM_FUNC_END(strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
index 1f644d0a93f6..7940ddab2d48 100644
--- a/arch/riscv/lib/strncmp.S
+++ b/arch/riscv/lib/strncmp.S
@@ -3,9 +3,14 @@ 
 #include <linux/linkage.h>
 #include <asm/asm.h>
 #include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
 
 /* int strncmp(const char *cs, const char *ct, size_t count) */
 SYM_FUNC_START(strncmp)
+
+	ALTERNATIVE("nop", "j strncmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
+
 	/*
 	 * Returns
 	 *   a0 - comparison result, value like strncmp
@@ -38,4 +43,97 @@  SYM_FUNC_START(strncmp)
 	 */
 	sub	a0, t0, t1
 	ret
+
+/*
+ * Variant of strncmp using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strncmp_zbb:
+
+.option push
+.option arch,+zbb
+
+	/*
+	 * Returns
+	 *   a0 - comparison result, like strncmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *   a2 - number of characters to compare
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3, t4, t5, t6
+	 */
+
+	or	t2, a0, a1
+	li	t5, -1
+	and	t2, t2, SZREG-1
+	add	t4, a0, a2
+	bnez	t2, 4f
+
+	/* Adjust limit for fast-path.  */
+	andi	t6, t4, -SZREG
+
+	/* Main loop for aligned string.  */
+	.p2align 3
+1:
+	bgt	a0, t6, 3f
+	REG_L	t0, 0(a0)
+	REG_L	t1, 0(a1)
+	orc.b	t3, t0
+	bne	t3, t5, 2f
+	addi	a0, a0, SZREG
+	addi	a1, a1, SZREG
+	beq	t0, t1, 1b
+
+	/*
+	 * Words don't match, and no null byte in the first
+	 * word. Get bytes in big-endian order and compare.
+	 */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+	rev8	t0, t0
+	rev8	t1, t1
+#endif
+
+	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence.  */
+	sltu	a0, t0, t1
+	neg	a0, a0
+	ori	a0, a0, 1
+	ret
+
+2:
+	/*
+	 * Found a null byte.
+	 * If words don't match, fall back to simple loop.
+	 */
+	bne	t0, t1, 3f
+
+	/* Otherwise, strings are equal.  */
+	li	a0, 0
+	ret
+
+	/* Simple loop for misaligned strings.  */
+3:
+	/* Restore limit for slow-path.  */
+	.p2align 3
+4:
+	bge	a0, t4, 6f
+	lbu	t0, 0(a0)
+	lbu	t1, 0(a1)
+	addi	a0, a0, 1
+	addi	a1, a1, 1
+	bne	t0, t1, 5f
+	bnez	t0, 4b
+
+5:
+	sub	a0, t0, t1
+	ret
+
+6:
+	li	a0, 0
+	ret
+
+.option pop
+#endif
 SYM_FUNC_END(strncmp)