diff mbox series

[v3] riscv: lib: optimize memcmp with ld insn

Message ID 20220906115359.173660-1-zouyipeng@huawei.com (mailing list archive)
State New, archived
Headers show
Series [v3] riscv: lib: optimize memcmp with ld insn | expand

Commit Message

Yipeng Zou Sept. 6, 2022, 11:53 a.m. UTC
Currently memcmp was implemented in c code(lib/string.c), which compare
memory per byte.

This patch use ld insn compare memory per word to improve. From the test
Results, this will take several times optimized.

Alloc 8,4,1KB buffer to compare, each loop 10k times:

Size(B) Min(ns) AVG(ns) //before

8k      40800   46316
4k      26500   32302
1k      15600   17965

Size(B) Min(ns) AVG(ns) //after

8k      16100   21281
4k      14200   16446
1k      12400   14316

Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
---
V2: Patch test data into the commit message,and collect Reviewed-by
Tags.
V3: Fix some spelling mistakes. Improve register naming and coding style.

 arch/riscv/include/asm/string.h |  3 ++
 arch/riscv/lib/Makefile         |  1 +
 arch/riscv/lib/memcmp.S         | 58 +++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 arch/riscv/lib/memcmp.S

Comments

Andrew Jones Sept. 6, 2022, 12:38 p.m. UTC | #1
On Tue, Sep 06, 2022 at 07:53:59PM +0800, Yipeng Zou wrote:
> Currently memcmp was implemented in c code(lib/string.c), which compare
> memory per byte.
> 
> This patch use ld insn compare memory per word to improve. From the test
> Results, this will take several times optimized.
> 
> Alloc 8,4,1KB buffer to compare, each loop 10k times:
> 
> Size(B) Min(ns) AVG(ns) //before
> 
> 8k      40800   46316
> 4k      26500   32302
> 1k      15600   17965
> 
> Size(B) Min(ns) AVG(ns) //after
> 
> 8k      16100   21281
> 4k      14200   16446
> 1k      12400   14316
> 
> Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
> Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
> ---
> V2: Patch test data into the commit message,and collect Reviewed-by
> Tags.
> V3: Fix some spelling mistakes. Improve register naming and coding style.
> 
>  arch/riscv/include/asm/string.h |  3 ++
>  arch/riscv/lib/Makefile         |  1 +
>  arch/riscv/lib/memcmp.S         | 58 +++++++++++++++++++++++++++++++++
>  3 files changed, 62 insertions(+)
>  create mode 100644 arch/riscv/lib/memcmp.S
> 
> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> index 909049366555..3337b43d3803 100644
> --- a/arch/riscv/include/asm/string.h
> +++ b/arch/riscv/include/asm/string.h
> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>  #define __HAVE_ARCH_MEMMOVE
>  extern asmlinkage void *memmove(void *, const void *, size_t);
>  extern asmlinkage void *__memmove(void *, const void *, size_t);
> +#define __HAVE_ARCH_MEMCMP
> +extern int memcmp(const void *, const void *, size_t);
> +
>  /* For those files which don't want to check by kasan. */
>  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>  #define memcpy(dst, src, len) __memcpy(dst, src, len)
> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> index 25d5c9664e57..70773bf0c471 100644
> --- a/arch/riscv/lib/Makefile
> +++ b/arch/riscv/lib/Makefile
> @@ -3,6 +3,7 @@ lib-y			+= delay.o
>  lib-y			+= memcpy.o
>  lib-y			+= memset.o
>  lib-y			+= memmove.o
> +lib-y			+= memcmp.o
>  lib-$(CONFIG_MMU)	+= uaccess.o
>  lib-$(CONFIG_64BIT)	+= tishift.o
>  
> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
> new file mode 100644
> index 000000000000..eea5cc40e081
> --- /dev/null
> +++ b/arch/riscv/lib/memcmp.S
> @@ -0,0 +1,58 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2022 zouyipeng@huawei.com
> + */
> +#include <linux/linkage.h>
> +#include <asm-generic/export.h>
> +#include <asm/asm.h>
> +
> +/*
> + Input Arguments:
> + a0: addr0
> + a1: addr1
> + a2: buffer size
> +
> + Output:
> + a0: return value
> +*/
> +#define data0	a3
> +#define data1	a4
> +#define tmp	t3
> +#define tail	t4
> +
> +/* load and compare */
> +.macro LD_CMP op d0 d1 a0 a1 t1 offset
> +	\op	\d0, 0(\a0)
> +	\op	\d1, 0(\a1)
> +	addi	\a0, \a0, \offset
> +	addi	\a1, \a1, \offset
> +	sub	\t1, \d0, \d1
> +.endm
> +
> +ENTRY(memcmp)
> +	/* test size aligned with SZREG */
> +	andi	tmp, a2, SZREG - 1
> +	/* load tail */
> +	add	tail, a0, a2
> +	sub	tail, tail, tmp
> +	add	a2, a0, a2
> +
> +.LloopWord:
> +	sltu	tmp, a0, tail
> +	beqz	tmp, .LloopByte
> +
> +	LD_CMP	REG_L data0 data1 a0 a1 tmp SZREG
> +	beqz	tmp, .LloopWord
> +	j	.Lreturn
> +
> +.LloopByte:
> +	sltu	tmp, a0, a2
> +	beqz	tmp, .Lreturn
> +
> +	LD_CMP	lbu data0 data1 a0 a1 tmp 1
> +	beqz	tmp, .LloopByte
> +.Lreturn:
> +	mv	a0, tmp
> +	ret
> +END(memcmp)
> +EXPORT_SYMBOL(memcmp);
> -- 
> 2.17.1
>

Thanks, looks nice.

Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Palmer Dabbelt Oct. 13, 2022, 3:23 a.m. UTC | #2
On Tue, 06 Sep 2022 05:38:14 PDT (-0700), ajones@ventanamicro.com wrote:
> On Tue, Sep 06, 2022 at 07:53:59PM +0800, Yipeng Zou wrote:
>> Currently memcmp was implemented in c code(lib/string.c), which compare
>> memory per byte.
>>
>> This patch use ld insn compare memory per word to improve. From the test
>> Results, this will take several times optimized.
>>
>> Alloc 8,4,1KB buffer to compare, each loop 10k times:
>>
>> Size(B) Min(ns) AVG(ns) //before
>>
>> 8k      40800   46316
>> 4k      26500   32302
>> 1k      15600   17965
>>
>> Size(B) Min(ns) AVG(ns) //after
>>
>> 8k      16100   21281
>> 4k      14200   16446
>> 1k      12400   14316
>>
>> Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
>> Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
>> ---
>> V2: Patch test data into the commit message,and collect Reviewed-by
>> Tags.
>> V3: Fix some spelling mistakes. Improve register naming and coding style.
>>
>>  arch/riscv/include/asm/string.h |  3 ++
>>  arch/riscv/lib/Makefile         |  1 +
>>  arch/riscv/lib/memcmp.S         | 58 +++++++++++++++++++++++++++++++++
>>  3 files changed, 62 insertions(+)
>>  create mode 100644 arch/riscv/lib/memcmp.S
>>
>> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
>> index 909049366555..3337b43d3803 100644
>> --- a/arch/riscv/include/asm/string.h
>> +++ b/arch/riscv/include/asm/string.h
>> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>>  #define __HAVE_ARCH_MEMMOVE
>>  extern asmlinkage void *memmove(void *, const void *, size_t);
>>  extern asmlinkage void *__memmove(void *, const void *, size_t);
>> +#define __HAVE_ARCH_MEMCMP
>> +extern int memcmp(const void *, const void *, size_t);
>> +
>>  /* For those files which don't want to check by kasan. */
>>  #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>>  #define memcpy(dst, src, len) __memcpy(dst, src, len)
>> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
>> index 25d5c9664e57..70773bf0c471 100644
>> --- a/arch/riscv/lib/Makefile
>> +++ b/arch/riscv/lib/Makefile
>> @@ -3,6 +3,7 @@ lib-y			+= delay.o
>>  lib-y			+= memcpy.o
>>  lib-y			+= memset.o
>>  lib-y			+= memmove.o
>> +lib-y			+= memcmp.o
>>  lib-$(CONFIG_MMU)	+= uaccess.o
>>  lib-$(CONFIG_64BIT)	+= tishift.o
>>
>> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
>> new file mode 100644
>> index 000000000000..eea5cc40e081
>> --- /dev/null
>> +++ b/arch/riscv/lib/memcmp.S
>> @@ -0,0 +1,58 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2022 zouyipeng@huawei.com
>> + */
>> +#include <linux/linkage.h>
>> +#include <asm-generic/export.h>
>> +#include <asm/asm.h>
>> +
>> +/*
>> + Input Arguments:
>> + a0: addr0
>> + a1: addr1
>> + a2: buffer size
>> +
>> + Output:
>> + a0: return value
>> +*/
>> +#define data0	a3
>> +#define data1	a4
>> +#define tmp	t3
>> +#define tail	t4
>> +
>> +/* load and compare */
>> +.macro LD_CMP op d0 d1 a0 a1 t1 offset
>> +	\op	\d0, 0(\a0)
>> +	\op	\d1, 0(\a1)
>> +	addi	\a0, \a0, \offset
>> +	addi	\a1, \a1, \offset
>> +	sub	\t1, \d0, \d1
>> +.endm
>> +
>> +ENTRY(memcmp)
>> +	/* test size aligned with SZREG */
>> +	andi	tmp, a2, SZREG - 1
>> +	/* load tail */
>> +	add	tail, a0, a2
>> +	sub	tail, tail, tmp
>> +	add	a2, a0, a2
>> +
>> +.LloopWord:
>> +	sltu	tmp, a0, tail
>> +	beqz	tmp, .LloopByte
>> +
>> +	LD_CMP	REG_L data0 data1 a0 a1 tmp SZREG
>> +	beqz	tmp, .LloopWord
>> +	j	.Lreturn
>> +
>> +.LloopByte:
>> +	sltu	tmp, a0, a2
>> +	beqz	tmp, .Lreturn
>> +
>> +	LD_CMP	lbu data0 data1 a0 a1 tmp 1
>> +	beqz	tmp, .LloopByte
>> +.Lreturn:
>> +	mv	a0, tmp
>> +	ret
>> +END(memcmp)
>> +EXPORT_SYMBOL(memcmp);
>> --
>> 2.17.1
>>
>
> Thanks, looks nice.

Ya, I think normally for performance improvements I'd want a bit more 
(like "what was this benchmarked on", for example).  This one seems 
straight-forward enough to just handle open-loop, though, so I think 
it's OK this time.  It's also something we could probably do in C, but 
IIUC we're meant to have arch-specific routines for these anyway and 
we're going to end up with assembly at some point so we might as well 
just start now.

It does trigger a bunch of build failures, though.  This fixes most of the
RISC-V related issues:

    diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
    index 3337b43d3803..33bd88e17469 100644
    --- a/arch/riscv/include/asm/string.h
    +++ b/arch/riscv/include/asm/string.h
    @@ -19,13 +19,15 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
     extern asmlinkage void *memmove(void *, const void *, size_t);
     extern asmlinkage void *__memmove(void *, const void *, size_t);
     #define __HAVE_ARCH_MEMCMP
    -extern int memcmp(const void *, const void *, size_t);
    +extern asmlinkage int memcmp(const void *, const void *, size_t);
    +extern asmlinkage int __memcmp(const void *, const void *, size_t);
     
     /* For those files which don't want to check by kasan. */
     #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
     #define memcpy(dst, src, len) __memcpy(dst, src, len)
     #define memset(s, c, n) __memset(s, c, n)
     #define memmove(dst, src, len) __memmove(dst, src, len)
    +#define memcmp(a, b, len) __memcmp(a, b, len)
     
     #ifndef __NO_FORTIFY
     #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
    diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
    index eea5cc40e081..518797cd6dda 100644
    --- a/arch/riscv/lib/memcmp.S
    +++ b/arch/riscv/lib/memcmp.S
    @@ -29,7 +29,8 @@
     	sub	\t1, \d0, \d1
     .endm
     
    -ENTRY(memcmp)
    +SYM_FUNC_START(__memcmp)
    +SYM_FUNC_START_WEAK(memcmp)
     	/* test size aligned with SZREG */
     	andi	tmp, a2, SZREG - 1
     	/* load tail */
    @@ -54,5 +55,9 @@ ENTRY(memcmp)
     .Lreturn:
     	mv	a0, tmp
     	ret
    -END(memcmp)
    -EXPORT_SYMBOL(memcmp);
    +
    +SYM_FUNC_END(memcmp)
    +SYM_FUNC_END(__memcmp)
    +
    +EXPORT_SYMBOL(memcmp)
    +EXPORT_SYMBOL(__memcmp)
    diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
    index dd58e1d99397..359227e392b2 100644
    --- a/arch/riscv/purgatory/Makefile
    +++ b/arch/riscv/purgatory/Makefile
    @@ -1,7 +1,7 @@
     # SPDX-License-Identifier: GPL-2.0
     OBJECT_FILES_NON_STANDARD := y
     
    -purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
    +purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o memcmp.o
     
     targets += $(purgatory-y)
     PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
    @@ -18,6 +18,9 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
     $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
     	$(call if_changed_rule,as_o_S)
     
    +$(obj)/memcmp.o: $(srctree)/arch/riscv/lib/memcmp.S FORCE
    +	$(call if_changed_rule,as_o_S)
    +
     $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
     	$(call if_changed_rule,cc_o_c)

but I'm still getting some EFI stub failures

    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt.stub.o: in function `__efistub_.L130':
    __efistub_fdt.c:(.init.text+0x686): undefined reference to `__efistub___memcmp'
    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L87':
    __efistub_fdt_ro.c:(.init.text+0x554): undefined reference to `__efistub___memcmp'
    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L108':
    __efistub_fdt_ro.c:(.init.text+0x6fc): undefined reference to `__efistub___memcmp'
    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L263':
    __efistub_fdt_ro.c:(.init.text+0xfe0): undefined reference to `__efistub___memcmp'
    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L284':
    __efistub_fdt_ro.c:(.init.text+0x10c0): undefined reference to `__efistub___memcmp'
    riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o:__efistub_fdt_ro.c:(.init.text+0x11fc): more undefined references to `__efistub___memcmp' follow
    riscv64-unknown-linux-gnu-ld: .tmp_vmlinux.kallsyms1: hidden symbol `__efistub___memcmp' isn't defined
    riscv64-unknown-linux-gnu-ld: final link failed: bad value
    make[2]: *** [/scratch/merges/ko-linux-next/linux/Makefile:1240: vmlinux] Error 1
    make[2]: Leaving directory '/scratch/merges/ko-linux-next/kernel/rv64gc/allyesconfig'

so I think it's going to be too late for 6.1 for this one, sorry!

>
> Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..3337b43d3803 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -18,6 +18,9 @@  extern asmlinkage void *__memcpy(void *, const void *, size_t);
 #define __HAVE_ARCH_MEMMOVE
 extern asmlinkage void *memmove(void *, const void *, size_t);
 extern asmlinkage void *__memmove(void *, const void *, size_t);
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+
 /* For those files which don't want to check by kasan. */
 #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..70773bf0c471 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,6 +3,7 @@  lib-y			+= delay.o
 lib-y			+= memcpy.o
 lib-y			+= memset.o
 lib-y			+= memmove.o
+lib-y			+= memcmp.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 
diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
new file mode 100644
index 000000000000..eea5cc40e081
--- /dev/null
+++ b/arch/riscv/lib/memcmp.S
@@ -0,0 +1,58 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 zouyipeng@huawei.com
+ */
+#include <linux/linkage.h>
+#include <asm-generic/export.h>
+#include <asm/asm.h>
+
+/*
+ Input Arguments:
+ a0: addr0
+ a1: addr1
+ a2: buffer size
+
+ Output:
+ a0: return value
+*/
+#define data0	a3
+#define data1	a4
+#define tmp	t3
+#define tail	t4
+
+/* load and compare */
+.macro LD_CMP op d0 d1 a0 a1 t1 offset
+	\op	\d0, 0(\a0)
+	\op	\d1, 0(\a1)
+	addi	\a0, \a0, \offset
+	addi	\a1, \a1, \offset
+	sub	\t1, \d0, \d1
+.endm
+
+ENTRY(memcmp)
+	/* test size aligned with SZREG */
+	andi	tmp, a2, SZREG - 1
+	/* load tail */
+	add	tail, a0, a2
+	sub	tail, tail, tmp
+	add	a2, a0, a2
+
+.LloopWord:
+	sltu	tmp, a0, tail
+	beqz	tmp, .LloopByte
+
+	LD_CMP	REG_L data0 data1 a0 a1 tmp SZREG
+	beqz	tmp, .LloopWord
+	j	.Lreturn
+
+.LloopByte:
+	sltu	tmp, a0, a2
+	beqz	tmp, .Lreturn
+
+	LD_CMP	lbu data0 data1 a0 a1 tmp 1
+	beqz	tmp, .LloopByte
+.Lreturn:
+	mv	a0, tmp
+	ret
+END(memcmp)
+EXPORT_SYMBOL(memcmp);