@@ -25,7 +25,7 @@ SYM_FUNC_START(__memset)
*
* Otherwise, use original memset function.
*/
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9
@@ -66,7 +66,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
ret
SYM_FUNC_END(memset_erms)
-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
movq %rdi,%r10
/* expand byte value */
@@ -77,64 +78,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
/* align dst */
movl %edi,%r9d
andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
+ jnz .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
movq %rdx,%rcx
shrq $6,%rcx
- jz .Lhandle_tail
+ jz .Lhandle_tail_\@
.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq %rcx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
+ \OP %rax,(%rdi)
+ \OP %rax,8(%rdi)
+ \OP %rax,16(%rdi)
+ \OP %rax,24(%rdi)
+ \OP %rax,32(%rdi)
+ \OP %rax,40(%rdi)
+ \OP %rax,48(%rdi)
+ \OP %rax,56(%rdi)
leaq 64(%rdi),%rdi
- jnz .Lloop_64
+ jnz .Lloop_64_\@
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl %edx,%ecx
andl $63&(~7),%ecx
- jz .Lhandle_7
+ jz .Lhandle_7_\@
shrl $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl %ecx
- movq %rax,(%rdi)
+ \OP %rax,(%rdi)
leaq 8(%rdi),%rdi
- jnz .Lloop_8
+ jnz .Lloop_8_\@
-.Lhandle_7:
+.Lhandle_7_\@:
andl $7,%edx
- jz .Lende
+ jz .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
- jnz .Lloop_1
+ jnz .Lloop_1_\@
-.Lende:
+.Lende_\@:
+ .if \fence
+ sfence
+ .endif
movq %r10,%rax
ret
-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
- jbe .Lhandle_7
+ jbe .Lhandle_7_\@
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
- jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+ jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-MEMSET_FN(memset_orig,
+MEMSET_FN(memset_movq,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")
@@ -11,3 +11,7 @@ MEMSET_FN(__memset,
MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_movnti,
+ "x86-64-movnt",
+ "movnt-based memset() in arch/x86/lib/memset_64.S")
Clone memset_movnti() from arch/x86/lib/memset_64.S. perf bench mem memset -f x86-64-movnt on Intel Icelake-X, AMD Milan: # Intel Icelake-X $ for i in 8 32 128 512; do perf bench mem memset -f x86-64-movnt -s ${i}MB -l 5 done # Output pruned. # Running 'mem/memset' benchmark: # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S) # Copying 8MB bytes ... 12.896170 GB/sec # Copying 32MB bytes ... 15.879065 GB/sec # Copying 128MB bytes ... 20.813214 GB/sec # Copying 512MB bytes ... 24.190817 GB/sec # AMD Milan $ for i in 8 32 128 512; do perf bench mem memset -f x86-64-movnt -s ${i}MB -l 5 done # Output pruned. # Running 'mem/memset' benchmark: # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S) # Copying 8MB bytes ... 22.372566 GB/sec # Copying 32MB bytes ... 22.507923 GB/sec # Copying 128MB bytes ... 22.492532 GB/sec # Copying 512MB bytes ... 22.434603 GB/sec Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com> --- tools/arch/x86/lib/memset_64.S | 68 +++++++++++--------- tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 +- 2 files changed, 43 insertions(+), 31 deletions(-)