@@ -2155,6 +2155,9 @@ config ARM64_EPAN
if the cpu does not implement the feature.
endmenu # "ARMv8.7 architectural features"
+config AS_HAS_MOPS
+ def_bool $(as-instr,.arch_extension mops)
+
menu "ARMv8.9 architectural features"
config ARM64_POE
@@ -57,7 +57,7 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memcpy_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ cpyp [dst]!, [src]!, count!
+ cpym [dst]!, [src]!, count!
+ cpye [dst]!, [src]!, count!
+ ret
SYM_FUNC_END(__pi_memcpy)
+#else
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
@@ -26,6 +26,7 @@
*/
dstin .req x0
+val_x .req x1
val .req w1
count .req x2
tmp1 .req x3
@@ -42,7 +43,7 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memset_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ setp [dst]!, count!, val_x
+ setm [dst]!, count!, val_x
+ sete [dst]!, count!, val_x
+ ret
SYM_FUNC_END(__pi_memset)
+#else
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif
SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)
Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS instructions when implemented on the CPU. The CPY*/SET* instructions copy or set a block of memory of arbitrary size and alignment. They can be interrupted by the CPU and the copying resumed later. Their performance is expected to be close to the best generic copy/set sequence of loads/stores for a given CPU. Using them in the kernel's copy/set routines therefore avoids the need to periodically rewrite the routines to optimize for new microarchitectures. It could also lead to a performance improvement for some CPUs and systems. With this change the kernel will always use the instructions if they are implemented on the CPU (and have not been disabled by the arm64.nomops command line parameter). When not implemented the usual routines will be used (patched via alternatives). Note, we need to patch B/NOP instead of the whole sequence to avoid executing a partially patched sequence in case the compiler generates a mem*() call inside the alternatives patching code. Note that MOPS instructions have relaxed behavior on Device memory, but it is expected that these routines are not generally used on MMIO. Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as CPY* allows overlaps between the source and destination buffers, and despite contradicting the C standard, compilers require that memcpy() work on exactly overlapping source and destination: https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language https://reviews.llvm.org/D86993 Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com> --- arch/arm64/Kconfig | 3 +++ arch/arm64/lib/memcpy.S | 19 ++++++++++++++++++- arch/arm64/lib/memset.S | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-)