From patchwork Mon Sep 30 16:10:50 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kristina Martsenko X-Patchwork-Id: 13816678 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.133]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 91E15CE8362 for ; Mon, 30 Sep 2024 16:19:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=lists.infradead.org; s=bombadil.20210309; h=Sender:List-Subscribe:List-Help :List-Post:List-Archive:List-Unsubscribe:List-Id:Content-Transfer-Encoding: MIME-Version:References:In-Reply-To:Message-Id:Date:Subject:Cc:To:From: Reply-To:Content-Type:Content-ID:Content-Description:Resent-Date:Resent-From: Resent-Sender:Resent-To:Resent-Cc:Resent-Message-ID:List-Owner; bh=+HPcylOsT4bQ4v7x9ZXvOB0TVxdy1H8E4ba1cIyopOQ=; b=HFrEOvmjYfDJDBl8Rfb8pqZiDK M7SrtWtjk2mJKskAeKvhHplahtGTKLQ8zzGXMhzr3Z725U37X6Sak8zMvZ/n3hNfhqGRf2WhKC8Qq iIZ5UfGGMOMlRZ9VDc+3NFUWlIVpJJ2gDbiS+IuzFtAxnEucDnA5Vc5dzk0QW1g5ZS5ut7I+Ny6AX CMTJzZGCZOQ2aRzzr8cR8eHAZZkvaTT7oPHMTevxg3EmFFOUlP9RfCUbJszuq2MF+VMH2UdWSzamy p71OXoVwi6KtKdQDGnXb+wE9jEX3vu7rBaALEk3OKFRluMspERr3PGdiEVAbQhUMQ2nXw3y8eU/cl kuFPpxeA==; Received: from localhost ([::1] helo=bombadil.infradead.org) by bombadil.infradead.org with esmtp (Exim 4.98 #2 (Red Hat Linux)) id 1svJ7A-00000000DKb-3x3P; Mon, 30 Sep 2024 16:19:12 +0000 Received: from foss.arm.com ([217.140.110.172]) by bombadil.infradead.org with esmtp (Exim 4.98 #2 (Red Hat Linux)) id 1svJ1L-00000000BrS-20AA for linux-arm-kernel@lists.infradead.org; Mon, 30 Sep 2024 16:13:13 +0000 Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 1E48F367; Mon, 30 Sep 
2024 09:13:40 -0700 (PDT) Received: from e126864.arm.com (usa-sjc-imap-foss1.foss.arm.com [10.121.207.14]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id F2B4E3F58B; Mon, 30 Sep 2024 09:13:08 -0700 (PDT) From: Kristina Martsenko To: linux-arm-kernel@lists.infradead.org Cc: Catalin Marinas , Will Deacon , Mark Rutland , Robin Murphy , Marc Zyngier Subject: [PATCH 4/5] arm64: lib: Use MOPS for memcpy() routines Date: Mon, 30 Sep 2024 17:10:50 +0100 Message-Id: <20240930161051.3777828-5-kristina.martsenko@arm.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20240930161051.3777828-1-kristina.martsenko@arm.com> References: <20240930161051.3777828-1-kristina.martsenko@arm.com> MIME-Version: 1.0 X-CRM114-Version: 20100106-BlameMichelson ( TRE 0.8.0 (BSD) ) MR-646709E3 X-CRM114-CacheID: sfid-20240930_091311_621131_E884A813 X-CRM114-Status: GOOD ( 17.17 ) X-BeenThere: linux-arm-kernel@lists.infradead.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: "linux-arm-kernel" Errors-To: linux-arm-kernel-bounces+linux-arm-kernel=archiver.kernel.org@lists.infradead.org Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS instructions when implemented on the CPU. The CPY*/SET* instructions copy or set a block of memory of arbitrary size and alignment. They can be interrupted by the CPU and the copying resumed later. Their performance is expected to be close to the best generic copy/set sequence of loads/stores for a given CPU. Using them in the kernel's copy/set routines therefore avoids the need to periodically rewrite the routines to optimize for new microarchitectures. It could also lead to a performance improvement for some CPUs and systems. With this change the kernel will always use the instructions if they are implemented on the CPU (and have not been disabled by the arm64.nomops command line parameter). 
When they are not implemented, the usual routines will be used (patched via alternatives). Note that we need to patch only a B/NOP instead of the whole sequence, to avoid executing a partially patched sequence in case the compiler generates a mem*() call inside the alternatives patching code. Note that MOPS instructions have relaxed behavior on Device memory, but it is expected that these routines are not generally used on MMIO. Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as CPY* allows overlaps between the source and destination buffers and, despite this contradicting the C standard, compilers require that memcpy() work on exactly overlapping source and destination: https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language https://reviews.llvm.org/D86993 Signed-off-by: Kristina Martsenko --- arch/arm64/Kconfig | 3 +++ arch/arm64/lib/memcpy.S | 19 ++++++++++++++++++- arch/arm64/lib/memset.S | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..d0fe90ea704d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2155,6 +2155,9 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +config AS_HAS_MOPS + def_bool $(as-instr,.arch_extension mops) + menu "ARMv8.9 architectural features" config ARM64_POE diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 4ab48d49c451..9b99106fb95f 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -57,7 +57,7 @@ The loop tail is handled by always copying 64 bytes from the end. 
*/ -SYM_FUNC_START(__pi_memcpy) +SYM_FUNC_START_LOCAL(__pi_memcpy_generic) add srcend, src, count add dstend, dstin, count cmp count, 128 @@ -238,7 +238,24 @@ L(copy64_from_start): stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] ret +SYM_FUNC_END(__pi_memcpy_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memcpy) +alternative_if_not ARM64_HAS_MOPS + b __pi_memcpy_generic /* Replaced by a NOP when ARM64_HAS_MOPS is set. */ +alternative_else_nop_endif + + mov dst, dstin /* Copy via dst; dstin itself is not modified below. */ + cpyp [dst]!, [src]!, count! /* CPY* rather than CPYF*: buffers may overlap. */ + cpym [dst]!, [src]!, count! + cpye [dst]!, [src]!, count! + ret SYM_FUNC_END(__pi_memcpy) +#else +SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic) +#endif SYM_FUNC_ALIAS(__memcpy, __pi_memcpy) EXPORT_SYMBOL(__memcpy) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index a5aebe82ad73..97157da65ec6 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -26,6 +26,7 @@ */ dstin .req x0 +val_x .req x1 val .req w1 count .req x2 tmp1 .req x3 @@ -42,7 +43,7 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 -SYM_FUNC_START(__pi_memset) +SYM_FUNC_START_LOCAL(__pi_memset_generic) mov dst, dstin /* Preserve return value. */ and A_lw, val, #255 orr A_lw, A_lw, A_lw, lsl #8 @@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset) ands count, count, zva_bits_x b.ne .Ltail_maybe_long ret +SYM_FUNC_END(__pi_memset_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memset) +alternative_if_not ARM64_HAS_MOPS + b __pi_memset_generic /* Replaced by a NOP when ARM64_HAS_MOPS is set. */ +alternative_else_nop_endif + + mov dst, dstin /* Preserve return value. */ + setp [dst]!, count!, val_x /* val_x is x1, the 64-bit alias of val (w1). */ + setm [dst]!, count!, val_x + sete [dst]!, count!, val_x + ret SYM_FUNC_END(__pi_memset) +#else +SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic) +#endif SYM_FUNC_ALIAS(__memset, __pi_memset) EXPORT_SYMBOL(__memset)