From patchwork Mon Nov 25 04:11:24 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884372
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id F29B828684;
	Mon, 25 Nov 2024 04:12:48 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507969; cv=none;
 b=mbKvgDIJaSLT0TEo0UTQil9IMX2/YV4pfHJMbEzc73kw6lCPOXJoFbKXVKYjOyCOQG+6hpiZzS4UUZc9sbs1PCJJaWTsljYn6g4OVardZLM2RMVOPzESlk6JVuzf5Y5vNvScuA2lK2Bn9QgYt7x/UkhfzyZPnPQ/0sn2PRcsAGg=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507969; c=relaxed/simple;
	bh=K2z+bqdPllLXnkXaOgcFZIyBICogbckaPeIOPNK1P4w=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=HObIuld3kjFxZMq2aIQpaZX0RXPBYr6FHcBcoMwKfQgEbFLEQpzVAP8Hzx484kZRlpxmarpC3qdwjx8Td0fENnjSxz4E9eqUjHWQpzAu8YqUG/X1b2I+a7ZXaHtoZ5Kb1ihNI+LOtz5GFRiJWbGWA32gP/qvf4AEC8uxGlkeoAg=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=hX0TcKRW; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="hX0TcKRW"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id A22FBC4CED6;
	Mon, 25 Nov 2024 04:12:48 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507968;
	bh=K2z+bqdPllLXnkXaOgcFZIyBICogbckaPeIOPNK1P4w=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=hX0TcKRWcodt4aHwM471fRVvfx2Jjr8M2xDXz3mc1WycQQ7Z6qO6siklsvHz9fCiz
	 VoS1nlMrDbi15r33GjKNJEVBejmIiBjYXEXjG/98SXFF+CXsW+furyoFrM16teclqs
	 vGbhsMsAEwwXJHzsJjNMfJ+Fcmx0qjP9XBZlYxmT88CqklIFZord4F66xv0H2pG6RI
	 JLHTsgJmLayr/KscDfwAl1IlQvtvoHjeavhDCkFal8p1xsgA5Uut49iDbpu7uaP88a
	 lB7D6vWZnJyDYPXwJeaAnqWK1o9hS8sIK7b/EkEyD9ylgqeRTTXbmXy+u0LdlCFtg0
	 /bv2a+bxak57w==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 1/6] x86: move zmm exclusion list into CPU feature flag
Date: Sun, 24 Nov 2024 20:11:24 -0800
Message-ID: <20241125041129.192999-2-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

Lift zmm_exclusion_list in aesni-intel_glue.c into the x86 CPU setup
code, and add a new x86 CPU feature flag X86_FEATURE_PREFER_YMM that is
set when the CPU is on this list.

This allows other code in arch/x86/, such as the CRC library code, to
apply the same exclusion list when deciding whether to execute 256-bit
or 512-bit optimized functions.

Note that full AVX512 support including zmm registers is still exposed
to userspace and is still supported for in-kernel use.  This flag just
indicates whether in-kernel code should prefer to use ymm registers.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/crypto/aesni-intel_glue.c | 22 +---------------------
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/kernel/cpu/intel.c        | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index fbf43482e1f5e..8e648abfb5ab8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1534,30 +1534,10 @@ DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
 DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
 		"generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
 		AES_GCM_KEY_AVX10_SIZE, 800);
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
 
-/*
- * This is a list of CPU models that are known to suffer from downclocking when
- * zmm registers (512-bit vectors) are used.  On these CPUs, the AES mode
- * implementations with zmm registers won't be used by default.  Implementations
- * with ymm registers (256-bit vectors) will be used by default instead.
- */
-static const struct x86_cpu_id zmm_exclusion_list[] = {
-	X86_MATCH_VFM(INTEL_SKYLAKE_X,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_X,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_D,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_L,		0),
-	X86_MATCH_VFM(INTEL_ICELAKE_NNPI,	0),
-	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	0),
-	X86_MATCH_VFM(INTEL_TIGERLAKE,		0),
-	/* Allow Rocket Lake and later, and Sapphire Rapids and later. */
-	/* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
-	{},
-};
-
 static int __init register_avx_algs(void)
 {
 	int err;
 
 	if (!boot_cpu_has(X86_FEATURE_AVX))
@@ -1598,11 +1578,11 @@ static int __init register_avx_algs(void)
 					 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
 					 aes_gcm_simdalgs_vaes_avx10_256);
 	if (err)
 		return err;
 
-	if (x86_match_cpu(zmm_exclusion_list)) {
+	if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
 		int i;
 
 		aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
 		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
 			aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 17b6590748c00..948bfa25ccc7b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -477,10 +477,11 @@
 #define X86_FEATURE_CLEAR_BHB_HW	(21*32+ 3) /* BHI_DIS_S HW control enabled */
 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
 #define X86_FEATURE_AMD_FAST_CPPC	(21*32 + 5) /* Fast CPPC */
 #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */
 #define X86_FEATURE_AMD_WORKLOAD_CLASS	(21*32 + 7) /* Workload Classification */
+#define X86_FEATURE_PREFER_YMM		(21*32 + 8) /* Avoid zmm registers due to downclocking */
 
 /*
  * BUG word(s)
  */
 #define X86_BUG(x)			(NCAPINTS*32 + (x))
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d1de300af1737..0beb44c4ac026 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -519,10 +519,29 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 
 	msr = this_cpu_read(msr_misc_features_shadow);
 	wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
 }
 
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * zmm registers (512-bit vectors) are used.  On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (ymm) code to 512-bit (zmm) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+	X86_MATCH_VFM(INTEL_SKYLAKE_X,		0),
+	X86_MATCH_VFM(INTEL_ICELAKE_X,		0),
+	X86_MATCH_VFM(INTEL_ICELAKE_D,		0),
+	X86_MATCH_VFM(INTEL_ICELAKE,		0),
+	X86_MATCH_VFM(INTEL_ICELAKE_L,		0),
+	X86_MATCH_VFM(INTEL_ICELAKE_NNPI,	0),
+	X86_MATCH_VFM(INTEL_TIGERLAKE_L,	0),
+	X86_MATCH_VFM(INTEL_TIGERLAKE,		0),
+	/* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+	{},
+};
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	early_init_intel(c);
 
 	intel_workarounds(c);
@@ -602,10 +621,13 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_P4);
 	if (c->x86 == 6)
 		set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 
+	if (x86_match_cpu(zmm_exclusion_list))
+		set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
+
 	/* Work around errata */
 	srat_detect_node(c);
 
 	init_ia32_feat_ctl(c);
 

From patchwork Mon Nov 25 04:11:25 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884374
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7999054765;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507969; cv=none;
 b=c5XvCRG++DGFjGXvQ73jjepvK/9zh9CZUsF5sSV0XHFKazVW0V+TFkaO866JZtexfuQr8cCJp5vbxh6uDzHaQZLUCe8DRODDF7BUCCntf2wLsl9N3Pr+fNOomDYWmU9Uheslp1u9U5GbiWO2t+ct7sAw2vKhT8mfd1JLYYp+tkw=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507969; c=relaxed/simple;
	bh=EKZoO79T9Nc9IFQIIGzuuvtCJUtnaaR0gNdJqjM6Yck=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=SNo+G25luzZlAyfFzfniOpREbxEk6kgwPbKedchCvvpBuWbrdteNTSaHfMIdZpeZASuwyFsolWAWESNNYgDt8vvsQpnCrY37TescayZ2VZxL9xSuToF12ncact3xsVH/sEwKkV3RjG2xjI9TMETNBgEfHpZyKWfYnO2WnPGeU3U=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=tM9T2hcG; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="tM9T2hcG"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id EA115C4CED9;
	Mon, 25 Nov 2024 04:12:48 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507969;
	bh=EKZoO79T9Nc9IFQIIGzuuvtCJUtnaaR0gNdJqjM6Yck=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=tM9T2hcGWIv1uzdjdgIFNDkTDoY/THlYhRbw1MFCv5O/nPJKk8EUgloSXCtIx+aHv
	 Unkg1xOLgHJ5dW+ghVqXo4Z2zqK/42vX75dOCa8mlnRyosakL9z2Bl0q+34kx6AN/3
	 h4LO1oCsM6LoM6mmY5HP8pRwBRiU6kTfeUvjooxh3JR3haXfEHKMNw2jPcpMf2SmRY
	 29oukIJW1b1TEACWzBal/ciTeOJryJ/e7ZsoJ0UfzvvPHbxDAOWFLLJoCJn8DPTfeJ
	 P3YoPU9hgYkUiRrsMd9c7PEp1R7g8LgFUO0iVcnKoICxElMepTHOZbcFcvsV3SoVZ/
	 hQInkhAY5v4Qg==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 2/6] scripts/crc: add gen-crc-consts.py
Date: Sun, 24 Nov 2024 20:11:25 -0800
Message-ID: <20241125041129.192999-3-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

Add a Python script that generates constants for computing the given CRC
variant(s) using x86's pclmulqdq or vpclmulqdq instructions.

It can also generate the traditional byte-at-a-time tables.

Only small changes should be needed for this script to also work to
generate the constants needed for CRC computation on other architectures
with a carryless multiplication instruction, such as arm64.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 scripts/crc/gen-crc-consts.py | 207 ++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100755 scripts/crc/gen-crc-consts.py

diff --git a/scripts/crc/gen-crc-consts.py b/scripts/crc/gen-crc-consts.py
new file mode 100755
index 0000000000000..84f0902e1cd7b
--- /dev/null
+++ b/scripts/crc/gen-crc-consts.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# Script that generates constants for computing the given CRC variant(s).
+#
+# Copyright 2024 Google LLC
+#
+# Author: Eric Biggers <ebiggers@google.com>
+
+import sys
+
+# XOR (add) an iterable of polynomials.
+def xor(iterable):
+    res = 0
+    for val in iterable:
+        res ^= val
+    return res
+
+# Multiply two polynomials.
+def clmul(a, b):
+    return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0)
+
+# Polynomial division floor(a / b).
+def div(a, b):
+    q = 0
+    while a.bit_length() >= b.bit_length():
+        q ^= 1 << (a.bit_length() - b.bit_length())
+        a ^= b << (a.bit_length() - b.bit_length())
+    return q
+
+# Reduce the polynomial 'a' modulo the polynomial 'b'.
+def reduce(a, b):
+    return a ^ clmul(div(a, b), b)
+
+# Pretty-print a polynomial.
+def pprint_poly(prefix, poly):
+    terms = ['1' if i == 0 else 'x' if i == 1 else f'x^{i}'
+             for i in reversed(range(poly.bit_length()))
+             if (poly & (1 << i)) != 0]
+    j = 0
+    while j < len(terms):
+        s = prefix + terms[j] + (' +' if j < len(terms) - 1 else '')
+        j += 1
+        while j < len(terms) and len(s) < 72:
+            s += ' ' + terms[j] + (' +' if j < len(terms) - 1 else '')
+            j += 1
+        print(s)
+        prefix = ' * ' + (' ' * (len(prefix) - 3))
+
+# Reverse the bits of a polynomial.
+def bitreverse(poly, num_bits):
+    return xor(1 << (num_bits - 1 - i) for i in range(num_bits)
+               if (poly & (1 << i)) != 0)
+
+# Format a polynomial as hex.  Bit-reflect it if the CRC is LSB-first.
+def fmt_poly(variant, poly, num_bits):
+    if variant.lsb:
+        poly = bitreverse(poly, num_bits)
+    return f'0x{poly:0{2*num_bits//8}x}'
+
+# Print a comment describing constants generated for the given CRC variant.
+def print_header(variant, what):
+    print('/*')
+    s = f'{"least" if variant.lsb else "most"}-significant-bit-first CRC-{variant.bits}'
+    print(f' * {what} generated for {s} using')
+    pprint_poly(' * G(x) = ', variant.G)
+    print(' */')
+
+# Print a polynomial as hex, but drop a term if needed to keep it in 64 bits.
+def print_poly_truncate65thbit(variant, poly, num_bits, desc):
+    if num_bits > 64:
+        assert num_bits == 65
+        if variant.lsb:
+            assert (poly & 1) != 0
+            poly >>= 1
+            desc += ' - 1'
+        else:
+            poly ^= 1 << 64
+            desc += ' - x^64'
+        num_bits = 64
+    print(f'\t\t{fmt_poly(variant, poly, num_bits)},\t/* {desc} */')
+
+class CrcVariant:
+    def __init__(self, bits, generator_poly, bit_order):
+        self.bits = bits
+        if bit_order not in ['lsb', 'msb']:
+            raise ValueError('Invalid value for bit_order')
+        self.lsb = bit_order == 'lsb'
+        self.name = f'crc{bits}_{bit_order}_0x{generator_poly:0{(2*bits+7)//8}x}'
+        if self.lsb:
+            generator_poly = bitreverse(generator_poly, bits)
+        self.G = generator_poly ^ (1 << bits)
+
+# Generate tables for byte-at-a-time CRC computation.
+def gen_sliceby1_tables(variants):
+    for v in variants:
+        print('')
+        print_header(v, 'CRC table')
+        print(f'static const u{v.bits} __maybe_unused {v.name}_table[256] = {{')
+        s = ''
+        for i in range(256):
+            remainder = (bitreverse(i, 8) if v.lsb else i) << (v.bits - 8)
+            for _ in range(8):
+                remainder <<= 1
+                if (remainder & (1 << v.bits)) != 0:
+                    remainder ^= v.G
+            next_entry = fmt_poly(v, remainder, v.bits) + ','
+            if len(s + next_entry) > 71:
+                print(f'\t{s}')
+                s = ''
+            s += (' ' if s else '') + next_entry
+        if s:
+            print(f'\t{s}')
+        print('};')
+
+# Generate constants for carryless multiplication based CRC computation.
+def gen_x86_pclmul_consts(variants):
+    # These are the distances, in bits, to generate folding constants for.
+    FOLD_DISTANCES = [2048, 1024, 512, 256, 128]
+
+    for v in variants:
+        print('')
+        print_header(v, 'CRC folding constants')
+        print('static const struct {')
+        if not v.lsb:
+            print('\tu8 bswap_mask[16];')
+        for i in FOLD_DISTANCES:
+            print(f'\tu64 fold_across_{i}_bits_consts[2];')
+        print('\tu8 shuf_table[48];')
+        print('\tu64 barrett_reduction_consts[2];')
+        if v.lsb and v.bits < 64:
+            print('\tu64 extract_crc_mask[2];')
+        print(f'}} {v.name}_consts __cacheline_aligned __maybe_unused = {{')
+
+        # Byte-reflection mask, needed for MSB CRCs
+        if not v.lsb:
+            print('\t.bswap_mask = {' + ', '.join(str(i) for i in reversed(range(16))) + '},')
+
+        # Fold constants for all distances down to 128 bits
+        k = v.bits - 65 if v.lsb else 0
+        for i in FOLD_DISTANCES:
+            print(f'\t.fold_across_{i}_bits_consts = {{')
+            for j in [64, 0] if v.lsb else [0, 64]:
+                const = reduce(1<<(i+j+k), v.G)
+                pow_desc = f'{i}{"+" if j >= 0 else "-"}{abs(j)}'
+                if k != 0:
+                    pow_desc += f'{"+" if k >= 0 else "-"}{abs(k)}'
+                print(f'\t\t{fmt_poly(v, const, v.bits)},\t/* x^({pow_desc}) mod G(x) */')
+            print('\t},')
+
+        # Shuffle table for handling 1..15 bytes at end
+        print('\t.shuf_table = {')
+        print('\t\t' + (16*'-1, ').rstrip())
+        print('\t\t' + ''.join(f'{i:2}, ' for i in range(16)).rstrip())
+        print('\t\t' + (16*'-1, ').rstrip())
+        print('\t},')
+
+        # Barrett reduction constants for reducing 128 bits to the final CRC
+        m = 63 if v.lsb else 64
+        print('\t.barrett_reduction_consts = {')
+        print_poly_truncate65thbit(v, div(1<<(m+v.bits), v.G), m+1,
+                                   f'floor(x^{m+v.bits} / G(x))')
+        print_poly_truncate65thbit(v, v.G, v.bits+1, 'G(x)')
+        print('\t},')
+        if v.lsb and v.bits < 64:
+            print(f'\t.extract_crc_mask = {{0, 0x{(1<<(v.bits))-1:x}}},')
+
+        print('};')
+
+def parse_crc_variants(vars_string):
+    variants = []
+    for var_string in vars_string.split(','):
+        bits, bit_order, generator_poly = var_string.split('_')
+        assert bits.startswith('crc')
+        bits = int(bits.removeprefix('crc'))
+        assert generator_poly.startswith('0x')
+        generator_poly = generator_poly.removeprefix('0x')
+        assert len(generator_poly) % 2 == 0
+        generator_poly = int(generator_poly, 16)
+        variants.append(CrcVariant(bits, generator_poly, bit_order))
+    return variants
+
+if len(sys.argv) != 3:
+    sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n')
+    sys.stderr.write('  CONSTS_TYPE can be sliceby1 or x86_pclmul\n')
+    sys.stderr.write('  CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n')
+    sys.stderr.write('     E.g. crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n')
+    sys.stderr.write('     Polynomial must use the given bit_order and exclude x^{num_bits}\n')
+    sys.exit(1)
+
+print('/* SPDX-License-Identifier: GPL-2.0-or-later */')
+print('/*')
+print(' * CRC constants generated by:')
+print(' *')
+print(f' *\t{sys.argv[0]} {" ".join(sys.argv[1:])}')
+print(' *')
+print(' * Do not edit manually.')
+print(' */')
+consts_types = sys.argv[1].split(',')
+variants = parse_crc_variants(sys.argv[2])
+for consts_type in consts_types:
+    if consts_type == 'sliceby1':
+        gen_sliceby1_tables(variants)
+    elif consts_type == 'x86_pclmul':
+        gen_x86_pclmul_consts(variants)
+    else:
+        raise ValueError(f'Unknown consts_type: {consts_type}')

From patchwork Mon Nov 25 04:11:26 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884375
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id C10DE84039;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507969; cv=none;
 b=ii60eINSlIWEF4V86lxRHDw2cRPXlFkj74CN4iW5u437WpiFTH9dfpv19JpJqlit2CHIY+gRKX8+NQnztDc6vBLPkitfC4HKHZQctZKBCvFQjru7juCPlzwh04I+s3d2xIBHdnN81ScbjP03Jgv7JG0gTu4kDPBvaEuPAIHBRQk=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507969; c=relaxed/simple;
	bh=vii0NZz5qfQ4U8lwiyzo1Zr5LJ0YQJ4B8xXF3umdvvI=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=Zx9T6UCsEl4OBYHh7ObQIYKgu5CrWVWw/hntYLBrFVaC0MS9mS+QqxnjKaESLXBMzacncwD2wioZfEP3EWPYlFqXtkl+mPEm7pJyZt2qW+lMYuc+AXQUHOxjAlKRufbx2QeXxPyvLrEYauNc2sSHClIdQMzklOWmeEshvxLYLDk=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=cQckXhSj; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="cQckXhSj"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 37EC8C4CED3;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507969;
	bh=vii0NZz5qfQ4U8lwiyzo1Zr5LJ0YQJ4B8xXF3umdvvI=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=cQckXhSjdUmiMEv2QqbMwQZvEhZfHrv4ZUVmAEThoGnzdWbqgj6CvZOiFl5bgf4Qb
	 VNqERf+enf+cXYXguXBhdrhb0gRMh+iZtPaotWEFOUn1KmlqT+SqcTs9EDjCv9CegR
	 HnpULlMES2DNHlHArWf505FP+cAAfyXuKqpm77BOhr29WchqAEPhPl2bUpT0yE7PoL
	 YkWAR7WOx2irkZYBiILoGLBJZqBrEbwPp/q1OeVqaSZ+VgOFgFBwtttLHiP+A22wD1
	 TqGVnsNog4Z/1XrNCXFt06rrIioPn6AH/tZyWcOWdeaNkR4pbhVPt0KcD1lxndJX1X
	 CTbDUdGHrYY8w==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 3/6] x86/crc: add "template" for [V]PCLMULQDQ based CRC
 functions
Date: Sun, 24 Nov 2024 20:11:26 -0800
Message-ID: <20241125041129.192999-4-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

The Linux kernel implements many variants of CRC, such as crc16,
crct10dif, crc32_le, crc32c, crc32_be, crc64_rocksoft, and crc64_be.  On
x86, except for crc32c which has special x86 scalar instructions, the
fastest way to compute any of these CRCs is to use the SIMD carryless
multiplication instructions PCLMULQDQ or VPCLMULQDQ.  Depending on the
available CPU features this can mean PCLMULQDQ+SSE4.1, VPCLMULQDQ+AVX2,
VPCLMULQDQ+AVX10/256, or VPCLMULQDQ+AVX10/512 (or the AVX512 equivalents
to AVX10/*).  This results in a total of 20+ CRC implementations being
potentially needed to properly optimize all CRCs that someone cares
about for x86.  Besides crc32c, currently only crc32_le and crct10dif
are actually optimized for x86, and they only use PCLMULQDQ, which means
they can be 2-4x slower than what is possible with VPCLMULQDQ.

Fortunately, at a high level the code that is needed for any
[V]PCLMULQDQ based CRC implementation is mostly the same.  Therefore,
this patch introduces an assembly macro that expands into the body of a
[V]PCLMULQDQ based CRC function for a given number of bits (8, 16, 32,
or 64), bit order (LSB or MSB-first), vector length, and AVX level.

The function expects to be passed a constants table, specific to the
polynomial desired, that was generated by the script previously added.
When two CRC variants share the same number of bits and bit order, the
same functions can be reused, with only the constants table differing.

A new C header crc-pclmul-template-glue.h is also added to make it easy
to integrate the new assembly code using a static call.

The result is that it becomes straightforward to wire up a fully
optimized implementation of any CRC-8, CRC-16, CRC-32, or CRC-64 for x86
(except for crc32c, which usually does a bit better with its specialized
instructions).  Later patches will wire up specific CRC variants.

Although this new template allows easily generating many functions, care
was taken to still keep the binary size fairly low.  Each generated
function is only ~520 bytes for LSB CRCs or ~660 bytes for MSB CRCs.

Note that a similar approach should also work for other architectures
that have carryless multiplication instructions, such as arm64.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/lib/crc-pclmul-template-glue.h |  84 ++++
 arch/x86/lib/crc-pclmul-template.S      | 588 ++++++++++++++++++++++++
 2 files changed, 672 insertions(+)
 create mode 100644 arch/x86/lib/crc-pclmul-template-glue.h
 create mode 100644 arch/x86/lib/crc-pclmul-template.S

diff --git a/arch/x86/lib/crc-pclmul-template-glue.h b/arch/x86/lib/crc-pclmul-template-glue.h
new file mode 100644
index 0000000000000..7a8f9175dcc54
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-template-glue.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
+ * instantiated by crc-pclmul-template.S
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#ifndef _CRC_PCLMUL_TEMPLATE_GLUE_H
+#define _CRC_PCLMUL_TEMPLATE_GLUE_H
+
+#include <asm/cpufeatures.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+
+#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)				\
+crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
+			  const void *consts_ptr);			\
+crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,		\
+			    const void *consts_ptr);			\
+crc_t prefix##_vpclmul_avx10_256(crc_t crc, const u8 *p, size_t len,	\
+				 const void *consts_ptr);		\
+crc_t prefix##_vpclmul_avx10_512(crc_t crc, const u8 *p, size_t len,	\
+				 const void *consts_ptr);		\
+									\
+DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
+
+#define INIT_CRC_PCLMUL(prefix)						\
+do {									\
+	if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) &&				\
+	    boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&			\
+	    boot_cpu_has(X86_FEATURE_AVX2) &&				\
+	    cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) {		\
+		if (boot_cpu_has(X86_FEATURE_AVX512BW) &&		\
+		    boot_cpu_has(X86_FEATURE_AVX512VL) &&		\
+		    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {	\
+			if (boot_cpu_has(X86_FEATURE_PREFER_YMM))	\
+				static_call_update(prefix##_pclmul,	\
+						   prefix##_vpclmul_avx10_256); \
+			else						\
+				static_call_update(prefix##_pclmul,	\
+						   prefix##_vpclmul_avx10_512); \
+		} else {						\
+			static_call_update(prefix##_pclmul,		\
+					   prefix##_vpclmul_avx2);	\
+		}							\
+	}								\
+} while (0)
+
+/*
+ * Call a [V]PCLMULQDQ optimized CRC function if SIMD is usable and the CPU has
+ * PCLMULQDQ support, and the length is not very small.
+ *
+ * The SIMD functions require len >= 16.  However, if the fallback
+ * implementation uses slice-by-8 instead of slice-by-1 (which makes it much
+ * faster, assuming the larger tables stay in dcache...), then roughly len >= 64
+ * is needed for the overhead of the kernel_fpu_{begin,end}() to be worth it.
+ *
+ * (64 is just a rough estimate.  The exact breakeven point varies by factors
+ * such as the CPU model; how many FPU sections are executed before returning to
+ * userspace, considering that only one XSAVE + XRSTOR pair is executed no
+ * matter how many FPU sections there are; whether the userspace thread used ymm
+ * or zmm registers which makes the XSAVE + XRSTOR more expensive; and whether
+ * the thread is a kernel thread, which never needs the XSAVE + XRSTOR.)
+ */
+#define CRC_PCLMUL(crc, p, len, prefix, consts,				\
+		   have_pclmulqdq, is_fallback_sliced)			\
+do {									\
+	if ((len) >= ((is_fallback_sliced) ? 64 : 16) &&		\
+	    static_branch_likely(&(have_pclmulqdq)) &&			\
+	    crypto_simd_usable()) {					\
+		const void *consts_ptr;					\
+									\
+		consts_ptr = (consts).fold_across_128_bits_consts;	\
+		kernel_fpu_begin();					\
+		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
+						   consts_ptr);		\
+		kernel_fpu_end();					\
+		return crc;						\
+	}								\
+} while (0)
+
+#endif /* _CRC_PCLMUL_TEMPLATE_GLUE_H */
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
new file mode 100644
index 0000000000000..54c6b74b7729b
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-template.S
@@ -0,0 +1,588 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// Template to generate [V]PCLMULQDQ-based CRC functions for x86
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+
+#include <linux/linkage.h>
+
+// Offsets within the generated constants table
+.set OFFSETOF_BSWAP_MASK,			-5*16	// only used for MSB CRC
+.set OFFSETOF_FOLD_ACROSS_2048BIT_CONSTS,	-4*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_1024BIT_CONSTS,	-3*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_512BIT_CONSTS,	-2*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_256BIT_CONSTS,	-1*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_128BIT_CONSTS,	0*16	// must be 0 offset
+.set OFFSETOF_SHUF_TABLE,			1*16
+.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16
+.set OFFSETOF_EXTRACT_CRC_MASK,			5*16	// only used for LSB CRC
+
+// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
+// corresponding non-VEX instruction plus any needed moves.  \insn gives the
+// instruction without a "v" prefix and including any immediate argument, and
+// \arg1-\arg3 give up to three non-immediate arguments as expected by the
+// VEX-coded form of the instruction.  If \arg1 is an unaligned mem operand,
+// \unaligned_mem_tmp must be specified as a temporary register.  If \arg3 is
+// given and \arg2 != \arg3, then it is required that \arg1 != \arg3.
+.macro	_cond_vex	insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
+.if AVX_LEVEL == 0
+  // VEX not allowed.  Emulate it.
+  .ifnb \arg3 // Three arguments
+    .ifc "\arg2", "\arg3" // arg2 == arg3?
+      .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	\insn		\unaligned_mem_tmp, \arg3
+      .else
+	\insn		\arg1, \arg3
+      .endif
+    .else // arg2 != arg3
+      .ifc "\arg1", "\arg3"
+	.error "Can't have arg1 == arg3 when arg2 != arg3"
+      .endif
+      .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	movdqa		\arg2, \arg3
+	\insn		\unaligned_mem_tmp, \arg3
+      .else
+	movdqa		\arg2, \arg3
+	\insn		\arg1, \arg3
+      .endif
+    .endif
+  .else // Only two arguments
+    .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	\insn		\unaligned_mem_tmp, \arg2
+    .else
+	\insn		\arg1, \arg2
+    .endif
+  .endif
+.else
+  // VEX is allowed.  Emit the desired instruction directly.
+  .ifnb \arg3
+	v\insn		\arg1, \arg2, \arg3
+  .else
+	v\insn		\arg1, \arg2
+  .endif
+.endif
+.endm
+
+// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
+// register of length VL.
+.macro	_vbroadcast	src, dst
+.if VL == 16
+	_cond_vex movdqa,	\src, \dst
+.elseif VL == 32
+	vbroadcasti128		\src, \dst
+.else
+	vbroadcasti32x4		\src, \dst
+.endif
+.endm
+
+// Load bytes from the unaligned mem operand \src into \dst, and if the CRC is
+// MSB-first use \bswap_mask to reflect the bytes within each 128-bit lane.
+.macro	_load_data	src, bswap_mask, dst
+.if VL < 64
+	_cond_vex movdqu,	"\src", \dst
+.else
+	vmovdqu8		\src, \dst
+.endif
+.if !LSB_CRC
+	_cond_vex pshufb,	\bswap_mask, \dst, \dst
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc.  \data can be an
+// unaligned mem operand if using VEX is allowed and the CRC is LSB-first so no
+// byte-reflection is needed; otherwise it must be a vector register.  \consts
+// is a vector register containing the needed fold constants, and \tmp is a
+// temporary vector register.  All arguments must be the same length.
+.macro	_fold_vec	acc, data, consts, tmp
+	_cond_vex "pclmulqdq $0x00,",	\consts, \acc, \tmp
+	_cond_vex "pclmulqdq $0x11,",	\consts, \acc, \acc
+.if AVX_LEVEL < 10
+	_cond_vex pxor,	\data, \tmp, \tmp
+	_cond_vex pxor,	\tmp, \acc, \acc
+.else
+	vpternlogq	$0x96, \data, \tmp, \acc
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc.  \data is an
+// unaligned mem operand, \consts is a vector register containing the needed
+// fold constants, \bswap_mask is a vector register containing the
+// byte-reflection table if the CRC is MSB-first and \tmp1 and \tmp2 are
+// temporary vector registers.  All arguments must be the same length.
+.macro	_fold_vec_mem	acc, data, consts, bswap_mask, tmp1, tmp2
+.if AVX_LEVEL == 0 || !LSB_CRC
+	_load_data	\data, \bswap_mask, \tmp1
+	_fold_vec	\acc, \tmp1, \consts, \tmp2
+.else
+	_fold_vec	\acc, \data, \consts, \tmp1
+.endif
+.endm
+
+// Load the constants for folding across 2**i vectors of length VL at a time
+// into all 128-bit lanes of the vector register CONSTS.
+.macro	_load_vec_folding_consts	i
+	_vbroadcast OFFSETOF_FOLD_ACROSS_128BIT_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
+		    CONSTS
+.endm
+
+// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
+// the result back into \v0.  If the remaining length mod \vl is nonzero, also
+// fold \vl bytes from (BUF).  For both operations the fold distance is \vl.
+// \consts must be a register of length \vl containing the fold constants.
+.macro	_fold_vec_final	vl, v0, v1, consts, bswap_mask, tmp1, tmp2
+	_fold_vec	\v0, \v1, \consts, \tmp1
+	test		$\vl, LEN8
+	jz		.Lfold_vec_final_done\@
+	_fold_vec_mem	\v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
+	add		$\vl, BUF
+.Lfold_vec_final_done\@:
+.endm
+
+// This macro generates the body of a CRC function with the following prototype:
+//
+// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
+//
+// |crc| is the initial CRC.  |buf| is the data to checksum.  |len| is the data
+// length in bytes, which must be at least \vl if \vl is 16 or 32, or at least
+// 4*\vl if \vl is 64.  |consts| is a pointer to the fold_across_128_bits_consts
+// field of the constants table that was generated for the chosen CRC variant.
+// crc_t is the smallest unsigned integer data type that can hold a CRC of
+// length \crc_bits, e.g. u32 for a CRC-32.
+//
+// Moving onto the macro parameters, \crc_bits is the number of bits in the CRC,
+// e.g. 32 for a CRC-32.  Currently the supported values are 8, 16, 32, and 64.
+// If the file is compiled in i386 mode, values above 32 are unsupported.
+//
+// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
+// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0.  \lsb_crc is 0
+// if the CRC processes the most significant bit of each byte first, i.e. maps
+// bit0 to x^0, bit1 to x^1, bit7 to x^7.
+//
+// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
+//
+// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
+// 10 for AVX10 or AVX512.
+//
+// If \vl == 16 && \avx_level == 0, the generated code requires:
+// PCLMULQDQ && SSE4.1.  (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
+//
+// If \vl == 32 && \avx_level == 2, the generated code requires:
+// VPCLMULQDQ && AVX2.
+//
+// If \vl == 32 && \avx_level == 10, the generated code requires:
+// VPCLMULQDQ && (AVX10/256 || (AVX512BW && AVX512VL))
+//
+// If \vl == 64 && \avx_level == 10, the generated code requires:
+// VPCLMULQDQ && (AVX10/512 || (AVX512BW && AVX512VL))
+//
+// Other \vl and \avx_level combinations are either not supported or not useful.
+.macro	_crc_pclmul	crc_bits, lsb_crc, vl, avx_level
+	.set	LSB_CRC,	\lsb_crc
+	.set	VL,		\vl
+	.set	AVX_LEVEL,	\avx_level
+
+	// Define aliases for the xmm, ymm, or zmm registers according to VL.
+.irp i, 0,1,2,3,4,5,6,7
+  .if VL == 16
+	.set	V\i,		%xmm\i
+	.set	LOG2_VL,	4
+  .elseif VL == 32
+	.set	V\i,		%ymm\i
+	.set	LOG2_VL,	5
+  .elseif VL == 64
+	.set	V\i,		%zmm\i
+	.set	LOG2_VL,	6
+  .else
+	.error "Unsupported vector length"
+  .endif
+.endr
+	// Define aliases for the function parameters.
+#ifdef __x86_64__
+	.set	CRC64,		%rdi
+	.set	CRC32,		%edi
+	.set	CRC16,		%di
+	.set	CRC8,		%dil
+	.set	BUF,		%rsi
+	.set	LEN,		%rdx
+	.set	LEN32,		%edx
+	.set	LEN8,		%dl
+	.set	CONSTS_PTR,	%rcx
+#else
+	// 32-bit support, assuming -mregparm=3 and not including support for
+	// CRC-64 (which would use both eax and edx to pass the crc parameter).
+	.set	CRC32,		%eax
+	.set	CRC16,		%ax
+	.set	CRC8,		%al
+	.set	BUF,		%edx
+	.set	LEN,		%ecx
+	.set	LEN32,		%ecx
+	.set	LEN8,		%cl
+	.set	CONSTS_PTR,	%ebx	// Passed on stack
+#endif
+
+	// Define aliases for some local variables.  V0-V5 are used without
+	// aliases (for accumulators, data, temporary values, etc).  Staying
+	// within the first 8 vector registers keeps the code 32-bit SSE
+	// compatible and reduces the size of 64-bit SSE code slightly.
+	.set	BSWAP_MASK,	V6
+	.set	BSWAP_MASK_YMM,	%ymm6
+	.set	BSWAP_MASK_XMM,	%xmm6
+	.set	CONSTS,		V7
+	.set	CONSTS_YMM,	%ymm7
+	.set	CONSTS_XMM,	%xmm7
+
+#ifdef __i386__
+	push		CONSTS_PTR
+	mov		8(%esp), CONSTS_PTR
+#endif
+
+	// Zero-extend the initial CRC if it is shorter than 32 bits.
+.if \crc_bits <= 8
+	movzbl		CRC8, CRC32
+.elseif \crc_bits <= 16
+	movzwl		CRC16, CRC32
+.endif
+
+	// Load the first vector of data and XOR the initial CRC into the end of
+	// it that represents the high-order polynomial coefficients.
+.if LSB_CRC
+  .if \crc_bits <= 32
+	_cond_vex movd,	CRC32, %xmm0
+  .else
+	_cond_vex movq,	CRC64, %xmm0
+  .endif
+  .if VL < 64
+	_cond_vex pxor,	0*VL(BUF), V0, V0, unaligned_mem_tmp=V1
+  .else
+	vpxord		0*VL(BUF), V0, V0
+  .endif
+.else
+	_vbroadcast	OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
+	_load_data	0*VL(BUF), BSWAP_MASK, V0
+	_cond_vex pxor,	%xmm1, %xmm1, %xmm1
+  .if \crc_bits == 8
+	_cond_vex "pinsrb $15,", CRC32, %xmm1, %xmm1
+  .elseif \crc_bits == 16
+	_cond_vex "pinsrw $7,", CRC32, %xmm1, %xmm1
+  .elseif \crc_bits == 32
+	_cond_vex "pinsrd $3,", CRC32, %xmm1, %xmm1
+  .elseif \crc_bits == 64
+	_cond_vex "pinsrq $1,", CRC64, %xmm1, %xmm1
+  .else
+	.error "Unsupported crc_bits: \crc_bits"
+  .endif
+  .if VL < 64
+	_cond_vex pxor,	V1, V0, V0
+  .else
+	vpxord		V1, V0, V0
+  .endif
+.endif
+
+	// Handle VL <= LEN < 4*VL, unless VL=64 in which case the function is
+	// only called for LEN >= 4*VL.
+.if VL != 64
+	cmp		$4*VL-1, LEN
+	ja		.Lfold_4vecs_prepare\@
+
+	add		$VL, BUF
+	cmp		$2*VL-1, LEN32
+	jbe		.Lless_than_2vecs\@
+	_load_data	(BUF), BSWAP_MASK, V1
+	add		$VL, BUF
+	jmp		.Lreduce_2vecs_to_1\@
+.Lless_than_2vecs\@:
+.if VL == 16
+	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128BIT_CONSTS(CONSTS_PTR), CONSTS_XMM
+.endif // Else, the reduction from ymm to xmm will load it.
+	jmp		.Lreduce_1vec_to_128bits\@
+.endif
+
+.Lfold_4vecs_prepare\@:
+	// Load 3 more vectors of data.
+	_load_data	1*VL(BUF), BSWAP_MASK, V1
+	_load_data	2*VL(BUF), BSWAP_MASK, V2
+	_load_data	3*VL(BUF), BSWAP_MASK, V3
+	sub		$-4*VL, BUF	// Shorter than 'add 4*VL' when VL=32
+	add		$-4*VL, LEN	// Shorter than 'sub 4*VL' when VL=32
+
+	// While >= 4 vectors of data remain, fold the 4 vectors V0-V3 into the
+	// next 4 vectors of data and write the result back to V0-V3.
+	cmp		$4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
+	jbe		.Lreduce_4vecs_to_2\@
+	_load_vec_folding_consts	2
+.Lfold_4vecs_loop\@:
+	_fold_vec_mem	V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	sub		$-4*VL, BUF
+	add		$-4*VL, LEN
+	cmp		$4*VL-1, LEN
+	ja		.Lfold_4vecs_loop\@
+
+	// Fold V0,V1 into V2,V3 and write the result back to V0,V1.
+	// Then fold two vectors of data, if at least that much remains.
+.Lreduce_4vecs_to_2\@:
+	_load_vec_folding_consts	1
+	_fold_vec	V0, V2, CONSTS, V4
+	_fold_vec	V1, V3, CONSTS, V4
+	test		$2*VL, LEN8
+	jz		.Lreduce_2vecs_to_1\@
+	_fold_vec_mem	V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	sub		$-2*VL, BUF
+
+	// Fold V0 into V1 and write the result back to V0.
+	// Then fold one vector of data, if at least that much remains.
+.Lreduce_2vecs_to_1\@:
+	_load_vec_folding_consts	0
+	_fold_vec_final	VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
+
+.Lreduce_1vec_to_128bits\@:
+	// Reduce V0 to 128 bits xmm0.
+.if VL == 64
+	// zmm0 => ymm0
+	vbroadcasti128	OFFSETOF_FOLD_ACROSS_256BIT_CONSTS(CONSTS_PTR), CONSTS_YMM
+	vextracti64x4	$1, %zmm0, %ymm1
+	_fold_vec_final	32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
+.endif
+.if VL >= 32
+	// ymm0 => xmm0
+	vmovdqa		OFFSETOF_FOLD_ACROSS_128BIT_CONSTS(CONSTS_PTR), CONSTS_XMM
+	vextracti128	$1, %ymm0, %xmm1
+	_fold_vec_final	16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
+.endif
+
+	and		$15, LEN32
+	jz		.Lpartial_block_done\@
+
+	// 1 <= LEN <= 15 data bytes remain.  The polynomial is now
+	// A*(x^(8*LEN)) + B, where A = xmm0 and B is the polynomial of the
+	// remaining LEN bytes.  To reduce this to 128 bits without needing fold
+	// constants for each possible LEN, rearrange this expression into
+	// C1*(x^128) + C2, where C1 = floor(A / x^(128 - 8*LEN)) and
+	// C2 = A*x^(8*LEN) + B mod x^128.  Then fold C1 into C2, which is just
+	// another fold across 128 bits.
+
+	// Load the last 16 data bytes.
+.if LSB_CRC
+	_load_data	"-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
+.else
+	_load_data	"-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm1
+.endif
+
+	// tmp = A*x^(8*LEN) mod x^128
+.if LSB_CRC
+	// pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1];
+	// i.e. right-shift by LEN bytes
+	_cond_vex movdqu,	"OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
+	_cond_vex pshufb,	%xmm3, %xmm0, %xmm1
+.else
+	// pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN];
+	// i.e. left-shift by LEN bytes
+	neg		LEN
+	_cond_vex pshufb,	"OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", \
+				%xmm0, %xmm2, unaligned_mem_tmp=%xmm4
+.endif
+
+	// C1 = floor(A / x^(128 - 8*LEN))
+.if LSB_CRC
+	// pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1];
+	// i.e. left-shift by 16-LEN bytes
+	_cond_vex pshufb,	"OFFSETOF_SHUF_TABLE+0(CONSTS_PTR,LEN)", \
+				%xmm0, %xmm0, unaligned_mem_tmp=%xmm4
+.else
+	// pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1];
+	// i.e. right-shift by 16-LEN bytes
+	_cond_vex movdqu,	"OFFSETOF_SHUF_TABLE+32(CONSTS_PTR,LEN)", \
+				%xmm3
+	_cond_vex pshufb,	%xmm3, %xmm0, %xmm0
+.endif
+
+	// C2 = tmp + B
+	// LSB CRC: blend 1=B,0=tmp by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
+	// MSB CRC: blend 1=tmp,0=B by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
+.if AVX_LEVEL == 0
+	movdqa		%xmm0, %xmm4
+	movdqa		%xmm3, %xmm0
+	pblendvb	%xmm2, %xmm1	// uses %xmm0 as implicit operand
+	movdqa		%xmm4, %xmm0
+.else
+	vpblendvb	%xmm3, %xmm2, %xmm1, %xmm1
+.endif
+
+	// Fold C1 into C2 and store the result in xmm0.
+	_fold_vec	%xmm0, %xmm1, CONSTS_XMM, %xmm4
+
+.Lpartial_block_done\@:
+	// Generate the final n-bit CRC from the 128-bit xmm0 = A as follows:
+	//
+	//	crc = x^n * A mod G
+	//	    = x^n * (x^64*A_H + A_L) mod G
+	//	    = x^n * (x^(64-n)*(x^n*A_H mod G) + A_L) mod G
+	//
+	// I.e.:
+	//	crc := 0
+	//	crc := x^n * (x^(64-n)*crc + A_H) mod G
+	//	crc := x^n * (x^(64-n)*crc + A_L) mod G
+	//
+	// A_H and A_L denote the high and low 64 polynomial coefficients in A.
+	//
+	// Using Barrett reduction to do the 'mod G', this becomes:
+	//
+	//	crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
+	//	A_L := x^(64-n)*crc + A_L
+	//	crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+	//
+	// 'm' must be an integer >= 63 (the max degree of A_L and A_H) for
+	// sufficient precision to be carried through the calculation.  For
+	// an LSB-first CRC we use m == 63, which results in floor(x^(m+n) / G)
+	// being 64-bit which is the most pclmulqdq can accept.  The
+	// multiplication with floor(x^(63+n) / G) then produces a 127-bit
+	// product, and the floored division by x^63 just takes the first qword.
+	// For an MSB-first CRC, we would instead need to take the high 64 bits
+	// of a 127-bit product which is inconvenient, so we use m == 64 in that
+	// case instead and handle multiplying by a 65-bit floor(x^(64+n) / G).
+
+	_cond_vex movdqa,	OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), \
+				CONSTS_XMM
+
+	// crc := floor((A_H * floor(x^(m+n) / G)) / x^m) * G mod x^n
+.if LSB_CRC
+	// tmp := floor((A_H * floor(x^(63+n) / G)) / x^63)
+	_cond_vex "pclmulqdq $0x00,",	CONSTS_XMM, %xmm0, %xmm1
+	// tmp is in bits [0:64) of xmm1.
+
+	// crc := tmp * G mod x^n
+  .if \crc_bits == 64
+	// Multiply by 65-bit G.
+	_cond_vex "pclmulqdq $0x10,",	CONSTS_XMM, %xmm1, %xmm2
+	_cond_vex "pshufd $0x4e,",	%xmm1, %xmm1
+	_cond_vex pxor,			%xmm2, %xmm1, %xmm1
+  .else
+	// Multiply by 64-bit or less G.
+	_cond_vex "pclmulqdq $0x10,",	CONSTS_XMM, %xmm1, %xmm1
+  .endif
+	// crc is in bits [64:64+n) of xmm1.
+
+.else
+	// tmp := floor((A_H * floor(x^(64+n) / G)) / x^64)
+	// In this case the constant multiplicand is 65-bit.
+	_cond_vex "pclmulqdq $0x01,",	CONSTS_XMM, %xmm0, %xmm1
+	_cond_vex pxor,			%xmm0, %xmm1, %xmm1
+	// tmp is in bits [64:128) of xmm1.
+
+	// crc := tmp * G mod x^n
+	_cond_vex "pclmulqdq $0x11,",	CONSTS_XMM, %xmm1, %xmm1
+	// crc is in bits [0:n) of xmm1.
+.endif
+
+	// A_L := x^(64-n)*crc + A_L
+.if LSB_CRC
+	// For LSB CRCs, crc is already aligned to add (XOR) it directly to A_L.
+	// If \crc_bits < 64, it needs to be selected using a mask.
+  .if \crc_bits == 64
+	_cond_vex pxor,	%xmm1, %xmm0, %xmm0
+  .elseif AVX_LEVEL >= 10
+	vpternlogq	$0x78, OFFSETOF_EXTRACT_CRC_MASK(CONSTS_PTR), %xmm1, %xmm0
+  .else
+	_cond_vex pand,	OFFSETOF_EXTRACT_CRC_MASK(CONSTS_PTR), %xmm1, %xmm1
+	_cond_vex pxor,	%xmm1, %xmm0, %xmm0
+  .endif
+.else
+  .if \crc_bits != 64
+	_cond_vex psllq,	$64-\crc_bits, %xmm1, %xmm1
+  .endif
+	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
+.endif
+
+	// crc := floor((A_L * floor(x^(m+n) / G)) / x^m) * G mod x^n
+	//
+	// Same as previous but uses the low-order 64 coefficients of A.
+.if LSB_CRC
+	_cond_vex "pclmulqdq $0x01,",	CONSTS_XMM, %xmm0, %xmm0
+  .if \crc_bits == 64
+	_cond_vex "pclmulqdq $0x10,",	CONSTS_XMM, %xmm0, %xmm1
+	_cond_vex "pshufd $0x4e,",	%xmm0, %xmm0
+	_cond_vex pxor,			%xmm1, %xmm0, %xmm0
+  .else
+	_cond_vex "pclmulqdq $0x10,",	CONSTS_XMM, %xmm0, %xmm0
+  .endif
+.else
+	_cond_vex "pclmulqdq $0x00,",	CONSTS_XMM, %xmm0, %xmm1
+	_cond_vex "pshufd $0x4e,",	%xmm0, %xmm0
+	_cond_vex pxor,			%xmm1, %xmm0, %xmm0
+	_cond_vex "pclmulqdq $0x11,",	CONSTS_XMM, %xmm0, %xmm0
+.endif
+
+.if LSB_CRC
+	// Extract the CRC from bits [64:64+n) of xmm0.
+  .if \crc_bits == 8
+	_cond_vex "pextrb $8,", %xmm0, %eax
+  .elseif \crc_bits == 16
+	_cond_vex "pextrw $4,", %xmm0, %eax
+  .elseif \crc_bits == 32
+	_cond_vex "pextrd $2,", %xmm0, %eax
+  .elseif \crc_bits == 64
+	_cond_vex "pextrq $1,", %xmm0, %rax
+  .else
+	.error "Unsupported crc_bits: \crc_bits"
+  .endif
+.else
+	// Extract the CRC from bits [0:n) of xmm0.
+  .if \crc_bits == 8
+	_cond_vex "pextrb $0,", %xmm0, %eax
+  .elseif \crc_bits == 16
+	_cond_vex "pextrw $0,", %xmm0, %eax
+  .elseif \crc_bits == 32
+	_cond_vex movd,		%xmm0, %eax
+  .elseif \crc_bits == 64
+	_cond_vex movq,		%xmm0, %rax
+  .else
+	.error "Unsupported crc_bits: \crc_bits"
+  .endif
+.endif
+
+.if VL > 16
+	vzeroupper	// Needed when ymm or zmm registers were used.
+.endif
+#ifdef __i386__
+	pop		CONSTS_PTR
+#endif
+	RET
+.endm
+
+#ifdef CONFIG_AS_VPCLMULQDQ
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
+SYM_FUNC_START(prefix##_pclmul_sse);					\
+	_crc_pclmul	crc_bits=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
+SYM_FUNC_END(prefix##_pclmul_sse);					\
+									\
+SYM_FUNC_START(prefix##_vpclmul_avx2);					\
+	cmp		$32, LEN;					\
+	jb		prefix##_pclmul_sse;				\
+	_crc_pclmul	crc_bits=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
+SYM_FUNC_END(prefix##_vpclmul_avx2);					\
+									\
+SYM_FUNC_START(prefix##_vpclmul_avx10_256);				\
+	cmp		$32, LEN;					\
+	jb		prefix##_pclmul_sse;				\
+	_crc_pclmul	crc_bits=bits, lsb_crc=lsb, vl=32, avx_level=10;\
+SYM_FUNC_END(prefix##_vpclmul_avx10_256);				\
+									\
+SYM_FUNC_START(prefix##_vpclmul_avx10_512);				\
+	cmp		$256, LEN;					\
+	jb		prefix##_vpclmul_avx10_256;			\
+	_crc_pclmul	crc_bits=bits, lsb_crc=lsb, vl=64, avx_level=10;\
+SYM_FUNC_END(prefix##_vpclmul_avx10_512);
+#else
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
+SYM_FUNC_START(prefix##_pclmul_sse);					\
+	_crc_pclmul	crc_bits=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
+SYM_FUNC_END(prefix##_pclmul_sse);
+#endif // !CONFIG_AS_VPCLMULQDQ

From patchwork Mon Nov 25 04:11:27 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884376
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id CCE1685270;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507969; cv=none;
 b=mSnnm6g2TmZ1rrCsDpQX3qaSicYphPwFnXCynuPbop30m+KiyYsDKJfd9TVQrQxG2HigUtk73ETNqR/wdN0wHjy2Wk4On98E0aiiqqCRw7ZImmeueRV9fGNpuQRRBr4723bcv1FjpkfQxmfl020SfWjVn8dE8Nv721vXHHJT4uY=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507969; c=relaxed/simple;
	bh=75I+ZyoNc9JYHHqQuax90nCiaXQqm5LrwpPROmSUBDs=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=T6XcA56+4N1XHqs5Xh3GdgC1Uz+wKRt/1Dm8PZPYxcTU8clMT137E3tUj33p/SNZ6Q89efX/KOdi6z5pYmoXS6lsOJvetU4psYs2qPhIxb9mBK6ImwAwfxo/HAUdjRCNGgKlvxCycEPkBzIAmUouHB/VnkqwW9dKoHKyNYYqWE0=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=DESdya/i; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="DESdya/i"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7D7BBC4CED8;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507969;
	bh=75I+ZyoNc9JYHHqQuax90nCiaXQqm5LrwpPROmSUBDs=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=DESdya/iPQjek2XgyFBguzF6I0CGpC9l7lSwPcXttQhPbRaLgddnHGnuT//rPAXqY
	 v0C59fz9tWt5RKoi5UgXXEjByhR1OLsmftPa1N3JvpRzp7ay1cispW2Eb15unTKhcR
	 vhAomEighAsEvqtZ6I/l4FOYaq96mKTJ2x7qJqLbdOCALz/TQ3A1BXOlVyHofM7VOE
	 RzHoS2cn066nRcQZQXrQ1eKInHLZEI+Unno6alpO1RfrGCbe/7Rqk/IkdHilgXcjkX
	 lLwe38nhFyzve8pWLcQm9tyW/8WeqMk6l7sboe5oQXU3oqSIR7cXJin+EzBQlYi8Zb
	 D5hMVTJTQx/fw==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 4/6] x86/crc32: implement crc32_le using new template
Date: Sun, 24 Nov 2024 20:11:27 -0800
Message-ID: <20241125041129.192999-5-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

Instantiate crc-pclmul-template.S for crc32_le, and delete the original
PCLMULQDQ optimized implementation.  This has the following advantages:

- Less CRC-variant-specific code.
- Full VPCLMULQDQ support, improving performance on newer CPUs.
- A faster final reduction from 128 bits.
- Support for lengths not a multiple of 16 bytes, improving performance
  for such lengths.
- Support for misaligned buffers, improving performance in such cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

	Length     Before        After
	------     ------        -----
	     1     274 MB/s      302 MB/s
	    16    3004 MB/s     2576 MB/s
	    64    3077 MB/s     7406 MB/s
	   127    2791 MB/s     9768 MB/s
	   128    8366 MB/s    12878 MB/s
	   200   10010 MB/s    15320 MB/s
	   256   16018 MB/s    22664 MB/s
	   511   11809 MB/s    27646 MB/s
	   512   18473 MB/s    48241 MB/s
	  1024   20002 MB/s    63912 MB/s
	  3173   20907 MB/s    78547 MB/s
	  4096   21291 MB/s    82808 MB/s
	 16384   21304 MB/s    83350 MB/s

The above results compare the state directly before this patch to the
state after this patch.  Note that performance was also improved earlier
by eliminating the indirect call associated with the crypto API.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/lib/crc-pclmul-consts.h |  55 ++++++++
 arch/x86/lib/crc32-glue.c        |  34 ++---
 arch/x86/lib/crc32-pclmul.S      | 219 +------------------------------
 3 files changed, 68 insertions(+), 240 deletions(-)
 create mode 100644 arch/x86/lib/crc-pclmul-consts.h

diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
new file mode 100644
index 0000000000000..ed9a625cdd2a6
--- /dev/null
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+	u64 extract_crc_mask[2];
+} crc32_lsb_0xedb88320_consts __cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0xce3371cb,	/* x^(2048+64-33) mod G(x) */
+		0xe95c1271,	/* x^(2048+0-33) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x33fff533,	/* x^(1024+64-33) mod G(x) */
+		0x910eeec1,	/* x^(1024+0-33) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0x8f352d95,	/* x^(512+64-33) mod G(x) */
+		0x1d9513d7,	/* x^(512+0-33) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0xf1da05aa,	/* x^(256+64-33) mod G(x) */
+		0x81256527,	/* x^(256+0-33) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xae689191,	/* x^(128+64-33) mod G(x) */
+		0xccaa009e,	/* x^(128+0-33) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0xb4e5b025f7011641,	/* floor(x^95 / G(x)) */
+		0x1db710641,	/* G(x) */
+	},
+	.extract_crc_mask = {0, 0xffffffff},
+};
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index 2dd18a886ded8..afcdeee429664 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -11,41 +11,23 @@
 #include <asm/simd.h>
 #include <crypto/internal/simd.h>
 #include <linux/crc32.h>
 #include <linux/linkage.h>
 #include <linux/module.h>
-
-/* minimum size of buffer for crc32_pclmul_le_16 */
-#define CRC32_PCLMUL_MIN_LEN	64
+#include "crc-pclmul-consts.h"
+#include "crc-pclmul-template-glue.h"
 
 static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
-u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
-	if (len >= CRC32_PCLMUL_MIN_LEN + 15 &&
-	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
-		size_t n = -(uintptr_t)p & 15;
-
-		/* align p to 16-byte boundary */
-		if (n) {
-			crc = crc32_le_base(crc, p, n);
-			p += n;
-			len -= n;
-		}
-		n = round_down(len, 16);
-		kernel_fpu_begin();
-		crc = crc32_pclmul_le_16(crc, p, n);
-		kernel_fpu_end();
-		p += n;
-		len -= n;
-	}
-	if (len)
-		crc = crc32_le_base(crc, p, len);
-	return crc;
+	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
+	return crc32_le_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_le_arch);
 
 #ifdef CONFIG_X86_64
 #define CRC32_INST "crc32q %1, %q0"
@@ -95,12 +77,14 @@ EXPORT_SYMBOL(crc32_be_arch);
 
 static int __init crc32_x86_init(void)
 {
 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
 		static_branch_enable(&have_crc32);
-	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
+		INIT_CRC_PCLMUL(crc32_lsb);
+	}
 	return 0;
 }
 arch_initcall(crc32_x86_init);
 
 static void __exit crc32_x86_exit(void)
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
index f9637789cac19..cf07d571ae864 100644
--- a/arch/x86/lib/crc32-pclmul.S
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -1,217 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2024 Google LLC
 
-#include <linux/linkage.h>
+#include "crc-pclmul-template.S"
 
-
-.section .rodata
-.align 16
-/*
- * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
- * #define CONSTANT_R1  0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
- * #define CONSTANT_R2  0x1c6e41596LL
- */
-.Lconstant_R2R1:
-	.octa 0x00000001c6e415960000000154442bd4
-/*
- * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
- * #define CONSTANT_R3  0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
- * #define CONSTANT_R4  0x0ccaa009eLL
- */
-.Lconstant_R4R3:
-	.octa 0x00000000ccaa009e00000001751997d0
-/*
- * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
- * #define CONSTANT_R5  0x163cd6124LL
- */
-.Lconstant_R5:
-	.octa 0x00000000000000000000000163cd6124
-.Lconstant_mask32:
-	.octa 0x000000000000000000000000FFFFFFFF
-/*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
- * #define CONSTANT_RU  0x1F7011641LL
- */
-.Lconstant_RUpoly:
-	.octa 0x00000001F701164100000001DB710641
-
-#define CONSTANT %xmm0
-
-#ifdef __x86_64__
-#define CRC     %edi
-#define BUF     %rsi
-#define LEN     %rdx
-#else
-#define CRC     %eax
-#define BUF     %edx
-#define LEN     %ecx
-#endif
-
-
-
-.text
-/**
- *      Calculate crc32
- *      CRC - initial crc32
- *      BUF - buffer (16 bytes aligned)
- *      LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
- *      return %eax crc32
- *      u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
- */
-
-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
-	movdqa  (BUF), %xmm1
-	movdqa  0x10(BUF), %xmm2
-	movdqa  0x20(BUF), %xmm3
-	movdqa  0x30(BUF), %xmm4
-	movd    CRC, CONSTANT
-	pxor    CONSTANT, %xmm1
-	sub     $0x40, LEN
-	add     $0x40, BUF
-	cmp     $0x40, LEN
-	jb      .Lless_64
-
-#ifdef __x86_64__
-	movdqa .Lconstant_R2R1(%rip), CONSTANT
-#else
-	movdqa .Lconstant_R2R1, CONSTANT
-#endif
-
-.Lloop_64:/*  64 bytes Full cache line folding */
-	prefetchnta    0x40(BUF)
-	movdqa  %xmm1, %xmm5
-	movdqa  %xmm2, %xmm6
-	movdqa  %xmm3, %xmm7
-#ifdef __x86_64__
-	movdqa  %xmm4, %xmm8
-#endif
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pclmulqdq $0x00, CONSTANT, %xmm2
-	pclmulqdq $0x00, CONSTANT, %xmm3
-#ifdef __x86_64__
-	pclmulqdq $0x00, CONSTANT, %xmm4
-#endif
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pclmulqdq $0x11, CONSTANT, %xmm6
-	pclmulqdq $0x11, CONSTANT, %xmm7
-#ifdef __x86_64__
-	pclmulqdq $0x11, CONSTANT, %xmm8
-#endif
-	pxor    %xmm5, %xmm1
-	pxor    %xmm6, %xmm2
-	pxor    %xmm7, %xmm3
-#ifdef __x86_64__
-	pxor    %xmm8, %xmm4
-#else
-	/* xmm8 unsupported for x32 */
-	movdqa  %xmm4, %xmm5
-	pclmulqdq $0x00, CONSTANT, %xmm4
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pxor    %xmm5, %xmm4
-#endif
-
-	pxor    (BUF), %xmm1
-	pxor    0x10(BUF), %xmm2
-	pxor    0x20(BUF), %xmm3
-	pxor    0x30(BUF), %xmm4
-
-	sub     $0x40, LEN
-	add     $0x40, BUF
-	cmp     $0x40, LEN
-	jge     .Lloop_64
-.Lless_64:/*  Folding cache line into 128bit */
-#ifdef __x86_64__
-	movdqa  .Lconstant_R4R3(%rip), CONSTANT
-#else
-	movdqa  .Lconstant_R4R3, CONSTANT
-#endif
-	prefetchnta     (BUF)
-
-	movdqa  %xmm1, %xmm5
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pxor    %xmm5, %xmm1
-	pxor    %xmm2, %xmm1
-
-	movdqa  %xmm1, %xmm5
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pxor    %xmm5, %xmm1
-	pxor    %xmm3, %xmm1
-
-	movdqa  %xmm1, %xmm5
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pxor    %xmm5, %xmm1
-	pxor    %xmm4, %xmm1
-
-	cmp     $0x10, LEN
-	jb      .Lfold_64
-.Lloop_16:/* Folding rest buffer into 128bit */
-	movdqa  %xmm1, %xmm5
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pclmulqdq $0x11, CONSTANT, %xmm5
-	pxor    %xmm5, %xmm1
-	pxor    (BUF), %xmm1
-	sub     $0x10, LEN
-	add     $0x10, BUF
-	cmp     $0x10, LEN
-	jge     .Lloop_16
-
-.Lfold_64:
-	/* perform the last 64 bit fold, also adds 32 zeroes
-	 * to the input stream */
-	pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
-	psrldq  $0x08, %xmm1
-	pxor    CONSTANT, %xmm1
-
-	/* final 32-bit fold */
-	movdqa  %xmm1, %xmm2
-#ifdef __x86_64__
-	movdqa  .Lconstant_R5(%rip), CONSTANT
-	movdqa  .Lconstant_mask32(%rip), %xmm3
-#else
-	movdqa  .Lconstant_R5, CONSTANT
-	movdqa  .Lconstant_mask32, %xmm3
-#endif
-	psrldq  $0x04, %xmm2
-	pand    %xmm3, %xmm1
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pxor    %xmm2, %xmm1
-
-	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-#ifdef __x86_64__
-	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
-#else
-	movdqa  .Lconstant_RUpoly, CONSTANT
-#endif
-	movdqa  %xmm1, %xmm2
-	pand    %xmm3, %xmm1
-	pclmulqdq $0x10, CONSTANT, %xmm1
-	pand    %xmm3, %xmm1
-	pclmulqdq $0x00, CONSTANT, %xmm1
-	pxor    %xmm2, %xmm1
-	pextrd  $0x01, %xmm1, %eax
-
-	RET
-SYM_FUNC_END(crc32_pclmul_le_16)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)

From patchwork Mon Nov 25 04:11:28 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884377
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0EF2713D246;
	Mon, 25 Nov 2024 04:12:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507970; cv=none;
 b=q4ti/3R0iJ/1FDjQ8ykqNZ01XYPtYVBDCI9Dpzu62hKCI19ZNYr030WftDjALqdaifpa/NKvvDY2rI8sjwTRfQbC60fXYF0REmUGxwvvs9iN42GEP6wXhzIccZvuz7Mnk6Kd1I7bzAdX8oIXYrhRkHhdTXwgE+QTvex/fulPSTE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507970; c=relaxed/simple;
	bh=/bxG6+lbFvWf+zH9N1zOGi0WwMBTFvamMdFIz/4uAV0=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=TJYKrVf2t8aMBjx8BzXbxP92trqT1775TAh0/xkDDOQ60pqDkYzBxIvpzA+QXWa2T80AcF5NqbyHbxyM9GP3/0ND7jW/G3VP5n2IYdwrbTOKvTMdWq2dja1EqsVmJJuT/2apqCSgGRiVldFO+AcuFO5EPSHqP9FoKhnXGKJnEjY=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=bS/bXy6r; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="bS/bXy6r"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id C267BC4CEDB;
	Mon, 25 Nov 2024 04:12:49 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507969;
	bh=/bxG6+lbFvWf+zH9N1zOGi0WwMBTFvamMdFIz/4uAV0=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=bS/bXy6rLDbhD3edmbBCZb0AZWGOAna1WHW54xw106Sr6Oz8Bn1CciVwD4jZTiiQS
	 Cdovyam7hLspYkBNVmyhSaGeUiv4xuWC3VlF7h6RnhuHIWcsz63d1w+tDlge9aq9gj
	 4tmIYizlQfzHHTY1Lzu3S13IqcXbTlaPVkErcYQyeW+qIaQpLyxWAhUTqIz0FbQ3ct
	 Hnob8MLxhQbbPj+douTHEftGk6yzfLr/6ZOfDQmuK4KPnQqGqIuUEJAUqNx3ZSxd0Q
	 8P1riWuq1gVhNDiTXFrT1qmMUuEZo2A3okthML4+RZzkaXR1S+zI9tUAsRO+6Lulc9
	 llWuaZ5ZTxEVA==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 5/6] x86/crc-t10dif: implement crc_t10dif using new template
Date: Sun, 24 Nov 2024 20:11:28 -0800
Message-ID: <20241125041129.192999-6-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

Instantiate crc-pclmul-template.S for crc_t10dif and delete the original
PCLMULQDQ optimized implementation.  This has the following advantages:

- Less CRC-variant-specific code.
- Full VPCLMULQDQ support, improving performance on newer CPUs.
- A faster final reduction from 128 bits.
- Support for i386.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

    	Length     Before        After
	------     ------        -----
	     1     435 MB/s      509 MB/s
	    16    1878 MB/s     1852 MB/s
	    64    4336 MB/s     6123 MB/s
	   127    5394 MB/s     8431 MB/s
	   128    5590 MB/s    10886 MB/s
	   200    5948 MB/s    13371 MB/s
	   256   15813 MB/s    19428 MB/s
	   511   14193 MB/s    25177 MB/s
	   512   18136 MB/s    35188 MB/s
	  1024   19878 MB/s    62569 MB/s
	  3173   21009 MB/s    77916 MB/s
	  4096   21249 MB/s    82295 MB/s
	 16384   21645 MB/s    89350 MB/s

The above results compare the state directly before this patch to the
state after this patch.  Note that performance was also improved earlier
by eliminating the indirect call associated with the crypto API.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/Kconfig                    |   2 +-
 arch/x86/lib/Makefile               |   2 +-
 arch/x86/lib/crc-pclmul-consts.h    |  48 +++-
 arch/x86/lib/crc-t10dif-glue.c      |  22 +-
 arch/x86/lib/crc16-msb-pclmul.S     |   6 +
 arch/x86/lib/crct10dif-pcl-asm_64.S | 332 ----------------------------
 6 files changed, 65 insertions(+), 347 deletions(-)
 create mode 100644 arch/x86/lib/crc16-msb-pclmul.S
 delete mode 100644 arch/x86/lib/crct10dif-pcl-asm_64.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 687221b7ea851..3a47ef91ef177 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,11 +75,11 @@ config X86
 	select ARCH_HAS_CACHE_LINE_SIZE
 	select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
 	select ARCH_HAS_CPU_FINALIZE_INIT
 	select ARCH_HAS_CPU_PASID		if IOMMU_SVA
 	select ARCH_HAS_CRC32
-	select ARCH_HAS_CRC_T10DIF		if X86_64
+	select ARCH_HAS_CRC_T10DIF
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE	if !X86_PAE
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_DMA_OPS			if GART_IOMMU || XEN
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 8a59c61624c2f..08496e221a7d1 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -41,11 +41,11 @@ lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
 obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
 crc32-x86-y := crc32-glue.o crc32-pclmul.o
 crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
 
 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o
+crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 obj-y += iomem.o
 
 ifeq ($(CONFIG_X86_32),y)
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
index ed9a625cdd2a6..c3ca689eae3b8 100644
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -1,14 +1,60 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * CRC constants generated by:
  *
- *	./scripts/crc/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
  *
  * Do not edit manually.
  */
 
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc16_msb_0x8bb7_consts __cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x22c6,	/* x^(2048+0) mod G(x) */
+		0x9f16,	/* x^(2048+64) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x6123,	/* x^(1024+0) mod G(x) */
+		0x2295,	/* x^(1024+64) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0x1069,	/* x^(512+0) mod G(x) */
+		0xdd31,	/* x^(512+64) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0x857d,	/* x^(256+0) mod G(x) */
+		0x7acc,	/* x^(256+64) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xa010,	/* x^(128+0) mod G(x) */
+		0x1faa,	/* x^(128+64) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0xf65a57f81d33a48a,	/* floor(x^80 / G(x)) - x^64 */
+		0x18bb7,	/* G(x) */
+	},
+};
+
 /*
  * CRC folding constants generated for least-significant-bit-first CRC-32 using
  * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
  *        x^5 + x^4 + x^2 + x + 1
  */
diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c
index 13f07ddc9122c..f4cfffdb19ceb 100644
--- a/arch/x86/lib/crc-t10dif-glue.c
+++ b/arch/x86/lib/crc-t10dif-glue.c
@@ -1,39 +1,37 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * CRC-T10DIF using PCLMULQDQ instructions
+ * CRC-T10DIF using [V]PCLMULQDQ instructions
  *
  * Copyright 2024 Google LLC
  */
 
 #include <asm/cpufeatures.h>
 #include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <linux/crc-t10dif.h>
 #include <linux/module.h>
+#include "crc-pclmul-consts.h"
+#include "crc-pclmul-template-glue.h"
 
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
-asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
+DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
 
 u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
 {
-	if (len >= 16 &&
-	    static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		crc = crc_t10dif_pcl(crc, p, len);
-		kernel_fpu_end();
-		return crc;
-	}
+	CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+		   have_pclmulqdq, false);
 	return crc_t10dif_generic(crc, p, len);
 }
 EXPORT_SYMBOL(crc_t10dif_arch);
 
 static int __init crc_t10dif_x86_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
+		INIT_CRC_PCLMUL(crc16_msb);
+	}
 	return 0;
 }
 arch_initcall(crc_t10dif_x86_init);
 
 static void __exit crc_t10dif_x86_exit(void)
@@ -45,7 +43,7 @@ bool crc_t10dif_is_optimized(void)
 {
 	return static_key_enabled(&have_pclmulqdq);
 }
 EXPORT_SYMBOL(crc_t10dif_is_optimized);
 
-MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions");
+MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
 MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
new file mode 100644
index 0000000000000..14e90ce199314
--- /dev/null
+++ b/arch/x86/lib/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2024 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S
deleted file mode 100644
index 5286db5b8165e..0000000000000
--- a/arch/x86/lib/crct10dif-pcl-asm_64.S
+++ /dev/null
@@ -1,332 +0,0 @@
-########################################################################
-# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-#
-# Copyright (c) 2013, Intel Corporation
-#
-# Authors:
-#     Erdinc Ozturk <erdinc.ozturk@intel.com>
-#     Vinodh Gopal <vinodh.gopal@intel.com>
-#     James Guilford <james.guilford@intel.com>
-#     Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses.  You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-#   notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-#   notice, this list of conditions and the following disclaimer in the
-#   documentation and/or other materials provided with the
-#   distribution.
-#
-# * Neither the name of the Intel Corporation nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#       Reference paper titled "Fast CRC Computation for Generic
-#	Polynomials Using PCLMULQDQ Instruction"
-#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
-#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-#
-
-#include <linux/linkage.h>
-
-.text
-
-#define		init_crc	%edi
-#define		buf		%rsi
-#define		len		%rdx
-
-#define		FOLD_CONSTS	%xmm10
-#define		BSWAP_MASK	%xmm11
-
-# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
-# reg1, reg2.
-.macro	fold_32_bytes	offset, reg1, reg2
-	movdqu	\offset(buf), %xmm9
-	movdqu	\offset+16(buf), %xmm12
-	pshufb	BSWAP_MASK, %xmm9
-	pshufb	BSWAP_MASK, %xmm12
-	movdqa	\reg1, %xmm8
-	movdqa	\reg2, %xmm13
-	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
-	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
-	pxor	%xmm9 , \reg1
-	xorps	%xmm8 , \reg1
-	pxor	%xmm12, \reg2
-	xorps	%xmm13, \reg2
-.endm
-
-# Fold src_reg into dst_reg.
-.macro	fold_16_bytes	src_reg, dst_reg
-	movdqa	\src_reg, %xmm8
-	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
-	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
-	pxor	%xmm8, \dst_reg
-	xorps	\src_reg, \dst_reg
-.endm
-
-#
-# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
-#
-# Assumes len >= 16.
-#
-SYM_FUNC_START(crc_t10dif_pcl)
-
-	movdqa	.Lbswap_mask(%rip), BSWAP_MASK
-
-	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
-	cmp	$256, len
-	jl	.Lless_than_256_bytes
-
-	# Load the first 128 data bytes.  Byte swapping is necessary to make the
-	# bit order match the polynomial coefficient order.
-	movdqu	16*0(buf), %xmm0
-	movdqu	16*1(buf), %xmm1
-	movdqu	16*2(buf), %xmm2
-	movdqu	16*3(buf), %xmm3
-	movdqu	16*4(buf), %xmm4
-	movdqu	16*5(buf), %xmm5
-	movdqu	16*6(buf), %xmm6
-	movdqu	16*7(buf), %xmm7
-	add	$128, buf
-	pshufb	BSWAP_MASK, %xmm0
-	pshufb	BSWAP_MASK, %xmm1
-	pshufb	BSWAP_MASK, %xmm2
-	pshufb	BSWAP_MASK, %xmm3
-	pshufb	BSWAP_MASK, %xmm4
-	pshufb	BSWAP_MASK, %xmm5
-	pshufb	BSWAP_MASK, %xmm6
-	pshufb	BSWAP_MASK, %xmm7
-
-	# XOR the first 16 data *bits* with the initial CRC value.
-	pxor	%xmm8, %xmm8
-	pinsrw	$7, init_crc, %xmm8
-	pxor	%xmm8, %xmm0
-
-	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
-
-	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
-	# 128 to simplify the termination condition of the following loop.
-	sub	$256, len
-
-	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
-	# bytes xmm0-7 into them, storing the result back into xmm0-7.
-.Lfold_128_bytes_loop:
-	fold_32_bytes	0, %xmm0, %xmm1
-	fold_32_bytes	32, %xmm2, %xmm3
-	fold_32_bytes	64, %xmm4, %xmm5
-	fold_32_bytes	96, %xmm6, %xmm7
-	add	$128, buf
-	sub	$128, len
-	jge	.Lfold_128_bytes_loop
-
-	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
-
-	# Fold across 64 bytes.
-	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
-	fold_16_bytes	%xmm0, %xmm4
-	fold_16_bytes	%xmm1, %xmm5
-	fold_16_bytes	%xmm2, %xmm6
-	fold_16_bytes	%xmm3, %xmm7
-	# Fold across 32 bytes.
-	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
-	fold_16_bytes	%xmm4, %xmm6
-	fold_16_bytes	%xmm5, %xmm7
-	# Fold across 16 bytes.
-	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
-	fold_16_bytes	%xmm6, %xmm7
-
-	# Add 128 to get the correct number of data bytes remaining in 0...127
-	# (not counting xmm7), following the previous extra subtraction by 128.
-	# Then subtract 16 to simplify the termination condition of the
-	# following loop.
-	add	$128-16, len
-
-	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
-	# xmm7 into them, storing the result back into xmm7.
-	jl	.Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
-	movdqa	%xmm7, %xmm8
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
-	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
-	pxor	%xmm8, %xmm7
-	movdqu	(buf), %xmm0
-	pshufb	BSWAP_MASK, %xmm0
-	pxor	%xmm0 , %xmm7
-	add	$16, buf
-	sub	$16, len
-	jge	.Lfold_16_bytes_loop
-
-.Lfold_16_bytes_loop_done:
-	# Add 16 to get the correct number of data bytes remaining in 0...15
-	# (not counting xmm7), following the previous extra subtraction by 16.
-	add	$16, len
-	je	.Lreduce_final_16_bytes
-
-.Lhandle_partial_segment:
-	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
-	# bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
-	# this without needing a fold constant for each possible 'len', redivide
-	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
-	# bytes, then fold the first chunk into the second.
-
-	movdqa	%xmm7, %xmm2
-
-	# xmm1 = last 16 original data bytes
-	movdqu	-16(buf, len), %xmm1
-	pshufb	BSWAP_MASK, %xmm1
-
-	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
-	lea	.Lbyteshift_table+16(%rip), %rax
-	sub	len, %rax
-	movdqu	(%rax), %xmm0
-	pshufb	%xmm0, %xmm2
-
-	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
-	pxor	.Lmask1(%rip), %xmm0
-	pshufb	%xmm0, %xmm7
-
-	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
-	# then '16-len' bytes from xmm2 (high-order bytes).
-	pblendvb	%xmm2, %xmm1	#xmm0 is implicit
-
-	# Fold the first chunk into the second chunk, storing the result in xmm7.
-	movdqa	%xmm7, %xmm8
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
-	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
-	pxor	%xmm8, %xmm7
-	pxor	%xmm1, %xmm7
-
-.Lreduce_final_16_bytes:
-	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
-
-	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS
-
-	# Fold the high 64 bits into the low 64 bits, while also multiplying by
-	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
-	# whose low 48 bits are 0.
-	movdqa	%xmm7, %xmm0
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
-	pslldq	$8, %xmm0
-	pxor	%xmm0, %xmm7			  # + low bits * x^64
-
-	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
-	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
-	movdqa	%xmm7, %xmm0
-	pand	.Lmask2(%rip), %xmm0		  # zero high 32 bits
-	psrldq	$12, %xmm7			  # extract high 32 bits
-	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
-	pxor	%xmm0, %xmm7			  # + low bits
-
-	# Load G(x) and floor(x^48 / G(x)).
-	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS
-
-	# Use Barrett reduction to compute the final CRC value.
-	movdqa	%xmm7, %xmm0
-	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
-	psrlq	$32, %xmm7			  # /= x^32
-	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
-	psrlq	$48, %xmm0
-	pxor	%xmm7, %xmm0		     # + low 16 nonzero bits
-	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
-
-	pextrw	$0, %xmm0, %eax
-	RET
-
-.align 16
-.Lless_than_256_bytes:
-	# Checksumming a buffer of length 16...255 bytes
-
-	# Load the first 16 data bytes.
-	movdqu	(buf), %xmm7
-	pshufb	BSWAP_MASK, %xmm7
-	add	$16, buf
-
-	# XOR the first 16 data *bits* with the initial CRC value.
-	pxor	%xmm0, %xmm0
-	pinsrw	$7, init_crc, %xmm0
-	pxor	%xmm0, %xmm7
-
-	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
-	cmp	$16, len
-	je	.Lreduce_final_16_bytes		# len == 16
-	sub	$32, len
-	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
-	add	$16, len
-	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
-SYM_FUNC_END(crc_t10dif_pcl)
-
-.section	.rodata, "a", @progbits
-.align 16
-
-# Fold constants precomputed from the polynomial 0x18bb7
-# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
-	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
-	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
-.Lfold_across_64_bytes_consts:
-	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
-	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
-.Lfold_across_32_bytes_consts:
-	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
-	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
-.Lfold_across_16_bytes_consts:
-	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
-	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
-.Lfinal_fold_consts:
-	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
-	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
-.Lbarrett_reduction_consts:
-	.quad		0x0000000000018bb7	# G(x)
-	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))
-
-.section	.rodata.cst16.mask1, "aM", @progbits, 16
-.align 16
-.Lmask1:
-	.octa	0x80808080808080808080808080808080
-
-.section	.rodata.cst16.mask2, "aM", @progbits, 16
-.align 16
-.Lmask2:
-	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
-
-.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
-.align 16
-.Lbswap_mask:
-	.octa	0x000102030405060708090A0B0C0D0E0F
-
-.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
-.align 16
-# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
-# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
-# 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
-	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
-	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
-	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
-	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0

From patchwork Mon Nov 25 04:11:29 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Eric Biggers <ebiggers@kernel.org>
X-Patchwork-Id: 13884378
X-Patchwork-Delegate: herbert@gondor.apana.org.au
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org
 [10.30.226.201])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id A465F156F5D;
	Mon, 25 Nov 2024 04:12:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=10.30.226.201
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1732507970; cv=none;
 b=l13x/AILC1+/MEFK3Ike5t/W8JlxbcWC+Ak8cJTrHYy1WoDPip522Z6PK75GpjgMCo9AmBUamQzYN3QA+2cL0D3w0QnKhkVkQoTU+RcDv+DUAljOJ/9bkRiK6qNRMLvOaqOlOEljd6cVgbYxy/jn+/mVaMtdqwUrAgjC/4FK7x0=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1732507970; c=relaxed/simple;
	bh=Im0exUZQfnAWeYCHy4KFp6kY8DqEkpJ4gHD7VgIxHhk=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=FcbgEZfMz1KK4b/sScuhNltsAInyB0/5WI0qpaA3W60q6KyWiwwUCUfPxc4MKOmO52n2oHM4QUNZ5L+YaK8bRhb6YRcVbXB4I61zP9Y49m8TlJC1aOrKStsZKiGQQhHayL8+UG7wnaldEnELfSrJfke+LSStaM2qgvdnRZGSZjc=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b=RnFtx5tz; arc=none smtp.client-ip=10.30.226.201
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org
 header.b="RnFtx5tz"
Received: by smtp.kernel.org (Postfix) with ESMTPSA id 11E19C4CEE4;
	Mon, 25 Nov 2024 04:12:50 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
	s=k20201202; t=1732507970;
	bh=Im0exUZQfnAWeYCHy4KFp6kY8DqEkpJ4gHD7VgIxHhk=;
	h=From:To:Cc:Subject:Date:In-Reply-To:References:From;
	b=RnFtx5tzOnJoFKt8Ii9puP18+iDvpDwwprL0cSNvV6GQIdJOsytF1Vz1mzFjZDKcN
	 zZMyV7oDL6qfSk+I1yNiGycUnLRkAaxSC0rOx3mf6sBT0ylcW21i7vydL62tPZ7r9v
	 22F5Wi+E93hV87ew4cS4qg6Uux8RNuj/Fbl4cdoy5zBNHfrGnQBJRz17h34nvd3n4k
	 uTgwDoctpuEcgwfv8iaR+ycQZKu8UD/I/WSAgJaRmHT4/n7/2IZza8Ui87WKV3e13A
	 +rzpwjoAC6kkSvmAG+v2Vd+n46Q4PUxo4ilyaoyva5NfFNMeu5Kn068Er68mSfDdWb
	 DrCKGmWI6/M1g==
From: Eric Biggers <ebiggers@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: linux-crypto@vger.kernel.org,
	x86@kernel.org,
	Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 6/6] x86/crc32: implement crc32_be using new template
Date: Sun, 24 Nov 2024 20:11:29 -0800
Message-ID: <20241125041129.192999-7-ebiggers@kernel.org>
X-Mailer: git-send-email 2.47.0
In-Reply-To: <20241125041129.192999-1-ebiggers@kernel.org>
References: <20241125041129.192999-1-ebiggers@kernel.org>
Precedence: bulk
X-Mailing-List: linux-crypto@vger.kernel.org
List-Id: <linux-crypto.vger.kernel.org>
List-Subscribe: <mailto:linux-crypto+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-crypto+unsubscribe@vger.kernel.org>
MIME-Version: 1.0

From: Eric Biggers <ebiggers@google.com>

crc32_be was previously unoptimized on x86.  Optimize it using the new
template.  This improves performance by over 25x in some cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

	Length     Before        After
	------     ------        -----
	     1     389 MB/s      325 MB/s
	    16    2845 MB/s     2911 MB/s
	    64    3012 MB/s     6513 MB/s
	   127    2567 MB/s     9057 MB/s
	   128    3048 MB/s    11589 MB/s
	   200    3070 MB/s    14042 MB/s
	   256    3067 MB/s    20454 MB/s
	   511    2938 MB/s    26245 MB/s
	   512    3081 MB/s    36926 MB/s
	  1024    3090 MB/s    61914 MB/s
	  3173    3065 MB/s    76201 MB/s
	  4096    3084 MB/s    82547 MB/s
	 16384    3084 MB/s    89333 MB/s

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/lib/crc-pclmul-consts.h | 49 +++++++++++++++++++++++++++++++-
 arch/x86/lib/crc32-glue.c        |  4 +++
 arch/x86/lib/crc32-pclmul.S      |  1 +
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
index c3ca689eae3b8..f8af6e9278c83 100644
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ b/arch/x86/lib/crc-pclmul-consts.h
@@ -1,10 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
  * CRC constants generated by:
  *
- *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
+ *	./scripts/crc/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_msb_0x04c11db7
  *
  * Do not edit manually.
  */
 
 /*
@@ -97,5 +97,52 @@ static const struct {
 		0xb4e5b025f7011641,	/* floor(x^95 / G(x)) */
 		0x1db710641,	/* G(x) */
 	},
 	.extract_crc_mask = {0, 0xffffffff},
 };
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x + 1
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_msb_0x04c11db7_consts __cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x88fe2237,	/* x^(2048+0) mod G(x) */
+		0xcbcf3bcb,	/* x^(2048+64) mod G(x) */
+	},
+	.fold_across_1024_bits_consts = {
+		0x567fddeb,	/* x^(1024+0) mod G(x) */
+		0x10bd4d7c,	/* x^(1024+64) mod G(x) */
+	},
+	.fold_across_512_bits_consts = {
+		0xe6228b11,	/* x^(512+0) mod G(x) */
+		0x8833794c,	/* x^(512+64) mod G(x) */
+	},
+	.fold_across_256_bits_consts = {
+		0x75be46b7,	/* x^(256+0) mod G(x) */
+		0x569700e5,	/* x^(256+64) mod G(x) */
+	},
+	.fold_across_128_bits_consts = {
+		0xe8a45605,	/* x^(128+0) mod G(x) */
+		0xc5b9cd4c,	/* x^(128+64) mod G(x) */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x04d101df481b4e5a,	/* floor(x^96 / G(x)) - x^64 */
+		0x104c11db7,	/* G(x) */
+	},
+};
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index afcdeee429664..326261e503b42 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -18,10 +18,11 @@
 
 static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_msb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
 	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
 		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
@@ -69,10 +70,12 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32c_le_arch);
 
 u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 {
+	CRC_PCLMUL(crc, p, len, crc32_msb, crc32_msb_0x04c11db7_consts,
+		   have_pclmulqdq, IS_ENABLED(CONFIG_CRC32_SLICEBY8));
 	return crc32_be_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_be_arch);
 
 static int __init crc32_x86_init(void)
@@ -80,10 +83,11 @@ static int __init crc32_x86_init(void)
 	if (boot_cpu_has(X86_FEATURE_XMM4_2))
 		static_branch_enable(&have_crc32);
 	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
 		static_branch_enable(&have_pclmulqdq);
 		INIT_CRC_PCLMUL(crc32_lsb);
+		INIT_CRC_PCLMUL(crc32_msb);
 	}
 	return 0;
 }
 arch_initcall(crc32_x86_init);
 
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
index cf07d571ae864..d562944211d4d 100644
--- a/arch/x86/lib/crc32-pclmul.S
+++ b/arch/x86/lib/crc32-pclmul.S
@@ -2,5 +2,6 @@
 // Copyright 2024 Google LLC
 
 #include "crc-pclmul-template.S"
 
 DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_msb, /* bits= */ 32, /* lsb= */ 0)