diff mbox series

[v3,2/2] crypto: Add Zhaoxin PadLock Hash Engine support for SHA384/SHA512

Message ID 20250114121301.156359-3-TonyWWang-oc@zhaoxin.com (mailing list archive)
State Under Review
Delegated to: Herbert Xu
Headers show
Series Add Zhaoxin hardware engine driver support for SHA | expand

Commit Message

Tony W Wang-oc Jan. 14, 2025, 12:13 p.m. UTC
Zhaoxin CPUs implement SHA (Secure Hash Algorithm) as CPU instructions,
covering SHA1, SHA256, SHA384 and SHA512, which conform to the Secure
Hash Algorithms specified by FIPS 180-3.

Zhaoxin CPU's SHA1/SHA256 implementation is compatible with VIA's
SHA1/SHA256, so add Zhaoxin CPU's SHA384/SHA512 support in padlock-sha.c.

With SHA implemented in hardware instead of software, applications can
be developed with higher performance, more security and more
flexibility.

The table below summarizes tests run with tcrypt against different
crypto algorithm drivers on the Zhaoxin KH-40000 platform:
---------------------------------------------------------------------------
tcrypt     driver   16*    64      256     1024    2048    4096    8192
---------------------------------------------------------------------------
           PadLock** 442.80 1309.21 3257.53 5221.56 5813.45 6136.39 6264.50***
403:SHA1   generic** 341.44 813.27  1458.98 1818.03 1896.60 1940.71 1939.06
           ratio    1.30   1.61    2.23    2.87    3.07    3.16    3.23
---------------------------------------------------------------------------
           Padlock  451.70 1313.65 2958.71 4658.55 5109.16 5359.08 5459.13
404:SHA256 generic  202.62 463.55  845.01  1070.50 1117.51 1144.79 1155.68
           ratio    2.23   2.83    3.50    4.35    4.57    4.68    4.72
---------------------------------------------------------------------------
           Padlock  350.90 1406.42 3166.16 5736.39 6627.77 7182.01 7429.18
405:SHA384 generic  161.76 654.88  979.06  1350.56 1423.08 1496.57 1513.12
           ratio    2.17   2.15    3.23    4.25    4.66    4.80    4.91
---------------------------------------------------------------------------
           Padlock  334.49 1394.71 3159.93 5728.86 6625.33 7169.23 7407.80
406:SHA512 generic  161.80 653.84  979.42  1351.41 1444.14 1495.35 1518.43
           ratio    2.07   2.13    3.23    4.24    4.59    4.79    4.88
---------------------------------------------------------------------------
*: The length of each data block to be processed by one complete SHA
   sequence, namely one INIT, multiple UPDATEs and one FINAL.
**: Crypto algorithm driver used by tcrypt; "PadLock" represents padlock-sha
   while "generic" represents the generic software SHA driver.
***: The speed of each crypto algorithm driver when processing data blocks
   of different lengths; the unit is Mb/s.

The ratios in the table show that SHA implemented by the padlock-sha
driver performs much better than the generic software drivers for
sha1/sha256/sha384/sha512.

Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
---
 drivers/crypto/Kconfig       |  10 +-
 drivers/crypto/padlock-sha.c | 200 ++++++++++++++++++++++++++++++++++-
 2 files changed, 202 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 19ab145f912e..0e97be36e037 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -39,15 +39,19 @@  config CRYPTO_DEV_PADLOCK_AES
 	  called padlock-aes.
 
 config CRYPTO_DEV_PADLOCK_SHA
-	tristate "PadLock driver for SHA1 and SHA256 algorithms"
+	tristate "PadLock driver for SHA1/SHA256/SHA384/SHA512 algorithms"
+	depends on X86 && !UML
 	depends on CRYPTO_DEV_PADLOCK
 	select CRYPTO_HASH
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
+	select CRYPTO_SHA512
 	help
-	  Use VIA PadLock for SHA1/SHA256 algorithms.
+	  Use PadLock for SHA1/SHA256 algorithms.
+	  Available in VIA C7 and newer processors and in Zhaoxin processors.
 
-	  Available in VIA C7 and newer processors.
+	  Use PadLock for SHA384/SHA512 algorithms.
+	  Available in Zhaoxin processors.
 
 	  If unsure say M. The compiled module will be
 	  called padlock-sha.
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index 6865c7f1fc1a..80af906184e2 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -5,6 +5,10 @@ 
  * Support for VIA PadLock hardware crypto engine.
  *
  * Copyright (c) 2006  Michal Ludvig <michal@logix.cz>
+ *
+ * Add SHA384/SHA512 support for Zhaoxin processors.
+ *
+ * Copyright (c) 2025  George Xue <georgexue@zhaoxin.com>
  */
 
 #include <crypto/internal/hash.h>
@@ -434,6 +438,123 @@  static int padlock_sha256_final_nano(struct shash_desc *desc, u8 *out)
 	return 0;
 }
 
+/*
+ * Copy @count 64-bit words from @src to @dst, reversing the byte order of
+ * each word on the way.  Used to write the hash state out as the digest.
+ */
+static inline void padlock_output_block_512(uint64_t *src, uint64_t *dst, size_t count)
+{
+	size_t i;
+
+	for (i = 0; i < count; i++)
+		dst[i] = swab64(src[i]);
+}
+
+/*
+ * Start a new SHA384 computation: clear the whole descriptor context
+ * (byte counters and block buffer) and load the SHA384 initial hash
+ * values.  Always returns 0.
+ */
+static int padlock_sha384_init(struct shash_desc *desc)
+{
+	static const u64 sha384_iv[] = {
+		SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+		SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+	};
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memset(sctx, 0, sizeof(*sctx));
+	memcpy(sctx->state, sha384_iv, sizeof(sha384_iv));
+
+	return 0;
+}
+
+/*
+ * Start a new SHA512 computation: clear the whole descriptor context
+ * (byte counters and block buffer) and load the SHA512 initial hash
+ * values.  Always returns 0.
+ */
+static int padlock_sha512_init(struct shash_desc *desc)
+{
+	static const u64 sha512_iv[] = {
+		SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+		SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+	};
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memset(sctx, 0, sizeof(*sctx));
+	memcpy(sctx->state, sha512_iv, sizeof(sha512_iv));
+
+	return 0;
+}
+
+/*
+ * Feed @len bytes of @data into a SHA384/SHA512 computation.
+ *
+ * Input is consumed in SHA512_BLOCK_SIZE chunks by the PadLock hash
+ * instruction (emitted as raw opcode bytes).  A trailing partial block is
+ * kept in sctx->buf until later data, or the final padding, completes it.
+ * The running byte count is kept in sctx->count[], low word first.
+ *
+ * The instruction is given the source pointer in (R)SI, the working hash
+ * state pointer in (R)DI and the number of blocks in (R)CX.
+ *
+ * Always returns 0.
+ */
+static int padlock_sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	unsigned int done = 0;
+	unsigned long nblocks, cnt;
+	const u8 *src;
+	/*
+	 * Working copy of the hash state handed to the instruction.
+	 * NOTE(review): buf carries no explicit alignment; confirm the
+	 * instruction has no alignment requirement on the state buffer.
+	 */
+	u8 buf[SHA512_BLOCK_SIZE];
+	u8 *dst = &buf[0];
+
+	/* Account for the new bytes, carrying into the high word on wrap. */
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	memcpy(dst, sctx->state, SHA512_DIGEST_SIZE);
+
+	if ((partial + len) >= SHA512_BLOCK_SIZE) {
+		/* Top up the buffered partial block and hash it first. */
+		if (partial) {
+			done = SHA512_BLOCK_SIZE - partial;
+			memcpy(sctx->buf + partial, data, done);
+
+			src = sctx->buf;
+			cnt = 1;
+
+			/*
+			 * The instruction reads *src and updates *dst, which
+			 * the operand list alone does not express, hence the
+			 * "memory" clobber.  The count is an in/out operand
+			 * because a REP-prefixed instruction presumably
+			 * consumes (R)CX.
+			 */
+			asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xe0"
+				     : "+S"(src), "+D"(dst), "+c"(cnt)
+				     :
+				     : "memory");
+		}
+
+		/* Hash every whole block remaining in the caller's data. */
+		nblocks = (len - done) / SHA512_BLOCK_SIZE;
+		if (nblocks) {
+			src = data + done;
+			cnt = nblocks;
+
+			asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xe0"
+				     : "+S"(src), "+D"(dst), "+c"(cnt)
+				     :
+				     : "memory");
+
+			done += nblocks * SHA512_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+
+	memcpy(sctx->state, dst, SHA512_DIGEST_SIZE);
+	/* Stash whatever is left over for the next call. */
+	memcpy(sctx->buf + partial, data + done, len - done);
+
+	return 0;
+}
+
+/*
+ * Finish a SHA384/SHA512 computation and write the digest to @out.
+ *
+ * Appends the 0x80 marker and zero padding so that the message, followed
+ * by the 128-bit big-endian bit count, ends on a block boundary, then
+ * byte-swaps the first digestsize bytes of the hash state into @out.
+ *
+ * Always returns 0.
+ */
+static int padlock_sha512_final(struct shash_desc *desc, u8 *out)
+{
+	/*
+	 * Read-only padding pattern: 0x80 followed by zeros.  It must not be
+	 * a writable static that is rebuilt on every call, because finals
+	 * running concurrently on other CPUs would then race on its contents.
+	 */
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+	const int bit_offset = SHA512_BLOCK_SIZE - sizeof(__be64[2]);
+	struct sha512_state *state = shash_desc_ctx(desc);
+	unsigned int partial = state->count[0] % SHA512_BLOCK_SIZE, padlen;
+	__be64 bits[2];
+
+	/* Both SHA384 and SHA512 may be supported. */
+	int dgst_size = crypto_shash_digestsize(desc->tfm);
+
+	/* Convert byte count in little endian to bit count in big endian. */
+	bits[0] = cpu_to_be64(state->count[1] << 3 | state->count[0] >> 61);
+	bits[1] = cpu_to_be64(state->count[0] << 3);
+
+	/*
+	 * Pad up to the length field of this block, or of the next block
+	 * when the current one has no room left for it.
+	 */
+	padlen = (partial < bit_offset) ? (bit_offset - partial) :
+					  ((SHA512_BLOCK_SIZE + bit_offset) - partial);
+
+	padlock_sha512_update(desc, padding, padlen);
+
+	/* Append length field bytes */
+	padlock_sha512_update(desc, (const u8 *)bits, sizeof(__be64[2]));
+
+	/* Swap to output */
+	padlock_output_block_512(state->state, (uint64_t *)out, dgst_size / sizeof(uint64_t));
+
+	return 0;
+}
+
 static int padlock_sha_export_nano(struct shash_desc *desc,
 				void *out)
 {
@@ -490,6 +611,42 @@  static struct shash_alg sha256_alg_nano = {
 	}
 };
 
+/* shash algorithm descriptor registering the PadLock code as "sha384". */
+static struct shash_alg sha384_alg = {
+	.base       = {
+		.cra_module      = THIS_MODULE,
+		.cra_name        = "sha384",
+		.cra_driver_name = "sha384-padlock-zhaoxin",
+		.cra_blocksize   = SHA384_BLOCK_SIZE,
+		.cra_priority    = PADLOCK_CRA_PRIORITY,
+	},
+	/* Only init and the digest size are SHA384-specific. */
+	.init       = padlock_sha384_init,
+	.update     = padlock_sha512_update,
+	.final      = padlock_sha512_final,
+	.import     = padlock_sha_import_nano,
+	.export     = padlock_sha_export_nano,
+	.digestsize = SHA384_DIGEST_SIZE,
+	.statesize  = sizeof(struct sha512_state),
+	.descsize   = sizeof(struct sha512_state),
+};
+
+/* shash algorithm descriptor registering the PadLock code as "sha512". */
+static struct shash_alg sha512_alg = {
+	.base       = {
+		.cra_module      = THIS_MODULE,
+		.cra_name        = "sha512",
+		.cra_driver_name = "sha512-padlock-zhaoxin",
+		.cra_blocksize   = SHA512_BLOCK_SIZE,
+		.cra_priority    = PADLOCK_CRA_PRIORITY,
+	},
+	.init       = padlock_sha512_init,
+	.update     = padlock_sha512_update,
+	.final      = padlock_sha512_final,
+	.import     = padlock_sha_import_nano,
+	.export     = padlock_sha_export_nano,
+	.digestsize = SHA512_DIGEST_SIZE,
+	.statesize  = sizeof(struct sha512_state),
+	.descsize   = sizeof(struct sha512_state),
+};
+
 static const struct x86_cpu_id padlock_sha_ids[] = {
 	X86_MATCH_FEATURE(X86_FEATURE_PHE, NULL),
 	{}
@@ -502,12 +659,16 @@  static int __init padlock_init(void)
 	struct cpuinfo_x86 *c = &cpu_data(0);
 	struct shash_alg *sha1;
 	struct shash_alg *sha256;
+	struct shash_alg *sha384;
+	struct shash_alg *sha512;
 
 	if (!x86_match_cpu(padlock_sha_ids) || !boot_cpu_has(X86_FEATURE_PHE_EN))
 		return -ENODEV;
 
-	/* Register the newly added algorithm module if on *
-	* VIA Nano processor, or else just do as before */
+	/*
+	 * Register the newly added algorithm module if on
+	 * Zhaoxin/VIA Nano processor, or else just do as before
+	 */
 	if (c->x86_model < 0x0f) {
 		sha1 = &sha1_alg;
 		sha256 = &sha256_alg;
@@ -524,15 +685,34 @@  static int __init padlock_init(void)
 	if (rc)
 		goto out_unreg1;
 
-	printk(KERN_NOTICE PFX "Using VIA PadLock ACE for SHA1/SHA256 algorithms.\n");
+	printk(KERN_NOTICE PFX "Using PadLock ACE for SHA1/SHA256 algorithms.\n");
+
+	if (boot_cpu_has(X86_FEATURE_PHE2_EN)) {
+		sha384 = &sha384_alg;
+		sha512 = &sha512_alg;
+
+		rc = crypto_register_shash(sha384);
+		if (rc)
+			goto out_unreg2;
+
+		rc = crypto_register_shash(sha512);
+		if (rc)
+			goto out_unreg3;
+
+		printk(KERN_NOTICE PFX "Using PadLock ACE for SHA384/SHA512 algorithms.\n");
+	}
 
 	return 0;
 
+out_unreg3:
+	crypto_unregister_shash(sha384);
+out_unreg2:
+	crypto_unregister_shash(sha256);
 out_unreg1:
 	crypto_unregister_shash(sha1);
 
 out:
-	printk(KERN_ERR PFX "VIA PadLock SHA1/SHA256 initialization failed.\n");
+	printk(KERN_ERR PFX "PadLock SHA1/SHA256/SHA384/SHA512 initialization failed.\n");
 	return rc;
 }
 
@@ -543,6 +723,11 @@  static void __exit padlock_fini(void)
 	if (c->x86_model >= 0x0f) {
 		crypto_unregister_shash(&sha1_alg_nano);
 		crypto_unregister_shash(&sha256_alg_nano);
+
+		if (boot_cpu_has(X86_FEATURE_PHE2_EN)) {
+			crypto_unregister_shash(&sha384_alg);
+			crypto_unregister_shash(&sha512_alg);
+		}
 	} else {
 		crypto_unregister_shash(&sha1_alg);
 		crypto_unregister_shash(&sha256_alg);
@@ -552,11 +737,16 @@  static void __exit padlock_fini(void)
 module_init(padlock_init);
 module_exit(padlock_fini);
 
-MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support.");
+MODULE_DESCRIPTION("PadLock SHA1/SHA256/SHA384/SHA512 algorithms support.");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michal Ludvig");
+MODULE_AUTHOR("George Xue <georgexue@zhaoxin.com>");
 
 MODULE_ALIAS_CRYPTO("sha1-all");
 MODULE_ALIAS_CRYPTO("sha256-all");
+MODULE_ALIAS_CRYPTO("sha384-all");
+MODULE_ALIAS_CRYPTO("sha512-all");
 MODULE_ALIAS_CRYPTO("sha1-padlock");
 MODULE_ALIAS_CRYPTO("sha256-padlock");
+MODULE_ALIAS_CRYPTO("sha384-padlock");
+MODULE_ALIAS_CRYPTO("sha512-padlock");