diff mbox series

[v4,10/12] RISC-V: crypto: add Zvkned accelerated AES encryption implementation

Message ID 20230711153743.1970625-11-heiko@sntech.de (mailing list archive)
State Superseded
Headers show
Series RISC-V: support some cryptography accelerations | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master

Commit Message

Heiko Stübner July 11, 2023, 3:37 p.m. UTC
From: Heiko Stuebner <heiko.stuebner@vrull.eu>

This adds an AES implementation using the Zvkned vector crypto instructions.

Co-developed-by: Christoph Müllner <christoph.muellner@vrull.eu>
Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
---
 arch/riscv/crypto/Kconfig               |  14 +
 arch/riscv/crypto/Makefile              |   7 +
 arch/riscv/crypto/aes-riscv-glue.c      | 168 ++++++++
 arch/riscv/crypto/aes-riscv64-zvkned.pl | 530 ++++++++++++++++++++++++
 4 files changed, 719 insertions(+)
 create mode 100644 arch/riscv/crypto/aes-riscv-glue.c
 create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned.pl

Comments

Eric Biggers July 21, 2023, 5:40 a.m. UTC | #1
On Tue, Jul 11, 2023 at 05:37:41PM +0200, Heiko Stuebner wrote:
> +config CRYPTO_AES_RISCV
> +	tristate "Ciphers: AES (RISCV)"
> +	depends on 64BIT && RISCV_ISA_V
> +	select CRYPTO_AES
> +	help
> +	  Block ciphers: AES cipher algorithms (FIPS-197)
> +	  Length-preserving ciphers: AES with ECB, CBC, CTR, CTS,
> +	    XCTR, and XTS modes
> +	  AEAD cipher: AES with CBC, ESSIV, and SHA-256
> +	    for fscrypt and dm-crypt
> +
> +	  Architecture: riscv using one of
> +	  - Zvkns

I'm looking forward to having direct support for these AES modes, especially the
modes needed for storage encryption: XTS, and CBC or CTS!  None of these AES
modes is actually implemented in this patch yet, though, so they can't be
claimed in the kconfig help text yet.  This patch is just a starting point, as
it just adds support for the bare AES block cipher ("aes" in the crypto API).

(BTW, I'm much more interested in, say, AES-XTS support than SM4 support, which
this patchset does include.  SM4 is a "national pride cipher" which is somewhat
of a niche thing.  I suppose there are already people pushing it for RISC-V
though, as they are everywhere else, so that's to be expected...)

> diff --git a/arch/riscv/crypto/aes-riscv-glue.c b/arch/riscv/crypto/aes-riscv-glue.c
> new file mode 100644
> index 000000000000..85e1187aee22
> --- /dev/null
> +++ b/arch/riscv/crypto/aes-riscv-glue.c
> @@ -0,0 +1,168 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Linux/riscv port of the OpenSSL AES implementation for RISCV
> + *
> + * Copyright (C) 2023 VRULL GmbH
> + * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
> + */
> +
> +#include <linux/crypto.h>
> +#include <linux/delay.h>
> +#include <linux/err.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <asm/simd.h>
> +#include <asm/vector.h>
> +#include <crypto/aes.h>
> +#include <crypto/internal/cipher.h>
> +#include <crypto/internal/simd.h>
> +
> +struct aes_key {
> +	u8 key[AES_MAX_KEYLENGTH];
> +	int rounds;
> +};
> +
> +/* variant using the zvkned vector crypto extension */
> +void rv64i_zvkned_encrypt(const u8 *in, u8 *out, const struct aes_key *key);
> +void rv64i_zvkned_decrypt(const u8 *in, u8 *out, const struct aes_key *key);
> +int rv64i_zvkned_set_encrypt_key(const u8 *userKey, const int bits,
> +				struct aes_key *key);
> +int rv64i_zvkned_set_decrypt_key(const u8 *userKey, const int bits,
> +				struct aes_key *key);
> +
> +struct riscv_aes_ctx {
> +	struct crypto_cipher *fallback;
> +	struct aes_key enc_key;
> +	struct aes_key dec_key;
> +	unsigned int keylen;
> +};

Can it just use 'struct crypto_aes_ctx'?  That's what most of the other AES
implementations use.

> +static int riscv64_aes_init_zvkned(struct crypto_tfm *tfm)
> +{
> +	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> +	const char *alg = crypto_tfm_alg_name(tfm);
> +	struct crypto_cipher *fallback;
> +
> +	fallback = crypto_alloc_cipher(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
> +	if (IS_ERR(fallback)) {
> +		pr_err("Failed to allocate transformation for '%s': %ld\n",
> +		       alg, PTR_ERR(fallback));
> +		return PTR_ERR(fallback);
> +	}
> +
> +	crypto_cipher_set_flags(fallback,
> +				crypto_cipher_get_flags((struct
> +							 crypto_cipher *)
> +							tfm));
> +	ctx->fallback = fallback;
> +
> +	return 0;
> +}
> +
> +static void riscv_aes_exit(struct crypto_tfm *tfm)
> +{
> +	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> +
> +	if (ctx->fallback) {
> +		crypto_free_cipher(ctx->fallback);
> +		ctx->fallback = NULL;
> +	}
> +}
> +
> +static int riscv64_aes_setkey_zvkned(struct crypto_tfm *tfm, const u8 *key,
> +			 unsigned int keylen)
> +{
> +	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> +	int ret;
> +
> +	ctx->keylen = keylen;
> +
> +	if (keylen == 16 || keylen == 32) {
> +		kernel_rvv_begin();
> +		ret = rv64i_zvkned_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
> +		if (ret != 1) {
> +			kernel_rvv_end();
> +			return -EINVAL;
> +		}
> +
> +		ret = rv64i_zvkned_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
> +		kernel_rvv_end();
> +		if (ret != 1)
> +			return -EINVAL;
> +	}
> +
> +	ret = crypto_cipher_setkey(ctx->fallback, key, keylen);
> +
> +	return ret ? -EINVAL : 0;
> +}

It's a bit annoying that RISC-V doesn't support AES-192, though also not
particularly surprising, seeing as AES-192 is almost never used.  (Intel's Key
Locker, for example, is another recent CPU feature that doesn't support
AES-192.)  IMO the issue here is really with the kernel crypto API -- it should
treat AES-128, AES-192, and AES-256 as separate algorithms so that
implementations aren't forced to support all three key sizes...

Anyway, for now, as you noticed you do need a fallback to handle AES-192 to make
the kernel crypto API happy.

But, the fallback doesn't have to be a 'crypto_cipher' as you've implemented.
You could just use the AES library.  See what arch/arm64/crypto/aes-ce-glue.c
does, for example.  Have you considered that?  It would be simpler than the
crypto_cipher based approach.

> +
> +static void riscv64_aes_encrypt_zvkned(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
> +{
> +	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);

Always use 'const' for the tfm_ctx in encrypt and decrypt functions, please, as
it must never be modified there.

> +struct crypto_alg riscv64_aes_zvkned_alg = {

static

> +	.cra_type = NULL,

Omit that line

> +	.cra_alignmask = 0,

Omit that line

> +MODULE_DESCRIPTION("AES (accelerated)");

Maybe "RISC-V accelerated"?

- Eric
Ard Biesheuvel July 21, 2023, 11:39 a.m. UTC | #2
On Fri, 21 Jul 2023 at 07:40, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Tue, Jul 11, 2023 at 05:37:41PM +0200, Heiko Stuebner wrote:
...
> > +static int riscv64_aes_setkey_zvkned(struct crypto_tfm *tfm, const u8 *key,
> > +                      unsigned int keylen)
> > +{
> > +     struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> > +     int ret;
> > +
> > +     ctx->keylen = keylen;
> > +
> > +     if (keylen == 16 || keylen == 32) {
> > +             kernel_rvv_begin();
> > +             ret = rv64i_zvkned_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
> > +             if (ret != 1) {
> > +                     kernel_rvv_end();
> > +                     return -EINVAL;
> > +             }
> > +
> > +             ret = rv64i_zvkned_set_decrypt_key(key, keylen * 8, &ctx->dec_key);

The asm suggests that the encryption and decryption key schedules are
the same, and the decryption algorithm does not implement the
Equivalent Inverse Cipher, but simply iterates over they key schedule
in reverse order. This makes much more sense for instruction based
AES, so it doesn't surprise me but it does mean you can just drop this
part, and pass enc_key everywhere.

> > +             kernel_rvv_end();
> > +             if (ret != 1)
> > +                     return -EINVAL;
> > +     }
> > +
> > +     ret = crypto_cipher_setkey(ctx->fallback, key, keylen);
> > +
> > +     return ret ? -EINVAL : 0;
> > +}
>
> It's a bit annoying that RISC-V doesn't support AES-192, though also not
> particularly surprising, seeing as AES-192 is almost never used.  (Intel's Key
> Locker, for example, is another recent CPU feature that doesn't support
> AES-192.)  IMO the issue here is really with the kernel crypto API -- it should
> treat AES-128, AES-192, and AES-256 as separate algorithms so that
> implementations aren't forced to support all three key sizes...
>

Why is this a fundamental limitation? AES-192 uses the same AES block
size and round structure, the only difference is the number of rounds
and how the round keys are calculated.

Creating the key schedule should never be performance critical, so if
the lack of AES-192 support is due to a limitation in the key schedule
generation instructions, I'd suggest to avoid those if possible and
just use the generic library code to derive the key schedule. If that
works, I'm pretty sure AES-192 support is just a matter of
implementing a 12-round variant modeled after the existing 10/14 round
ones.
Ard Biesheuvel July 21, 2023, 2:23 p.m. UTC | #3
On Fri, 21 Jul 2023 at 13:39, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Fri, 21 Jul 2023 at 07:40, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Tue, Jul 11, 2023 at 05:37:41PM +0200, Heiko Stuebner wrote:
> ...
> > > +static int riscv64_aes_setkey_zvkned(struct crypto_tfm *tfm, const u8 *key,
> > > +                      unsigned int keylen)
> > > +{
> > > +     struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> > > +     int ret;
> > > +
> > > +     ctx->keylen = keylen;
> > > +
> > > +     if (keylen == 16 || keylen == 32) {
> > > +             kernel_rvv_begin();
> > > +             ret = rv64i_zvkned_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
> > > +             if (ret != 1) {
> > > +                     kernel_rvv_end();
> > > +                     return -EINVAL;
> > > +             }
> > > +
> > > +             ret = rv64i_zvkned_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
>
> The asm suggests that the encryption and decryption key schedules are
> the same, and the decryption algorithm does not implement the
> Equivalent Inverse Cipher, but simply iterates over they key schedule
> in reverse order. This makes much more sense for instruction based
> AES, so it doesn't surprise me but it does mean you can just drop this
> part, and pass enc_key everywhere.
>
> > > +             kernel_rvv_end();
> > > +             if (ret != 1)
> > > +                     return -EINVAL;
> > > +     }
> > > +
> > > +     ret = crypto_cipher_setkey(ctx->fallback, key, keylen);
> > > +
> > > +     return ret ? -EINVAL : 0;
> > > +}
> >
> > It's a bit annoying that RISC-V doesn't support AES-192, though also not
> > particularly surprising, seeing as AES-192 is almost never used.  (Intel's Key
> > Locker, for example, is another recent CPU feature that doesn't support
> > AES-192.)  IMO the issue here is really with the kernel crypto API -- it should
> > treat AES-128, AES-192, and AES-256 as separate algorithms so that
> > implementations aren't forced to support all three key sizes...
> >
>
> Why is this a fundamental limitation? AES-192 uses the same AES block
> size and round structure, the only difference is the number of rounds
> and how the round keys are calculated.
>
> Creating the key schedule should never be performance critical, so if
> the lack of AES-192 support is due to a limitation in the key schedule
> generation instructions, I'd suggest to avoid those if possible and
> just use the generic library code to derive the key schedule. If that
> works, I'm pretty sure AES-192 support is just a matter of
> implementing a 12-round variant modeled after the existing 10/14 round
> ones.

This seems to work:
https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/log/?h=riscv-crypto

Feel free to incorporate/squash any of those changes into your series.
Jerry Shih Sept. 11, 2023, 1:06 p.m. UTC | #4
On Jul 21, 2023, at 13:40, Eric Biggers <ebiggers@kernel.org> wrote:

> I'm looking forward to having direct support for these AES modes, especially the
> modes needed for storage encryption: XTS, and CBC or CTS!  None of these AES
> modes is actually implemented in this patch yet, though, so they can't be
> claimed in the kconfig help text yet.  This patch is just a starting point, as
> it just adds support for the bare AES block cipher ("aes" in the crypto API).
> 
> (BTW, I'm much more interested in, say, AES-XTS support than SM4 support, which
> this patchset does include.  SM4 is a "national pride cipher" which is somewhat
> of a niche thing.  I suppose there are already people pushing it for RISC-V
> though, as they are everywhere else, so that's to be expected...)
> 

We have further optimization for RISC-V platform in OpenSSL PR[1]. It will include
AES with CBC, CTR, and XTS mode. Comparing to the generic AES implementation,
the specialized AES-XTS one have about 3X performance improvement using
OpenSSL benchmark tool. If OpenSSL accepts that PR, we will create the
corresponding patch for Linux kernel.

[1]
https://github.com/openssl/openssl/pull/21923

-Jerry
Ard Biesheuvel Sept. 12, 2023, 7:04 a.m. UTC | #5
On Tue, 12 Sept 2023 at 00:50, Jerry Shih <jerry.shih@sifive.com> wrote:
>
> On Jul 21, 2023, at 13:40, Eric Biggers <ebiggers@kernel.org> wrote:
>
> > I'm looking forward to having direct support for these AES modes, especially the
> > modes needed for storage encryption: XTS, and CBC or CTS!  None of these AES
> > modes is actually implemented in this patch yet, though, so they can't be
> > claimed in the kconfig help text yet.  This patch is just a starting point, as
> > it just adds support for the bare AES block cipher ("aes" in the crypto API).
> >
> > (BTW, I'm much more interested in, say, AES-XTS support than SM4 support, which
> > this patchset does include.  SM4 is a "national pride cipher" which is somewhat
> > of a niche thing.  I suppose there are already people pushing it for RISC-V
> > though, as they are everywhere else, so that's to be expected...)
> >
>
> We have further optimization for RISC-V platform in OpenSSL PR[1]. It will include
> AES with CBC, CTR, and XTS mode. Comparing to the generic AES implementation,
> the specialized AES-XTS one have about 3X performance improvement using
> OpenSSL benchmark tool. If OpenSSL accepts that PR, we will create the
> corresponding patch for Linux kernel.
>
> [1]
> https://github.com/openssl/openssl/pull/21923
>

This pull request doesn't appear to contain any XTS code at all, only CBC.
Jerry Shih Sept. 12, 2023, 7:15 a.m. UTC | #6
On Sep 12, 2023, at 15:04, Ard Biesheuvel <ardb@kernel.org> wrote:

>> We have further optimization for RISC-V platform in OpenSSL PR[1]. It will include
>> AES with CBC, CTR, and XTS mode. Comparing to the generic AES implementation,
>> the specialized AES-XTS one have about 3X performance improvement using
>> OpenSSL benchmark tool. If OpenSSL accepts that PR, we will create the
>> corresponding patch for Linux kernel.
>> 
>> [1]
>> https://github.com/openssl/openssl/pull/21923
>> 
> 
> This pull request doesn't appear to contain any XTS code at all, only CBC.

We have some license issues for upstream. We will append the specialized
AES modes soon.

-Jerry
He-Jie Shih Sept. 15, 2023, 1:28 a.m. UTC | #7
On Sep 12, 2023, at 15:15, Jerry Shih <jerry.shih@sifive.com> wrote:

>> This pull request doesn't appear to contain any XTS code at all, only CBC.
> 
> We have some license issues for upstream. We will append the specialized
> AES modes soon.

We have the XTS and other specialized AES modes in OpenSSL PR[1] now.
The specialized implementations all perform better than generic implementation
on FPGA.
We will try to make that implementations happen in kernel.

-Jerry

[1]
https://github.com/openssl/openssl/pull/21923
diff mbox series

Patch

diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index e564f861d95e..8579ce43546d 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -2,6 +2,20 @@ 
 
 menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
 
+config CRYPTO_AES_RISCV
+	tristate "Ciphers: AES (RISCV)"
+	depends on 64BIT && RISCV_ISA_V
+	select CRYPTO_AES
+	help
+	  Block ciphers: AES cipher algorithms (FIPS-197)
+	  Length-preserving ciphers: AES with ECB, CBC, CTR, CTS,
+	    XCTR, and XTS modes
+	  AEAD cipher: AES with CBC, ESSIV, and SHA-256
+	    for fscrypt and dm-crypt
+
+	  Architecture: riscv using one of
+	  - Zvkns
+
 config CRYPTO_GHASH_RISCV64
 	tristate "Hash functions: GHASH"
 	depends on 64BIT && (RISCV_ISA_ZBC || RISCV_ISA_V)
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
index b12c925172db..38ee741a9777 100644
--- a/arch/riscv/crypto/Makefile
+++ b/arch/riscv/crypto/Makefile
@@ -3,6 +3,9 @@ 
 # linux/arch/riscv/crypto/Makefile
 #
 
+obj-$(CONFIG_CRYPTO_AES_RISCV) += aes-riscv.o
+aes-riscv-y := aes-riscv-glue.o aes-riscv64-zvkned.o
+
 obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
 ghash-riscv64-y := ghash-riscv64-glue.o
 ifdef CONFIG_RISCV_ISA_ZBC
@@ -21,6 +24,9 @@  sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvbb-zvknhb.o
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
+$(obj)/aes-riscv64-zvkned.S: $(src)/aes-riscv64-zvkned.pl
+	$(call cmd,perlasm)
+
 $(obj)/ghash-riscv64-zbc.S: $(src)/ghash-riscv64-zbc.pl
 	$(call cmd,perlasm)
 
@@ -36,5 +42,6 @@  $(obj)/sha256-riscv64-zvbb-zvknha.S: $(src)/sha256-riscv64-zvbb-zvknha.pl
 $(obj)/sha512-riscv64-zvbb-zvknhb.S: $(src)/sha512-riscv64-zvbb-zvknhb.pl
 	$(call cmd,perlasm)
 
+clean-files += aes-riscv64-zvkned.S
 clean-files += ghash-riscv64-zbc.S ghash-riscv64-zvkb.S ghash-riscv64-zvkg.S
 clean-files += sha256-riscv64-zvknha.S sha512-riscv64-zvknhb.S
diff --git a/arch/riscv/crypto/aes-riscv-glue.c b/arch/riscv/crypto/aes-riscv-glue.c
new file mode 100644
index 000000000000..85e1187aee22
--- /dev/null
+++ b/arch/riscv/crypto/aes-riscv-glue.c
@@ -0,0 +1,168 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Linux/riscv port of the OpenSSL AES implementation for RISCV
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ */
+
+#include <linux/crypto.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
+#include <crypto/internal/simd.h>
+
+struct aes_key {
+	u8 key[AES_MAX_KEYLENGTH];
+	int rounds;
+};
+
+/* variant using the zvkned vector crypto extension */
+void rv64i_zvkned_encrypt(const u8 *in, u8 *out, const struct aes_key *key);
+void rv64i_zvkned_decrypt(const u8 *in, u8 *out, const struct aes_key *key);
+int rv64i_zvkned_set_encrypt_key(const u8 *userKey, const int bits,
+				struct aes_key *key);
+int rv64i_zvkned_set_decrypt_key(const u8 *userKey, const int bits,
+				struct aes_key *key);
+
+struct riscv_aes_ctx {
+	struct crypto_cipher *fallback;
+	struct aes_key enc_key;
+	struct aes_key dec_key;
+	unsigned int keylen;
+};
+
+static int riscv64_aes_init_zvkned(struct crypto_tfm *tfm)
+{
+	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	const char *alg = crypto_tfm_alg_name(tfm);
+	struct crypto_cipher *fallback;
+
+	fallback = crypto_alloc_cipher(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
+	if (IS_ERR(fallback)) {
+		pr_err("Failed to allocate transformation for '%s': %ld\n",
+		       alg, PTR_ERR(fallback));
+		return PTR_ERR(fallback);
+	}
+
+	crypto_cipher_set_flags(fallback,
+				crypto_cipher_get_flags((struct
+							 crypto_cipher *)
+							tfm));
+	ctx->fallback = fallback;
+
+	return 0;
+}
+
+static void riscv_aes_exit(struct crypto_tfm *tfm)
+{
+	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (ctx->fallback) {
+		crypto_free_cipher(ctx->fallback);
+		ctx->fallback = NULL;
+	}
+}
+
+static int riscv64_aes_setkey_zvkned(struct crypto_tfm *tfm, const u8 *key,
+			 unsigned int keylen)
+{
+	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int ret;
+
+	ctx->keylen = keylen;
+
+	if (keylen == 16 || keylen == 32) {
+		kernel_rvv_begin();
+		ret = rv64i_zvkned_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
+		if (ret != 1) {
+			kernel_rvv_end();
+			return -EINVAL;
+		}
+
+		ret = rv64i_zvkned_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
+		kernel_rvv_end();
+		if (ret != 1)
+			return -EINVAL;
+	}
+
+	ret = crypto_cipher_setkey(ctx->fallback, key, keylen);
+
+	return ret ? -EINVAL : 0;
+}
+
+static void riscv64_aes_encrypt_zvkned(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (crypto_simd_usable() && (ctx->keylen == 16 || ctx->keylen == 32)) {
+		kernel_rvv_begin();
+		rv64i_zvkned_encrypt(src, dst, &ctx->enc_key);
+		kernel_rvv_end();
+	} else {
+		crypto_cipher_encrypt_one(ctx->fallback, dst, src);
+	}
+}
+
+static void riscv64_aes_decrypt_zvkned(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct riscv_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (crypto_simd_usable() && (ctx->keylen == 16 || ctx->keylen == 32)) {
+		kernel_rvv_begin();
+		rv64i_zvkned_decrypt(src, dst, &ctx->dec_key);
+		kernel_rvv_end();
+	} else {
+		crypto_cipher_decrypt_one(ctx->fallback, dst, src);
+	}
+}
+
+struct crypto_alg riscv64_aes_zvkned_alg = {
+	.cra_name = "aes",
+	.cra_driver_name = "riscv-aes-zvkned",
+	.cra_module = THIS_MODULE,
+	.cra_priority = 300,
+	.cra_type = NULL,
+	.cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_alignmask = 0,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct riscv_aes_ctx),
+	.cra_init = riscv64_aes_init_zvkned,
+	.cra_exit = riscv_aes_exit,
+	.cra_cipher = {
+		.cia_min_keysize = AES_MIN_KEY_SIZE,
+		.cia_max_keysize = AES_MAX_KEY_SIZE,
+		.cia_setkey = riscv64_aes_setkey_zvkned,
+		.cia_encrypt = riscv64_aes_encrypt_zvkned,
+		.cia_decrypt = riscv64_aes_decrypt_zvkned,
+	},
+};
+
+static int __init riscv_aes_mod_init(void)
+{
+	if (riscv_isa_extension_available(NULL, ZVKNED) &&
+	    riscv_vector_vlen() >= 128)
+		return crypto_register_alg(&riscv64_aes_zvkned_alg);
+
+	return 0;
+}
+
+static void __exit riscv_aes_mod_fini(void)
+{
+	if (riscv_isa_extension_available(NULL, ZVKNED) &&
+	    riscv_vector_vlen() >= 128)
+		return crypto_unregister_alg(&riscv64_aes_zvkned_alg);
+}
+
+module_init(riscv_aes_mod_init);
+module_exit(riscv_aes_mod_fini);
+
+MODULE_DESCRIPTION("AES (accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.pl b/arch/riscv/crypto/aes-riscv64-zvkned.pl
new file mode 100644
index 000000000000..d26eeb8932bd
--- /dev/null
+++ b/arch/riscv/crypto/aes-riscv64-zvkned.pl
@@ -0,0 +1,530 @@ 
+#! /usr/bin/env perl
+# SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause
+#
+# This file is dual-licensed, meaning that you can use it under your
+# choice of either of the following two licenses:
+#
+# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You can obtain
+# a copy in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# or
+#
+# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# - RV64I
+# - RISC-V vector ('V') with VLEN >= 128
+# - RISC-V vector crypto AES extension ('Zvkned')
+
+use strict;
+use warnings;
+
+use FindBin qw($Bin);
+use lib "$Bin";
+use lib "$Bin/../../perlasm";
+use riscv;
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$output and open STDOUT,">$output";
+
+my $code=<<___;
+.text
+___
+
+################################################################################
+# int rv64i_zvkned_set_encrypt_key(const unsigned char *userKey, const int bits,
+#                                  AES_KEY *key)
+# int rv64i_zvkned_set_decrypt_key(const unsigned char *userKey, const int bits,
+#                                  AES_KEY *key)
+{
+my ($UKEY,$BITS,$KEYP) = ("a0", "a1", "a2");
+my ($T0,$T1,$T4) = ("t1", "t2", "t4");
+my ($v0,  $v1,  $v2,  $v3,  $v4,  $v5,  $v6,
+          $v7,  $v8,  $v9,  $v10, $v11, $v12,
+          $v13, $v14, $v15, $v16, $v17, $v18,
+          $v19, $v20, $v21, $v22, $v23, $v24,
+) = map("v$_",(0..24));
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_set_encrypt_key
+.type rv64i_zvkned_set_encrypt_key,\@function
+rv64i_zvkned_set_encrypt_key:
+    beqz $UKEY, L_fail_m1
+    beqz $KEYP, L_fail_m1
+
+    # Get proper routine for key size
+    li $T0, 256
+    beq $BITS, $T0, L_set_key_256
+    li $T0, 128
+    beq $BITS, $T0, L_set_key_128
+
+    j L_fail_m2
+
+.size rv64i_zvkned_set_encrypt_key,.-rv64i_zvkned_set_encrypt_key
+___
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_set_decrypt_key
+.type rv64i_zvkned_set_decrypt_key,\@function
+rv64i_zvkned_set_decrypt_key:
+    beqz $UKEY, L_fail_m1
+    beqz $KEYP, L_fail_m1
+
+    # Get proper routine for key size
+    li $T0, 256
+    beq $BITS, $T0, L_set_key_256
+    li $T0, 128
+    beq $BITS, $T0, L_set_key_128
+
+    j L_fail_m2
+
+.size rv64i_zvkned_set_decrypt_key,.-rv64i_zvkned_set_decrypt_key
+___
+
+$code .= <<___;
+.p2align 3
+L_set_key_128:
+    # Store the number of rounds
+    li $T1, 10
+    sw $T1, 240($KEYP)
+
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    # Load the key
+    @{[vle32_v $v10, ($UKEY)]}
+
+    # Generate keys for round 2-11 into registers v11-v20.
+    @{[vaeskf1_vi $v11, $v10, 1]}   # v11 <- rk2  (w[ 4, 7])
+    @{[vaeskf1_vi $v12, $v11, 2]}   # v12 <- rk3  (w[ 8,11])
+    @{[vaeskf1_vi $v13, $v12, 3]}   # v13 <- rk4  (w[12,15])
+    @{[vaeskf1_vi $v14, $v13, 4]}   # v14 <- rk5  (w[16,19])
+    @{[vaeskf1_vi $v15, $v14, 5]}   # v15 <- rk6  (w[20,23])
+    @{[vaeskf1_vi $v16, $v15, 6]}   # v16 <- rk7  (w[24,27])
+    @{[vaeskf1_vi $v17, $v16, 7]}   # v17 <- rk8  (w[28,31])
+    @{[vaeskf1_vi $v18, $v17, 8]}   # v18 <- rk9  (w[32,35])
+    @{[vaeskf1_vi $v19, $v18, 9]}   # v19 <- rk10 (w[36,39])
+    @{[vaeskf1_vi $v20, $v19, 10]}  # v20 <- rk11 (w[40,43])
+
+    # Store the round keys
+    @{[vse32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v20, ($KEYP)]}
+
+    li a0, 1
+    ret
+.size L_set_key_128,.-L_set_key_128
+___
+
+$code .= <<___;
+.p2align 3
+L_set_key_256:
+    # Store the number of rounds
+    li $T1, 14
+    sw $T1, 240($KEYP)
+
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    # Load the key
+    @{[vle32_v $v10, ($UKEY)]}
+    addi $UKEY, $UKEY, 16
+    @{[vle32_v $v11, ($UKEY)]}
+
+    @{[vmv_v_v $v12, $v10]}
+    @{[vaeskf2_vi $v12, $v11, 2]}
+    @{[vmv_v_v $v13, $v11]}
+    @{[vaeskf2_vi $v13, $v12, 3]}
+    @{[vmv_v_v $v14, $v12]}
+    @{[vaeskf2_vi $v14, $v13, 4]}
+    @{[vmv_v_v $v15, $v13]}
+    @{[vaeskf2_vi $v15, $v14, 5]}
+    @{[vmv_v_v $v16, $v14]}
+    @{[vaeskf2_vi $v16, $v15, 6]}
+    @{[vmv_v_v $v17, $v15]}
+    @{[vaeskf2_vi $v17, $v16, 7]}
+    @{[vmv_v_v $v18, $v16]}
+    @{[vaeskf2_vi $v18, $v17, 8]}
+    @{[vmv_v_v $v19, $v17]}
+    @{[vaeskf2_vi $v19, $v18, 9]}
+    @{[vmv_v_v $v20, $v18]}
+    @{[vaeskf2_vi $v20, $v19, 10]}
+    @{[vmv_v_v $v21, $v19]}
+    @{[vaeskf2_vi $v21, $v20, 11]}
+    @{[vmv_v_v $v22, $v20]}
+    @{[vaeskf2_vi $v22, $v21, 12]}
+    @{[vmv_v_v $v23, $v21]}
+    @{[vaeskf2_vi $v23, $v22, 13]}
+    @{[vmv_v_v $v24, $v22]}
+    @{[vaeskf2_vi $v24, $v23, 14]}
+
+    @{[vse32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v20, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v21, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v22, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v23, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vse32_v $v24, ($KEYP)]}
+
+    li a0, 1
+    ret
+.size L_set_key_256,.-L_set_key_256
+___
+}
+
+################################################################################
+# void rv64i_zvkned_encrypt(const unsigned char *in, unsigned char *out,
+#                           const AES_KEY *key);
+{
+my ($INP,$OUTP,$KEYP) = ("a0", "a1", "a2");
+my ($T0,$T1, $rounds, $T6) = ("a3", "a4", "t5", "t6");
+my ($v0,  $v1,  $v2,  $v3,  $v4,  $v5,  $v6,
+          $v7,  $v8,  $v9,  $v10, $v11, $v12,
+          $v13, $v14, $v15, $v16, $v17, $v18,
+          $v19, $v20, $v21, $v22, $v23, $v24,
+) = map("v$_",(0..24));
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_encrypt
+.type rv64i_zvkned_encrypt,\@function
+rv64i_zvkned_encrypt:
+    # Load number of rounds
+    lwu     $rounds, 240($KEYP)
+
+    # Get proper routine for key size
+    li $T6, 14
+    beq $rounds, $T6, L_enc_256
+    li $T6, 10
+    beq $rounds, $T6, L_enc_128
+
+    j L_fail_m2
+.size rv64i_zvkned_encrypt,.-rv64i_zvkned_encrypt
+___
+
+$code .= <<___;
+.p2align 3
+L_enc_128:
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    @{[vle32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v20, ($KEYP)]}
+
+    @{[vle32_v $v1, ($INP)]}
+
+    @{[vaesz_vs $v1, $v10]}    # with round key w[ 0, 3]
+    @{[vaesem_vs $v1, $v11]}   # with round key w[ 4, 7]
+    @{[vaesem_vs $v1, $v12]}   # with round key w[ 8,11]
+    @{[vaesem_vs $v1, $v13]}   # with round key w[12,15]
+    @{[vaesem_vs $v1, $v14]}   # with round key w[16,19]
+    @{[vaesem_vs $v1, $v15]}   # with round key w[20,23]
+    @{[vaesem_vs $v1, $v16]}   # with round key w[24,27]
+    @{[vaesem_vs $v1, $v17]}   # with round key w[28,31]
+    @{[vaesem_vs $v1, $v18]}   # with round key w[32,35]
+    @{[vaesem_vs $v1, $v19]}   # with round key w[36,39]
+    @{[vaesef_vs $v1, $v20]}   # with round key w[40,43]
+
+    @{[vse32_v $v1, ($OUTP)]}
+
+    ret
+.size L_enc_128,.-L_enc_128
+___
+
+$code .= <<___;
+.p2align 3
+L_enc_256:
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    @{[vle32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v20, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v21, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v22, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v23, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v24, ($KEYP)]}
+
+    @{[vle32_v $v1, ($INP)]}
+
+    @{[vaesz_vs $v1, $v10]}     # with round key w[ 0, 3]
+    @{[vaesem_vs $v1, $v11]}
+    @{[vaesem_vs $v1, $v12]}
+    @{[vaesem_vs $v1, $v13]}
+    @{[vaesem_vs $v1, $v14]}
+    @{[vaesem_vs $v1, $v15]}
+    @{[vaesem_vs $v1, $v16]}
+    @{[vaesem_vs $v1, $v17]}
+    @{[vaesem_vs $v1, $v18]}
+    @{[vaesem_vs $v1, $v19]}
+    @{[vaesem_vs $v1, $v20]}
+    @{[vaesem_vs $v1, $v21]}
+    @{[vaesem_vs $v1, $v22]}
+    @{[vaesem_vs $v1, $v23]}
+    @{[vaesef_vs $v1, $v24]}
+
+    @{[vse32_v $v1, ($OUTP)]}
+    ret
+.size L_enc_256,.-L_enc_256
+___
+}
+
+################################################################################
+# void rv64i_zvkned_decrypt(const unsigned char *in, unsigned char *out,
+#                           const AES_KEY *key);
+{
+my ($INP,$OUTP,$KEYP) = ("a0", "a1", "a2");
+my ($T0,$T1, $rounds, $T6) = ("a3", "a4", "t5", "t6");
+my ($v0,  $v1,  $v2,  $v3,  $v4,  $v5,  $v6,
+          $v7,  $v8,  $v9,  $v10, $v11, $v12,
+          $v13, $v14, $v15, $v16, $v17, $v18,
+          $v19, $v20, $v21, $v22, $v23, $v24,
+) = map("v$_",(0..24));
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvkned_decrypt
+.type rv64i_zvkned_decrypt,\@function
+rv64i_zvkned_decrypt:
+    # Load number of rounds
+    lwu     $rounds, 240($KEYP)
+
+    # Get proper routine for key size
+    li $T6, 14
+    beq $rounds, $T6, L_dec_256
+    li $T6, 10
+    beq $rounds, $T6, L_dec_128
+
+    j L_fail_m2
+.size rv64i_zvkned_decrypt,.-rv64i_zvkned_decrypt
+___
+
+$code .= <<___;
+.p2align 3
+L_dec_128:
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    @{[vle32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v20, ($KEYP)]}
+
+    @{[vle32_v $v1, ($INP)]}
+
+    @{[vaesz_vs $v1, $v20]}    # with round key w[43,47]
+    @{[vaesdm_vs $v1, $v19]}   # with round key w[36,39]
+    @{[vaesdm_vs $v1, $v18]}   # with round key w[32,35]
+    @{[vaesdm_vs $v1, $v17]}   # with round key w[28,31]
+    @{[vaesdm_vs $v1, $v16]}   # with round key w[24,27]
+    @{[vaesdm_vs $v1, $v15]}   # with round key w[20,23]
+    @{[vaesdm_vs $v1, $v14]}   # with round key w[16,19]
+    @{[vaesdm_vs $v1, $v13]}   # with round key w[12,15]
+    @{[vaesdm_vs $v1, $v12]}   # with round key w[ 8,11]
+    @{[vaesdm_vs $v1, $v11]}   # with round key w[ 4, 7]
+    @{[vaesdf_vs $v1, $v10]}   # with round key w[ 0, 3]
+
+    @{[vse32_v $v1, ($OUTP)]}
+
+    ret
+.size L_dec_128,.-L_dec_128
+___
+
+$code .= <<___;
+.p2align 3
+L_dec_256:
+    @{[vsetivli__x0_4_e32_m1_ta_ma]}
+
+    @{[vle32_v $v10, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v11, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v12, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v13, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v14, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v15, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v16, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v17, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v18, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v19, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v20, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v21, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v22, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v23, ($KEYP)]}
+    addi $KEYP, $KEYP, 16
+    @{[vle32_v $v24, ($KEYP)]}
+
+    @{[vle32_v $v1, ($INP)]}
+
+    @{[vaesz_vs $v1, $v24]}    # with round key w[56,59]
+    @{[vaesdm_vs $v1, $v23]}   # with round key w[52,55]
+    @{[vaesdm_vs $v1, $v22]}   # with round key w[48,51]
+    @{[vaesdm_vs $v1, $v21]}   # with round key w[44,47]
+    @{[vaesdm_vs $v1, $v20]}   # with round key w[40,43]
+    @{[vaesdm_vs $v1, $v19]}   # with round key w[36,39]
+    @{[vaesdm_vs $v1, $v18]}   # with round key w[32,35]
+    @{[vaesdm_vs $v1, $v17]}   # with round key w[28,31]
+    @{[vaesdm_vs $v1, $v16]}   # with round key w[24,27]
+    @{[vaesdm_vs $v1, $v15]}   # with round key w[20,23]
+    @{[vaesdm_vs $v1, $v14]}   # with round key w[16,19]
+    @{[vaesdm_vs $v1, $v13]}   # with round key w[12,15]
+    @{[vaesdm_vs $v1, $v12]}   # with round key w[ 8,11]
+    @{[vaesdm_vs $v1, $v11]}   # with round key w[ 4, 7]
+    @{[vaesdf_vs $v1, $v10]}   # with round key w[ 0, 3]
+
+    @{[vse32_v $v1, ($OUTP)]}
+
+    ret
+.size L_dec_256,.-L_dec_256
+___
+}
+
+$code .= <<___;
+L_fail_m1:
+    li a0, -1
+    ret
+.size L_fail_m1,.-L_fail_m1
+
+L_fail_m2:
+    li a0, -2
+    ret
+.size L_fail_m2,.-L_fail_m2
+___
+
+print $code;
+
+close STDOUT or die "error closing STDOUT: $!";