diff mbox series

[v7,1/2] target/s390x: support SHA-512 extensions

Message ID 20220809150331.84296-1-Jason@zx2c4.com (mailing list archive)
State New, archived
Headers show
Series [v7,1/2] target/s390x: support SHA-512 extensions | expand

Commit Message

Jason A. Donenfeld Aug. 9, 2022, 3:03 p.m. UTC
In order to fully support MSA_EXT_5, we have to support the SHA-512
special instructions. So implement those.

The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.

Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 target/s390x/gen-features.c      |   3 +
 target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)

Comments

Thomas Huth Aug. 26, 2022, 10:21 a.m. UTC | #1
Finally, I'm also having some spare minutes to have a look on this ... 
First, thank you for your work here, it's very appreciated! Some more 
comments inline below (mostly cosmetics since I'm not very much into this 
crypto stuff)...

On 09/08/2022 17.03, Jason A. Donenfeld wrote:
> In order to fully support MSA_EXT_5, we have to support the SHA-512
> special instructions. So implement those.
> 
> The implementation began as something TweetNacl-like, and then was
> adjusted to be useful here. It's not very beautiful, but it is quite
> short and compact, which is what we're going for.
 >
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>   target/s390x/gen-features.c      |   3 +
>   target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++
>   2 files changed, 160 insertions(+)

If you've got some spare time, it would be great to have a test for the new 
functions in the tests/tcg/s390x/ folder, too (but otherwise we can still 
add them later).

> diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
> index ad140184b9..85ab69d04e 100644
> --- a/target/s390x/gen-features.c
> +++ b/target/s390x/gen-features.c
> @@ -749,6 +749,9 @@ static uint16_t qemu_V7_0[] = {
>    */
>   static uint16_t qemu_MAX[] = {
>       S390_FEAT_VECTOR_ENH2,
> +    S390_FEAT_MSA_EXT_5,
> +    S390_FEAT_KIMD_SHA_512,
> +    S390_FEAT_KLMD_SHA_512,
>   };

I think we likely have to fence the bits off for older machine type 
versions, like it has been done in commit 4f9b6c7ddb2 for example. However, 
the patch for the new 7.2 machine type is not merged yet (but I've queued it 
on https://gitlab.com/thuth/qemu/-/commits/s390x-next/ ), so you either have 
to pick that manually into your branch, or we fix it up later (which would 
be ok for me, too).

>   /****** END FEATURE DEFS ******/
> diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
> index 138d9e7ad9..4d45de8faa 100644
> --- a/target/s390x/tcg/crypto_helper.c
> +++ b/target/s390x/tcg/crypto_helper.c
> @@ -1,10 +1,12 @@
>   /*
>    *  s390x crypto helpers
>    *
> + *  Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.

Please drop the "All rights reserved" ... it does not have any legal meaning 
anymore, and also sounds weird in the Open Source context. See:

  https://en.wikipedia.org/wiki/All_rights_reserved#Obsolescence

>    *  Copyright (c) 2017 Red Hat Inc
>    *
>    *  Authors:
>    *   David Hildenbrand <david@redhat.com>
> + *   Jason A. Donenfeld <Jason@zx2c4.com>
>    *
>    * This work is licensed under the terms of the GNU GPL, version 2 or later.
>    * See the COPYING file in the top-level directory.
> @@ -18,6 +20,153 @@
>   #include "exec/exec-all.h"
>   #include "exec/cpu_ldst.h"
>   
> +static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
> +static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
> +static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
> +static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
> +static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
> +static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
> +static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
> +
> +static const uint64_t K[80] = {
> +    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
> +    0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
> +    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
> +    0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
> +    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
> +    0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
> +    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
> +    0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
> +    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
> +    0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
> +    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
> +    0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
> +    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
> +    0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
> +    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
> +    0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
> +    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
> +    0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
> +    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
> +    0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
> +    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
> +    0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
> +    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
> +    0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
> +    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
> +    0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
> +    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
> +};
> +
> +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
> +                       uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
> +{
> +    enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
> +    uint64_t z[8], b[8], a[8], w[16], t;
> +    uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;

The line is very long, could you please declare message and len on separate 
lines?

> +    int i, j, message_reg_len = 64, blocks = 0, cc = 0;
> +
> +    if (!(env->psw.mask & PSW_MASK_64)) {
> +        len = (uint32_t)len;
> +        message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
> +    }
> +
> +    for (i = 0; i < 8; ++i) {
> +        z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);

Quite a long line again, maybe split it like this:

       abi_ptr addr = wrap_address(env, parameter_block + 8 * i);
       z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra);

> +    }
> +
> +    while (len >= 128) {
> +        if (++blocks > MAX_BLOCKS_PER_RUN) {
> +            cc = 3;
> +            break;
> +        }
> +
> +        for (i = 0; i < 16; ++i) {
> +            if (message) {
> +                w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);

Long line again, please split.

> +            } else {
> +                w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
> +            }
> +        }
> +
> +        for (i = 0; i < 80; ++i) {
> +            for (j = 0; j < 8; ++j) {
> +                b[j] = a[j];
> +            }
> +            t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
> +            b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
> +            b[3] += t;
> +            for (j = 0; j < 8; ++j) {
> +                a[(j + 1) % 8] = b[j];
> +            }
> +            if (i % 16 == 15) {
> +                for (j = 0; j < 16; ++j) {
> +                    w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
> +                }
> +            }
> +        }
> +
> +        for (i = 0; i < 8; ++i) {
> +            a[i] += z[i];
> +            z[i] = a[i];
> +        }
> +
> +        if (message) {
> +            message += 128;
> +        } else {
> +            stack_buffer += 128;
> +        }
> +        len -= 128;
> +        processed += 128;
> +    }
> +
> +    for (i = 0; i < 8; ++i) {
> +        cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
> +    }
> +
> +    if (message_reg) {
> +        *message_reg = deposit64(*message_reg, 0, message_reg_len, message);
> +    }
> +    *len_reg -= processed;
> +    return cc;
> +}
> +
> +static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
> +                        uint64_t *message_reg, uint64_t *len_reg)
> +{
> +    uint8_t x[256];
> +    uint64_t i, message, len;
> +    int j, message_reg_len = 64, cc;
> +
> +    cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
> +    if (cc) {
> +        return cc;
> +    }
> +
> +    message = *message_reg;
> +    len = *len_reg;
> +    if (!(env->psw.mask & PSW_MASK_64)) {
> +        len = (uint32_t)len;
> +        message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
> +    }
> +
> +    for (i = 0; i < len; ++i) {
> +        x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
> +    }
> +    memset(x + i, 0, sizeof(x) - i);
> +    x[i] = 128;
> +    i = i < 112 ? 128 : 256;
> +    for (j = 0; j < 16; ++j) {
> +        x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
> +    }
> +    if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) {
> +        g_assert_not_reached(); /* It must handle at least 2 blocks. */
> +    }
> +    *message_reg = deposit64(*message_reg, 0, message_reg_len, message + len);
> +    *len_reg -= len;
> +    return 0;
> +}
> +
>   uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
>                        uint32_t type)
>   {
> @@ -52,6 +201,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
>               cpu_stb_data_ra(env, param_addr, subfunc[i], ra);

So for KIMD and KLMD, I think we now have to set the bit that corresponds to 
SHA-512 in the query status information, too? Otherwise the guest might not 
use the function if it thinks that it is not available?

>           }
>           break;
> +    case 3: /* CPACF_*_SHA_512 */
> +        switch (type) {
> +        case S390_FEAT_TYPE_KIMD:
> +            return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
> +        case S390_FEAT_TYPE_KLMD:
> +            return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
> +        }
> +        break;
>       default:
>           /* we don't implement any other subfunction yet */
>           g_assert_not_reached();

  Thomas
Jason A. Donenfeld Aug. 29, 2022, 4:27 p.m. UTC | #2
On Fri, Aug 26, 2022 at 12:21:36PM +0200, Thomas Huth wrote:
> > + *  Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> 
> Please drop the "All rights reserved" ... it does not have any legal meaning 

No.

> > +{
> > +    enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
> > +    uint64_t z[8], b[8], a[8], w[16], t;
> > +    uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
> 
> The line is very long, could you please declare message and len on separate 
> lines?

Will do.

> 
> > +    int i, j, message_reg_len = 64, blocks = 0, cc = 0;
> > +
> > +    if (!(env->psw.mask & PSW_MASK_64)) {
> > +        len = (uint32_t)len;
> > +        message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
> > +    }
> > +
> > +    for (i = 0; i < 8; ++i) {
> > +        z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
> 
> Quite a long line again, maybe split it like this:
> 
>        abi_ptr addr = wrap_address(env, parameter_block + 8 * i);
>        z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra);

Sure.

> 
> > +    }
> > +
> > +    while (len >= 128) {
> > +        if (++blocks > MAX_BLOCKS_PER_RUN) {
> > +            cc = 3;
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < 16; ++i) {
> > +            if (message) {
> > +                w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
> 
> Long line again, please split.

Okay.

> >               cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
> 
> So for KIMD and KLMD, I think we now have to set the bit that corresponds to 
> SHA-512 in the query status information, too? Otherwise the guest might not 
> use the function if it thinks that it is not available?

That's already taken care of generically I think. This works fine from
Linux's autodetection.

Jason
diff mbox series

Patch

diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..85ab69d04e 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,9 @@  static uint16_t qemu_V7_0[] = {
  */
 static uint16_t qemu_MAX[] = {
     S390_FEAT_VECTOR_ENH2,
+    S390_FEAT_MSA_EXT_5,
+    S390_FEAT_KIMD_SHA_512,
+    S390_FEAT_KLMD_SHA_512,
 };
 
 /****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..4d45de8faa 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -1,10 +1,12 @@ 
 /*
  *  s390x crypto helpers
  *
+ *  Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  *  Copyright (c) 2017 Red Hat Inc
  *
  *  Authors:
  *   David Hildenbrand <david@redhat.com>
+ *   Jason A. Donenfeld <Jason@zx2c4.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -18,6 +20,153 @@ 
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+    0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+    0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+    0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+    0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+    0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+    0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+    0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+    0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+    0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+    0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+    0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+    0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+    0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+                       uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+    enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
+    uint64_t z[8], b[8], a[8], w[16], t;
+    uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
+    int i, j, message_reg_len = 64, blocks = 0, cc = 0;
+
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        len = (uint32_t)len;
+        message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+    }
+
+    for (i = 0; i < 8; ++i) {
+        z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+    }
+
+    while (len >= 128) {
+        if (++blocks > MAX_BLOCKS_PER_RUN) {
+            cc = 3;
+            break;
+        }
+
+        for (i = 0; i < 16; ++i) {
+            if (message) {
+                w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
+            } else {
+                w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+            }
+        }
+
+        for (i = 0; i < 80; ++i) {
+            for (j = 0; j < 8; ++j) {
+                b[j] = a[j];
+            }
+            t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+            b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+            b[3] += t;
+            for (j = 0; j < 8; ++j) {
+                a[(j + 1) % 8] = b[j];
+            }
+            if (i % 16 == 15) {
+                for (j = 0; j < 16; ++j) {
+                    w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
+                }
+            }
+        }
+
+        for (i = 0; i < 8; ++i) {
+            a[i] += z[i];
+            z[i] = a[i];
+        }
+
+        if (message) {
+            message += 128;
+        } else {
+            stack_buffer += 128;
+        }
+        len -= 128;
+        processed += 128;
+    }
+
+    for (i = 0; i < 8; ++i) {
+        cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+    }
+
+    if (message_reg) {
+        *message_reg = deposit64(*message_reg, 0, message_reg_len, message);
+    }
+    *len_reg -= processed;
+    return cc;
+}
+
+static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+                        uint64_t *message_reg, uint64_t *len_reg)
+{
+    uint8_t x[256];
+    uint64_t i, message, len;
+    int j, message_reg_len = 64, cc;
+
+    cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+    if (cc) {
+        return cc;
+    }
+
+    message = *message_reg;
+    len = *len_reg;
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        len = (uint32_t)len;
+        message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+    }
+
+    for (i = 0; i < len; ++i) {
+        x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
+    }
+    memset(x + i, 0, sizeof(x) - i);
+    x[i] = 128;
+    i = i < 112 ? 128 : 256;
+    for (j = 0; j < 16; ++j) {
+        x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+    }
+    if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) {
+        g_assert_not_reached(); /* It must handle at least 2 blocks. */
+    }
+    *message_reg = deposit64(*message_reg, 0, message_reg_len, message + len);
+    *len_reg -= len;
+    return 0;
+}
+
 uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
                      uint32_t type)
 {
@@ -52,6 +201,14 @@  uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
             cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
         }
         break;
+    case 3: /* CPACF_*_SHA_512 */
+        switch (type) {
+        case S390_FEAT_TYPE_KIMD:
+            return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+        case S390_FEAT_TYPE_KLMD:
+            return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+        }
+        break;
     default:
         /* we don't implement any other subfunction yet */
         g_assert_not_reached();