Message ID | 20220803171536.1314717-2-Jason@zx2c4.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v6,1/2] target/s390x: support PRNO_TRNG instruction | expand |
On 03.08.22 19:15, Jason A. Donenfeld wrote: > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > special instructions. So implement those. > > The implementation began as something TweetNacl-like, and then was > adjusted to be useful here. It's not very beautiful, but it is quite > short and compact, which is what we're going for. > NIT: we could think about reversing the order of patches. IIRC, patch #1 itself would trigger a warning when starting QEMU. Having this patch first make sense logically. > Cc: Thomas Huth <thuth@redhat.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Christian Borntraeger <borntraeger@linux.ibm.com> > Cc: Richard Henderson <richard.henderson@linaro.org> > Cc: Cornelia Huck <cohuck@redhat.com> > Cc: Harald Freudenberger <freude@linux.ibm.com> > Cc: Holger Dengler <dengler@linux.ibm.com> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > target/s390x/gen-features.c | 2 + > target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++ > 2 files changed, 159 insertions(+) > > diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c > index 3d333e2789..b6d804fa6d 100644 > --- a/target/s390x/gen-features.c > +++ b/target/s390x/gen-features.c > @@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = { > S390_FEAT_VECTOR_ENH2, > S390_FEAT_MSA_EXT_5, > S390_FEAT_PRNO_TRNG, > + S390_FEAT_KIMD_SHA_512, > + S390_FEAT_KLMD_SHA_512, > }; > > /****** END FEATURE DEFS ******/ > diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c > index 8ad4ef1ace..bb4823107c 100644 > --- a/target/s390x/tcg/crypto_helper.c > +++ b/target/s390x/tcg/crypto_helper.c > @@ -1,10 +1,12 @@ > /* > * s390x crypto helpers > * > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > * Copyright (c) 2017 Red Hat Inc > * > * Authors: > * David Hildenbrand <david@redhat.com> > + * Jason A. Donenfeld <Jason@zx2c4.com> > * > * This work is licensed under the terms of the GNU GPL, version 2 or later. > * See the COPYING file in the top-level directory. > @@ -19,6 +21,153 @@ > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > > +static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); } > +static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); } > +static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); } > +static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); } > +static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); } > +static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); } > +static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); } > + > +static const uint64_t K[80] = { > + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, > + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, > + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, > + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, > + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, > + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, > + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, > + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, > + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, > + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, > + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, > + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, > + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, > + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, > + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, > + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, > + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, > + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, > + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, > + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, > + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, > + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, > + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, > + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, > + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, > + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, > + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL > +}; > + > +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > +{ > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ I'd just use a #define outside of the function for that. > + uint64_t z[8], b[8], a[8], w[16], t; > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; > + int i, j, reg_len = 64, blocks = 0, cc = 0; > + > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } I'd call that message_reg_len. (same in other function) > + > + for (i = 0; i < 8; ++i) { > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); I assume if we get any exception here, we simply didn't make any progress. > + } > + > + while (len >= 128) { > + if (++blocks > MAX_BLOCKS_PER_RUN) { > + cc = 3; > + break; > + } > + > + for (i = 0; i < 16; ++i) { > + if (message) { > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); dito > + } else { > + w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]); > + } > + } > + > + for (i = 0; i < 80; ++i) { > + for (j = 0; j < 8; ++j) { > + b[j] = a[j]; > + } > + t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16]; > + b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]); > + b[3] += t; > + for (j = 0; j < 8; ++j) { > + a[(j + 1) % 8] = b[j]; > + } > + if (i % 16 == 15) { > + for (j = 0; j < 16; ++j) { > + w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]); > + } > + } > + } > + > + for (i = 0; i < 8; ++i) { > + a[i] += z[i]; > + z[i] = a[i]; > + } > + > + if (message) { > + message += 128; > + } else { > + stack_buffer += 128; > + } > + len -= 128; > + processed += 128; > + } > + > + for (i = 0; i < 8; ++i) { > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); I wonder what happens if we get an exception somewhere in the middle here ... fortunately we can only involve 2 pages. > + } > + > + if (message_reg) { > + *message_reg = deposit64(*message_reg, 0, reg_len, message); > + } > + *len_reg -= processed; > + return cc; > +} > + > +static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg) > +{ > + uint8_t x[256]; > + uint64_t i, message, len; > + int j, reg_len = 64, cc; > + > + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > + if (cc) { > + return cc; > + } Doesn't kimd_sha512() update the length register? And if we return with cc=3, we'd be in trouble, no? One idea could be to simply only process one block at a time. Read all inputs first for that block and handle it completely without any register modifications. Perform all memory writes in a single call. Further, I wonder if we should factor out the core of kimd_sha512() to only work on temp buffers without any loading/storing of memory, and let only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much cleaner who modifies what. If you run out if ideas, I can give it a shot next week to see if I can clean handling up a bit..
Hi David, On Fri, Aug 05, 2022 at 01:28:18PM +0200, David Hildenbrand wrote: > On 03.08.22 19:15, Jason A. Donenfeld wrote: > > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > > special instructions. So implement those. > > > > The implementation began as something TweetNacl-like, and then was > > adjusted to be useful here. It's not very beautiful, but it is quite > > short and compact, which is what we're going for. > > > > NIT: we could think about reversing the order of patches. IIRC, patch #1 > itself would trigger a warning when starting QEMU. Having this patch > first make sense logically. Good idea. Will do. > > +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > > +{ > > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ > > I'd just use a #define outside of the function for that. Why? What does leaking this into file-level scope do? > > > + uint64_t z[8], b[8], a[8], w[16], t; > > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; > > + int i, j, reg_len = 64, blocks = 0, cc = 0; > > + > > + if (!(env->psw.mask & PSW_MASK_64)) { > > + len = (uint32_t)len; > > + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > > + } > > > I'd call that message_reg_len. (same in other function) Will do. > > > > + > > + for (i = 0; i < 8; ++i) { > > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); > > I assume if we get any exception here, we simply didn't make any progress. > > > + } > > + > > + while (len >= 128) { > > + if (++blocks > MAX_BLOCKS_PER_RUN) { > > + cc = 3; > > + break; > > + } > > + > > + for (i = 0; i < 16; ++i) { > > + if (message) { > > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); > > dito Right, there's no progress, because it's only ever incremented at the end. And, more importantly, we only ever update the parameter_block after having done things successfully. > > + for (i = 0; i < 8; ++i) { > > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); > > I wonder what happens if we get an exception somewhere in the middle > here ... fortunately we can only involve 2 pages. If this fails, then message_reg and len_reg won't be updated, so it will have to start over. If it fails part way through, though, then things are inconsistent. I don't think we want to hassle with trying to restore the previous state or something insane though. That seems a bit much. > > + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > > + if (cc) { > > + return cc; > > + } > > Doesn't kimd_sha512() update the length register? And if we return with > cc=3, we'd be in trouble, no? cc=3 means partial completion. In that case, klmd also returns with a partial completion. That's good and expected! It means that the next time it's called, it'll keep going where it left off. I've actually tried this with the Linux implementation, and it works as expected. > One idea could be to simply only process one block at a time. Read all > inputs first for that block and handle it completely without any > register modifications. Perform all memory writes in a single call. That *is* what already happens. Actually, the memory writes only ever happen at the very end of kimd_sha512. > Further, I wonder if we should factor out the core of kimd_sha512() to > only work on temp buffers without any loading/storing of memory, and let > only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much > cleaner who modifies what. That's not necessary and will complicate things ultimately. See the above; this is already working as expected. Jason
On 05.08.22 15:01, Jason A. Donenfeld wrote: > Hi David, > > On Fri, Aug 05, 2022 at 01:28:18PM +0200, David Hildenbrand wrote: >> On 03.08.22 19:15, Jason A. Donenfeld wrote: >>> In order to fully support MSA_EXT_5, we have to also support the SHA-512 >>> special instructions. So implement those. >>> >>> The implementation began as something TweetNacl-like, and then was >>> adjusted to be useful here. It's not very beautiful, but it is quite >>> short and compact, which is what we're going for. >>> >> >> NIT: we could think about reversing the order of patches. IIRC, patch #1 >> itself would trigger a warning when starting QEMU. Having this patch >> first make sense logically. > > Good idea. Will do. > >>> +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, >>> + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) >>> +{ >>> + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ >> >> I'd just use a #define outside of the function for that. > > Why? What does leaking this into file-level scope do? > I'd say common coding practice in QEMU, but I might be wrong ;) >> >>> + uint64_t z[8], b[8], a[8], w[16], t; >>> + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; >>> + int i, j, reg_len = 64, blocks = 0, cc = 0; >>> + >>> + if (!(env->psw.mask & PSW_MASK_64)) { >>> + len = (uint32_t)len; >>> + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; >>> + } >> [...] > >>> + for (i = 0; i < 8; ++i) { >>> + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); >> >> I wonder what happens if we get an exception somewhere in the middle >> here ... fortunately we can only involve 2 pages. > > If this fails, then message_reg and len_reg won't be updated, so it will > have to start over. If it fails part way through, though, then things > are inconsistent. I don't think we want to hassle with trying to restore > the previous state or something insane though. That seems a bit much. Okay, but there could be scenarios where we mess up? > >>> + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); >>> + if (cc) { >>> + return cc; >>> + } >> >> Doesn't kimd_sha512() update the length register? And if we return with >> cc=3, we'd be in trouble, no? > > cc=3 means partial completion. In that case, klmd also returns with a > partial completion. That's good and expected! It means that the next > time it's called, it'll keep going where it left off. > > I've actually tried this with the Linux implementation, and it works as > expected. > >> One idea could be to simply only process one block at a time. Read all >> inputs first for that block and handle it completely without any >> register modifications. Perform all memory writes in a single call. > > That *is* what already happens. Actually, the memory writes only ever > happen at the very end of kimd_sha512. > >> Further, I wonder if we should factor out the core of kimd_sha512() to >> only work on temp buffers without any loading/storing of memory, and let >> only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much >> cleaner who modifies what. > > That's not necessary and will complicate things ultimately. See the > above; this is already working as expected. I'll have a closer look and see if I might improve it in the upcomming weeks. I'll be on vacation for ~1.5 weeks. And as history has shown, I need some days afterwards to dig through my overflowing mailbox :)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c index 3d333e2789..b6d804fa6d 100644 --- a/target/s390x/gen-features.c +++ b/target/s390x/gen-features.c @@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = { S390_FEAT_VECTOR_ENH2, S390_FEAT_MSA_EXT_5, S390_FEAT_PRNO_TRNG, + S390_FEAT_KIMD_SHA_512, + S390_FEAT_KLMD_SHA_512, }; /****** END FEATURE DEFS ******/ diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c index 8ad4ef1ace..bb4823107c 100644 --- a/target/s390x/tcg/crypto_helper.c +++ b/target/s390x/tcg/crypto_helper.c @@ -1,10 +1,12 @@ /* * s390x crypto helpers * + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * Copyright (c) 2017 Red Hat Inc * * Authors: * David Hildenbrand <david@redhat.com> + * Jason A. Donenfeld <Jason@zx2c4.com> * * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. @@ -19,6 +21,153 @@ #include "exec/exec-all.h" #include "exec/cpu_ldst.h" +static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); } +static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); } +static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); } +static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); } +static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); } +static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); } +static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); } + +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) +{ + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ + uint64_t z[8], b[8], a[8], w[16], t; + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; + int i, j, reg_len = 64, blocks = 0, cc = 0; + + if (!(env->psw.mask & PSW_MASK_64)) { + len = (uint32_t)len; + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; + } + + for (i = 0; i < 8; ++i) { + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); + } + + while (len >= 128) { + if (++blocks > MAX_BLOCKS_PER_RUN) { + cc = 3; + break; + } + + for (i = 0; i < 16; ++i) { + if (message) { + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); + } else { + w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]); + } + } + + for (i = 0; i < 80; ++i) { + for (j = 0; j < 8; ++j) { + b[j] = a[j]; + } + t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16]; + b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]); + b[3] += t; + for (j = 0; j < 8; ++j) { + a[(j + 1) % 8] = b[j]; + } + if (i % 16 == 15) { + for (j = 0; j < 16; ++j) { + w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]); + } + } + } + + for (i = 0; i < 8; ++i) { + a[i] += z[i]; + z[i] = a[i]; + } + + if (message) { + message += 128; + } else { + stack_buffer += 128; + } + len -= 128; + processed += 128; + } + + for (i = 0; i < 8; ++i) { + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); + } + + if (message_reg) { + *message_reg = deposit64(*message_reg, 0, reg_len, message); + } + *len_reg -= processed; + return cc; +} + +static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, + uint64_t *message_reg, uint64_t *len_reg) +{ + uint8_t x[256]; + uint64_t i, message, len; + int j, reg_len = 64, cc; + + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); + if (cc) { + return cc; + } + + message = *message_reg; + len = *len_reg; + if (!(env->psw.mask & PSW_MASK_64)) { + len = (uint32_t)len; + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; + } + + for (i = 0; i < len; ++i) { + x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra); + } + memset(x + i, 0, sizeof(x) - i); + x[i] = 128; + i = i < 112 ? 128 : 256; + for (j = 0; j < 16; ++j) { + x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra); + } + if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) { + g_assert_not_reached(); /* It must handle at least 2 blocks. */ + } + *message_reg = deposit64(*message_reg, 0, reg_len, message + len); + *len_reg -= len; + return 0; +} + static void fill_buf_random(CPUS390XState *env, uintptr_t ra, uint64_t *buf_reg, uint64_t *len_reg) { @@ -78,6 +227,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, cpu_stb_data_ra(env, param_addr, subfunc[i], ra); } break; + case 3: /* CPACF_*_SHA_512 */ + switch (type) { + case S390_FEAT_TYPE_KIMD: + return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL); + case S390_FEAT_TYPE_KLMD: + return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]); + } + break; case 114: /* CPACF_PRNO_TRNG */ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
In order to fully support MSA_EXT_5, we have to also support the SHA-512 special instructions. So implement those. The implementation began as something TweetNacl-like, and then was adjusted to be useful here. It's not very beautiful, but it is quite short and compact, which is what we're going for. Cc: Thomas Huth <thuth@redhat.com> Cc: David Hildenbrand <david@redhat.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Richard Henderson <richard.henderson@linaro.org> Cc: Cornelia Huck <cohuck@redhat.com> Cc: Harald Freudenberger <freude@linux.ibm.com> Cc: Holger Dengler <dengler@linux.ibm.com> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- target/s390x/gen-features.c | 2 + target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+)