| Message ID | 20250115164657.84650-2-freude@linux.ibm.com (mailing list archive) |
|---|---|
| State | Rejected, archived |
| Delegated to: | Mikulas Patocka |
| Series | dm-integrity: Implement asynch digest support |
Hi

The ahash interface is slower than the shash interface for synchronous
implementations, so the patch basically slows down the common case. See
the upstream commit b76ad8844234bd0d394105d7d784cd05f1bf269a for an
explanation in dm-verity.

Do you have some benchmark that shows how much this helps on s390x, so
that we can evaluate whether the added complexity is worth the
performance improvement?

Mikulas

On Wed, 15 Jan 2025, Harald Freudenberger wrote:

> Use the async digest in-kernel crypto API instead of the
> synchronous digest API. This has the advantage of being able
> to use synchronous as well as asynchronous digest implementations,
> as the in-kernel API has an automatic wrapping mechanism
> to provide all synchronous digests via the async API.
>
> Tested with crc32, sha256, hmac-sha256 and the s390-specific
> implementations for hmac-sha256 and protected-key phmac-sha256.
>
> Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
> ---
>  drivers/md/dm-integrity.c | 220 ++++++++++++++++++++++++--------------
>  1 file changed, 138 insertions(+), 82 deletions(-)
>
> [quoted patch trimmed -- the full diff is reproduced near the end of this page]
On Wed, Jan 15, 2025 at 05:46:57PM +0100, Harald Freudenberger wrote:
> Use the async digest in-kernel crypto API instead of the
> synchronous digest API. This has the advantage of being able
> to use synchronous as well as asynchronous digest implementations,
> as the in-kernel API has an automatic wrapping mechanism
> to provide all synchronous digests via the async API.
>
> Tested with crc32, sha256, hmac-sha256 and the s390-specific
> implementations for hmac-sha256 and protected-key phmac-sha256.
>
> Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>

As Mikulas mentioned, this reduces performance for everyone else, which
is not great. It also makes the code more complicated.

I also see that you aren't actually using the algorithm in an async
manner, but rather waiting for it synchronously each time. Thus the
ability to operate asynchronously provides no benefit in this case, and
this change is purely about allowing a particular driver to be used,
presumably the s390 phmac one from your recent patchset. Since s390
phmac seems to be new code, and furthermore it is CPU-based and thus
uses virtual addresses (which makes the use of scatterlists entirely
pointless), wouldn't it be easier to just make it implement shash
instead of ahash, moving any wait that may be necessary into the driver
itself?

- Eric
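[Note: the calling-convention difference Eric describes boils down to the
sketch below. The crypto API calls are the real kernel interfaces; the two
wrapper functions themselves are illustrative only and appear in neither
the patch nor dm-integrity.]

    /* shash: stack-allocated descriptor, direct virtual-address I/O */
    static int digest_shash(struct crypto_shash *tfm,
                            const u8 *data, unsigned int len, u8 *out)
    {
            SHASH_DESC_ON_STACK(desc, tfm);

            desc->tfm = tfm;
            return crypto_shash_digest(desc, data, len, out);
    }

    /*
     * ahash: heap-allocated request, scatterlist input, explicit wait.
     * Note that data must not live on the stack, because it is mapped
     * through a scatterlist.
     */
    static int digest_ahash(struct crypto_ahash *tfm,
                            const u8 *data, unsigned int len, u8 *out)
    {
            struct ahash_request *req;
            struct scatterlist sg;
            DECLARE_CRYPTO_WAIT(wait);
            int r;

            req = ahash_request_alloc(tfm, GFP_KERNEL);
            if (unlikely(!req))
                    return -ENOMEM;
            ahash_request_set_callback(req, 0, crypto_req_done, &wait);
            sg_init_one(&sg, data, len);
            ahash_request_set_crypt(req, &sg, out, len);
            r = crypto_wait_req(crypto_ahash_digest(req), &wait);
            ahash_request_free(req);
            return r;
    }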
On 2025-01-15 18:37, Eric Biggers wrote:
> On Wed, Jan 15, 2025 at 05:46:57PM +0100, Harald Freudenberger wrote:
>> Use the async digest in-kernel crypto API instead of the
>> synchronous digest API. [...]
>>
>> Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
>
> As Mikulas mentioned, this reduces performance for everyone else, which
> is not great. It also makes the code more complicated.
>
> I also see that you aren't actually using the algorithm in an async
> manner, but rather waiting for it synchronously each time. Thus the
> ability to operate asynchronously provides no benefit in this case, and
> this change is purely about allowing a particular driver to be used,
> presumably the s390 phmac one from your recent patchset. Since s390
> phmac seems to be new code, and furthermore it is CPU-based and thus
> uses virtual addresses (which makes the use of scatterlists entirely
> pointless), wouldn't it be easier to just make it implement shash
> instead of ahash, moving any wait that may be necessary into the driver
> itself?
>
> - Eric

Thanks for this feedback. I'll give it a try with some performance
measurements. And I totally agree that a synchronous implementation of
phmac would have solved this also. But as you may see from Herbert Xu's
feedback on my first posts, which implemented phmac as an shash, that is
not an option. The thing is that we have to derive a hardware-based key
(pkey) from the given key material, and that may be a sleeping call,
which an shash must not invoke. So the phmac implementation is now an
ahash digest implementation, as suggested by Herbert.

You are right, my patch is not really asynchronous; it waits for
completion at the end of each function. However, as opposed to the shash
invocation, where there were several update() calls, the work is now
done in just one digest() call, giving the backing algorithm a chance to
hash all of this in one step (though it still needs to walk the
scatterlist).

Is there a way to have dm-integrity accept both an ahash and an shash
digest?
On Thu, Jan 16, 2025 at 08:33:46AM +0100, Harald Freudenberger wrote:
> Thanks for this feedback. I'll give it a try with some performance
> measurements. And I totally agree that a synchronous implementation of
> phmac would have solved this also. But as you may see from Herbert Xu's
> feedback on my first posts, which implemented phmac as an shash, that is
> not an option. The thing is that we have to derive a hardware-based key
> (pkey) from the given key material, and that may be a sleeping call,
> which an shash must not invoke. So the phmac implementation is now an
> ahash digest implementation, as suggested by Herbert.
>
> [...]
>
> Is there a way to have dm-integrity accept both an ahash and an shash
> digest?

To properly support async algorithms, the users (e.g. dm-integrity and
dm-verity) really would need to have separate code paths anyway. The
computation models are just too different.

But in this case, it seems you simply want it to be synchronous and use
virtual addresses. The quirks of ahash, including its need for
per-request allocations and scatterlists, make it a poor match here.
The only thing you are getting with it is, ironically, that it allows
you to wait synchronously. That could be done with shash too if it was
fixed to support algorithms that aren't atomic. E.g. there could be a
new CRYPTO_ALG_MAY_SLEEP flag that could be set in struct shash_alg to
indicate that the algorithm doesn't support atomic context, and a flag
could be passed to crypto_alloc_shash() to allow such an algorithm to be
selected (if the particular user never uses it in atomic context).

That would be faster and simpler than the proposed ahash based version.

- Eric
On 2025-01-16 09:03, Eric Biggers wrote:
> On Thu, Jan 16, 2025 at 08:33:46AM +0100, Harald Freudenberger wrote:
>> [...]
>
> To properly support async algorithms, the users (e.g. dm-integrity and
> dm-verity) really would need to have separate code paths anyway. The
> computation models are just too different.
>
> But in this case, it seems you simply want it to be synchronous and use
> virtual addresses. The quirks of ahash, including its need for
> per-request allocations and scatterlists, make it a poor match here.
> The only thing you are getting with it is, ironically, that it allows
> you to wait synchronously. That could be done with shash too if it was
> fixed to support algorithms that aren't atomic. E.g. there could be a
> new CRYPTO_ALG_MAY_SLEEP flag that could be set in struct shash_alg to
> indicate that the algorithm doesn't support atomic context, and a flag
> could be passed to crypto_alloc_shash() to allow such an algorithm to be
> selected (if the particular user never uses it in atomic context).
>
> That would be faster and simpler than the proposed ahash based version.
>
> - Eric

Eric, your idea has been whirling around in my head for at least three
months now. It would be great to have a way to offer an shash
implementation which clearly states that it is not for use in atomic
context, e.g. with a new flag as you describe. I'll work out something
in this direction and push it onto the crypto mailing list. :-)
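[Note: CRYPTO_ALG_MAY_SLEEP does not exist in the kernel; the sketch below
only illustrates the interface Eric proposes, with a made-up flag value and
an illustrative phmac registration. The exact plumbing of the flag through
crypto_alloc_shash() would be part of the proposed work.]

    #define CRYPTO_ALG_MAY_SLEEP    0x00400000      /* hypothetical flag */

    static struct shash_alg phmac_sha256_alg = {
            .digestsize     = SHA256_DIGEST_SIZE,
            /* .init/.update/.final would be allowed to sleep,
             * e.g. to derive the pkey from the key material */
            .base = {
                    .cra_name       = "phmac(sha256)",
                    .cra_flags      = CRYPTO_ALG_MAY_SLEEP, /* hypothetical */
            },
    };

    /* A user that never hashes in atomic context opts in explicitly;
     * without the flag, the allocator would refuse such algorithms. */
    struct crypto_shash *tfm;

    tfm = crypto_alloc_shash("phmac(sha256)", CRYPTO_ALG_MAY_SLEEP, 0);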
On Thu, Jan 16, 2025 at 10:00:36AM +0100, Harald Freudenberger wrote:
> Eric, your idea has been whirling around in my head for at least three
> months now. It would be great to have a way to offer an shash
> implementation which clearly states that it is not for use in atomic
> context, e.g. with a new flag as you describe. I'll work out something
> in this direction and push it onto the crypto mailing list. :-)

I don't like this flag because we'd have to modify every other hash
user. For example, this would result in your algorithm being unavailable
to IPsec despite it being an ahash user (because it can't sleep).

I think the dm-crypt patch should wait until after I have added virtual
address support to ahash. Then we could compare it properly with shash.

Thanks,
On Thu, Jan 16, 2025 at 05:12:57PM +0800, Herbert Xu wrote:
> I don't like this flag because we'd have to modify every other
> hash user.

But in practice it's the opposite. Making it an ahash forces users who
would otherwise just be using shash to use ahash instead.

> For example, this would result in your algorithm
> being unavailable to IPsec despite it being an ahash user (because
> it can't sleep).

No one is asking for it to be available to IPsec. And this is just a
MAC, not a cipher. But more generally, sleepable algorithms could still
be used via an adapter using cryptd, or by making IPsec support using a
workqueue like all the storage encryption/integrity systems already do.
In any case this would be a rare (or nonexistent?) use case and should
be treated as such.

> I think the dm-crypt patch should wait until after I have added
> virtual address support to ahash. Then we could compare it properly
> with shash.

That won't solve all the problems with ahash, for example the
per-request allocation. We'll still end up with something that is worse
for 99% of users, while also doing a very poor job supporting the 1%
(even assuming it's 1% and not 0%, which it very well might be) who
actually think they want off-CPU hardware crypto acceleration. Not to
mention the nonsense like having "synchronous asynchronous hashes".

I think it's time to admit that the approach you're trying to take with
the crypto API is wrong. This has been known for years. The primary API
needs to be designed around and optimized for software crypto, which is
what nearly everyone wants.

- Eric
On Thu, Jan 16, 2025 at 05:54:51PM +0000, Eric Biggers wrote:
> But in practice it's the opposite. Making it an ahash forces users who
> would otherwise just be using shash to use ahash instead.

I think that's a good thing. shash was a mistake and I intend to get rid
of it. But we should not do the ahash conversion right now. Let me
finish adding virtual address support first and then we could reassess
whether this makes sense or not.

> No one is asking for it to be available to IPsec. And this is just a
> MAC, not a cipher. But more generally, sleepable algorithms could still
> be used via an adapter using cryptd, or by making IPsec support using a
> workqueue like all the storage encryption/integrity systems already do.
> In any case this would be a rare (or nonexistent?) use case and should
> be treated as such.

If it was possible to shoehorn phmac/paes into a synchronous algorithm
I'd love to do it. But the fact is that this requires asynchronous
communication in the *unlikely* case, which means that the best way to
model it is with an optional async interface:

	if (unlikely(crypto_ahash_op(...) == -EINPROGRESS))
		do async
	else
		op completed synchronously

> That won't solve all the problems with ahash, for example the
> per-request allocation. We'll still end up with something that is worse
> for 99% of users, while also doing a very poor job supporting the 1%
> (even assuming it's 1% and not 0%, which it very well might be) who
> actually think they want off-CPU hardware crypto acceleration. Not to
> mention the nonsense like having "synchronous asynchronous hashes".

I disagree. I want the users to start feeding ahash with more than one
unit of input. So rather than splitting up a folio into page-size or
sector-size units, you feed the whole thing to ahash and the data should
be split up automatically and hashed in parallel.

> I think it's time to admit that the approach you're trying to take with
> the crypto API is wrong. This has been known for years.

The only thing I agree with you on here is that an SG list-based input
format is inappropriate. And that is why I'm adding virtual address
support to ahash (and later to skcipher).

> The primary API needs to be designed around and optimized for software
> crypto, which is what nearly everyone wants.

I agree with that completely. However, what triggered this thread is in
fact something that is not purely software crypto, because it involves
system-wide communication which could take seconds to complete. The
original patch tried to shoehorn this into a synchronous implementation
that would just fail randomly after waiting.

Cheers,
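[Note: a sketch of how the optional-async pattern Herbert describes looks
with the existing completion helpers from <crypto/hash.h> and
<linux/crypto.h>; request setup is elided and the surrounding code is
illustrative.]

    DECLARE_CRYPTO_WAIT(wait);
    int r;

    ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                               crypto_req_done, &wait);

    r = crypto_ahash_digest(req);
    switch (r) {
    case 0:
            /* completed synchronously -- the common, fast case */
            break;
    case -EINPROGRESS:      /* queued and running asynchronously */
    case -EBUSY:            /* backlogged; will be started later */
            /* crypto_wait_req() sleeps until crypto_req_done() fires */
            r = crypto_wait_req(r, &wait);
            break;
    default:
            /* immediate error */
            break;
    }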
On Fri, Jan 17, 2025 at 02:21:19PM +0800, Herbert Xu wrote:
> I think that's a good thing. shash was a mistake and I intend to
> get rid of it.

I intend to keep using it, as it's a much better API. Even if virtual
address support were to be added to ahash, shash will still be a simpler
interface that does not require dynamically allocated per-request
memory.

> If it was possible to shoehorn phmac/paes into a synchronous
> algorithm I'd love to do it. But the fact is that this requires
> asynchronous communication in the *unlikely* case, which means that
> the best way to model it is with an optional async interface.

Of course it's possible for it to be a synchronous algorithm. The
proposed user waits synchronously for the request to complete anyway.

> I disagree. I want the users to start feeding ahash with more than one
> unit of input. So rather than splitting up a folio into page-size or
> sector-size units, you feed the whole thing to ahash and the data
> should be split up automatically and hashed in parallel.

That sort of fits the {dm,fs}-verity use case specifically, but it seems
like a weird hack to avoid the simpler approach that I proposed, and it
would introduce unnecessary restrictions, like the input data having to
be a folio.

> I agree with that completely.

Yet you continue to push for APIs that are hard to use and slow because
they are designed for an obsolete form of hardware offload and have
software crypto support bolted on as an afterthought.

> However, what triggered this thread is in fact something that is not
> purely software crypto, because it involves system-wide communication
> which could take seconds to complete.

Sure, but it's possible for it to use the same API.

> The original patch tried to shoehorn this into a synchronous
> implementation that would just fail randomly after waiting.

The async version has that same timeout, so the effect is exactly the
same; the hash operation can fail after a few seconds of waiting.

- Eric
On 2025-01-15 18:29, Mikulas Patocka wrote:
> The ahash interface is slower than the shash interface for synchronous
> implementations, so the patch basically slows down the common case.
> See the upstream commit b76ad8844234bd0d394105d7d784cd05f1bf269a for an
> explanation in dm-verity.
>
> Do you have some benchmark that shows how much this helps on s390x, so
> that we can evaluate whether the added complexity is worth the
> performance improvement?
>
> Mikulas

Just a short sniff test on my development system (s390x with 32 GB RAM,
16 CPUs). When I run a dm-integrity format command on a 16 GB loopback
file backed by tmpfs, I get this on Fedora 40 with a freshly built
v6.13-rc7 kernel:

	integritysetup format /dev/loop0 --integrity sha256 --sector-size 4096
	...
	Finished, time 00m08s, 15 GiB written, speed 1.8 GiB/s

and with the same kernel plus my dm-integrity async patch:

	Finished, time 00m09s, 15 GiB written, speed 1.7 GiB/s

However, I'll run some more sophisticated performance measurements
during the weekend.

Harald Freudenberger
On 2025-01-15 18:29, Mikulas Patocka wrote:
> [...]
>
> Do you have some benchmark that shows how much this helps on s390x, so
> that we can evaluate whether the added complexity is worth the
> performance improvement?

Hi Mikulas,

So finally some benchmarks, measured on my development system: an LPAR
on a z16 with 16 CPUs and 32 GB memory, with a freshly built Linux
6.13.0-rc7 kernel with and without just my dm-integrity ahash patch.

For the dm-integrity format measurements I used a 16 GB file located in
tmpfs as the backing file for a loopback device, written entirely with
random data from /dev/urandom. The dm-integrity format command was:

	integritysetup format /dev/loop0 --integrity <alg> --sector-size 4096

6.13.0-rc7 with dm-integrity using shash:

	sha256       Finished, time 00m09s, 15 GiB written, speed 1.8 GiB/s
	hmac-sha256  Finished, time 00m09s, 15 GiB written, speed 1.7 GiB/s

6.13.0-rc7 with dm-integrity with my ahash patch:

	sha256       Finished, time 00m09s, 15 GiB written, speed 1.7 GiB/s
	hmac-sha256  Finished, time 00m09s, 15 GiB written, speed 1.6 GiB/s

In practice the read and write performance may be of more importance. I
set up an 8 GB file located in tmpfs as the backing file for a loopback
device and dm-integrity formatted and opened it. Then I created a 4 GB
random file via dd if=/dev/urandom, also located in tmpfs.

For the write, I used

	dd if=<myrandomfile> of=/dev/mapper/<dm-integrity-name> oflag=direct,sync bs=4096 count=1M

to copy the 4 GB of random data into the dm-integrity block device. For
the read, I used

	dd if=/dev/mapper/<dm-integrity-name> of=/dev/null iflag=direct,sync bs=4096 count=1M

to copy 4 GB from the dm-integrity block device to /dev/null.

6.13.0-rc7 with dm-integrity using shash:

	sha256
	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 45.5 s, 94.4 MB/s
	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 19.2137 s, 224 MB/s
	hmac-sha256
	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 45.2026 s, 95.0 MB/s
	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 19.2082 s, 224 MB/s

6.13.0-rc7 with dm-integrity with my ahash patch:

	sha256
	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 41.5273 s, 103 MB/s
	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 16.2558 s, 264 MB/s
	hmac-sha256
	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 44.063 s, 97.5 MB/s
	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 16.5381 s, 260 MB/s

I checked these results several times. They vary, but the dm-integrity
with the ahash patch always gives the better figures. I also ran some
measurements with an isolated CPU, pinning the format or the dd task to
that CPU. Pinning is not a good idea, as very much of the work is done
via workqueues in dm-integrity, so the communication overhead between
the CPUs increases. I would have expected a slight penalty with the
ahash patch, as seen with the dm-integrity format, but read and write
seem to benefit from this simple ahash patch. It would be very
interesting to see how a real asynchronous implementation of
dm-integrity performs.

If someone is interested, I can share my scripts for these measurements.

Harald Freudenberger
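[Note: Harald's scripts are not included in the thread; the following
sketch only reconstructs the measurement flow from the commands quoted
above. The paths, the loop device number and the tmpfs location are
assumptions.]

    # backing file in tmpfs, attached to a loop device
    truncate -s 8G /dev/shm/integ.img
    losetup /dev/loop0 /dev/shm/integ.img

    # format and open the dm-integrity device
    integritysetup format /dev/loop0 --integrity sha256 --sector-size 4096
    integritysetup open /dev/loop0 integ --integrity sha256

    # 4 GB of random test data, also in tmpfs
    dd if=/dev/urandom of=/dev/shm/rand.img bs=1M count=4096

    # write path
    dd if=/dev/shm/rand.img of=/dev/mapper/integ oflag=direct,sync bs=4096 count=1M
    # read path
    dd if=/dev/mapper/integ of=/dev/null iflag=direct,sync bs=4096 count=1M

    # cleanup
    integritysetup close integ
    losetup -d /dev/loop0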
Hi

On Wed, 22 Jan 2025, Harald Freudenberger wrote:
> 6.13.0-rc7 with dm-integrity using shash:
>
>	sha256
>	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 45.5 s, 94.4 MB/s
>	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 19.2137 s, 224 MB/s
>	hmac-sha256
>	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 45.2026 s, 95.0 MB/s
>	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 19.2082 s, 224 MB/s
>
> 6.13.0-rc7 with dm-integrity with my ahash patch:
>
>	sha256
>	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 41.5273 s, 103 MB/s
>	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 16.2558 s, 264 MB/s
>	hmac-sha256
>	WRITE: 4294967296 bytes (4.3 GB, 4.0 GiB) copied, 44.063 s, 97.5 MB/s
>	READ:  4294967296 bytes (4.3 GB, 4.0 GiB) copied, 16.5381 s, 260 MB/s
>
> I checked these results several times. They vary, but the dm-integrity
> with the ahash patch always gives the better figures. [...]
>
> If someone is interested, I can share my scripts for these measurements.
>
> Harald Freudenberger

If ahash helps for you, you can add it in a similar way to the dm-verity
target - so that it is only used when it makes sense - see the function
verity_setup_hash_alg.

Mikulas
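[Note: a sketch of the kind of setup Mikulas suggests. This is not
dm-verity's verity_setup_hash_alg() verbatim, and the ic->shash/ic->ahash
fields are illustrative; the idea is simply to prefer the cheap shash path
and fall back to ahash only for ahash-only providers such as phmac. The
rest of the target then needs separate code paths for the two cases, as
Eric notes above.]

    static int setup_internal_hash(struct dm_integrity_c *ic, const char *name)
    {
            /* Prefer shash: no per-request allocation, no scatterlists. */
            ic->shash = crypto_alloc_shash(name, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
            if (!IS_ERR(ic->shash))
                    return 0;

            /*
             * No shash provider exists (e.g. an ahash-only driver like
             * phmac): fall back to the ahash code path.
             */
            ic->shash = NULL;
            ic->ahash = crypto_alloc_ahash(name, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
            return PTR_ERR_OR_ZERO(ic->ahash);
    }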
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index ee9f7cecd78e..1504db9276d1 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -195,7 +195,7 @@ struct dm_integrity_c {
 	struct scatterlist **journal_io_scatterlist;
 	struct skcipher_request **sk_requests;
 
-	struct crypto_shash *journal_mac;
+	struct crypto_ahash *journal_mac;
 
 	struct journal_node *journal_tree;
 	struct rb_root journal_tree_root;
@@ -221,7 +221,7 @@ struct dm_integrity_c {
 
 	int failed;
 
-	struct crypto_shash *internal_hash;
+	struct crypto_ahash *internal_hash;
 
 	struct dm_target *ti;
 
@@ -488,11 +488,14 @@ static void sb_set_version(struct dm_integrity_c *ic)
 
 static int sb_mac(struct dm_integrity_c *ic, bool wr)
 {
-	SHASH_DESC_ON_STACK(desc, ic->journal_mac);
-	int r;
-	unsigned int mac_size = crypto_shash_digestsize(ic->journal_mac);
+	unsigned int mac_size = crypto_ahash_digestsize(ic->journal_mac);
 	__u8 *sb = (__u8 *)ic->sb;
 	__u8 *mac = sb + (1 << SECTOR_SHIFT) - mac_size;
+	struct ahash_request *req;
+	DECLARE_CRYPTO_WAIT(wait);
+	struct scatterlist sg;
+	unsigned int nbytes;
+	int r;
 
 	if (sizeof(struct superblock) + mac_size > 1 << SECTOR_SHIFT ||
 	    mac_size > HASH_MAX_DIGESTSIZE) {
@@ -500,29 +503,44 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
 		return -EINVAL;
 	}
 
-	desc->tfm = ic->journal_mac;
+	req = ahash_request_alloc(ic->journal_mac, GFP_KERNEL);
+	if (unlikely(!req)) {
+		dm_integrity_io_error(ic, "ahash_request_alloc", -ENOMEM);
+		return -ENOMEM;
+	}
+	ahash_request_set_callback(req, 0, crypto_req_done, &wait);
+
+	sg_init_table(&sg, 1);
+	nbytes = mac - sb;
+	sg_set_buf(&sg, sb, nbytes);
 
 	if (likely(wr)) {
-		r = crypto_shash_digest(desc, sb, mac - sb, mac);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_digest", r);
+		ahash_request_set_crypt(req, &sg, mac, nbytes);
+		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_ahash_digest", r);
+			ahash_request_free(req);
 			return r;
 		}
 	} else {
 		__u8 actual_mac[HASH_MAX_DIGESTSIZE];
 
-		r = crypto_shash_digest(desc, sb, mac - sb, actual_mac);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_digest", r);
+		ahash_request_set_crypt(req, &sg, actual_mac, nbytes);
+		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_ahash_digest", r);
+			ahash_request_free(req);
 			return r;
 		}
 		if (memcmp(mac, actual_mac, mac_size)) {
 			dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
 			dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
+			ahash_request_free(req);
 			return -EILSEQ;
 		}
 	}
 
+	ahash_request_free(req);
 	return 0;
 }
 
@@ -775,51 +793,62 @@ static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, uns
 
 static void section_mac(struct dm_integrity_c *ic, unsigned int section, __u8 result[JOURNAL_MAC_SIZE])
 {
-	SHASH_DESC_ON_STACK(desc, ic->journal_mac);
+	unsigned int j, size, nsg, nbytes = 0;
+	struct scatterlist *sg = NULL, *s;
+	struct ahash_request *req = NULL;
+	DECLARE_CRYPTO_WAIT(wait);
+	__le64 *section_le = NULL;
 	int r;
-	unsigned int j, size;
 
-	desc->tfm = ic->journal_mac;
+	req = ahash_request_alloc(ic->journal_mac, GFP_KERNEL);
+	if (unlikely(!req)) {
+		dm_integrity_io_error(ic, "ahash_request_alloc", -ENOMEM);
+		goto err;
+	}
+	ahash_request_set_callback(req, 0, crypto_req_done, &wait);
 
-	r = crypto_shash_init(desc);
-	if (unlikely(r < 0)) {
-		dm_integrity_io_error(ic, "crypto_shash_init", r);
+	nsg = ic->journal_section_entries;
+	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
+		nsg += 2;
+	sg = kmalloc_array(nsg, sizeof(*sg), GFP_KERNEL);
+	if (unlikely(!sg)) {
+		dm_integrity_io_error(ic, "kmalloc_array", -ENOMEM);
 		goto err;
 	}
+	sg_init_table(sg, nsg);
+	s = sg;
 
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
-		__le64 section_le;
-
-		r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_update", r);
-			goto err;
-		}
+		sg_set_buf(s, (__u8 *)&ic->sb->salt, SALT_SIZE);
+		nbytes += SALT_SIZE;
+		s++;
 
-		section_le = cpu_to_le64(section);
-		r = crypto_shash_update(desc, (__u8 *)&section_le, sizeof(section_le));
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_update", r);
+		section_le = kmalloc(sizeof(__le64), GFP_KERNEL);
+		if (unlikely(!section_le)) {
+			dm_integrity_io_error(ic, "kmalloc(sizeof(__le64))", -ENOMEM);
 			goto err;
 		}
+		*section_le = cpu_to_le64(section);
+		sg_set_buf(s, (__u8 *)section_le, sizeof(*section_le));
+		nbytes += sizeof(*section_le);
+		s++;
 	}
 
 	for (j = 0; j < ic->journal_section_entries; j++) {
 		struct journal_entry *je = access_journal_entry(ic, section, j);
 
-		r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof(je->u.sector));
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_update", r);
-			goto err;
-		}
+		sg_set_buf(s, (__u8 *)&je->u.sector, sizeof(je->u.sector));
+		nbytes += sizeof(je->u.sector);
+		s++;
 	}
 
-	size = crypto_shash_digestsize(ic->journal_mac);
+	size = crypto_ahash_digestsize(ic->journal_mac);
 
 	if (likely(size <= JOURNAL_MAC_SIZE)) {
-		r = crypto_shash_final(desc, result);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_final", r);
+		ahash_request_set_crypt(req, sg, result, nbytes);
+		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_ahash_digest", r);
 			goto err;
 		}
 		memset(result + size, 0, JOURNAL_MAC_SIZE - size);
@@ -830,16 +859,24 @@ static void section_mac(struct dm_integrity_c *ic, unsigned int section, __u8 re
 			dm_integrity_io_error(ic, "digest_size", -EINVAL);
 			goto err;
 		}
-		r = crypto_shash_final(desc, digest);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_final", r);
+		ahash_request_set_crypt(req, sg, digest, nbytes);
+		r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_ahash_digest", r);
 			goto err;
 		}
 		memcpy(result, digest, JOURNAL_MAC_SIZE);
 	}
 
+	ahash_request_free(req);
+	kfree(section_le);
+	kfree(sg);
 	return;
+
 err:
+	ahash_request_free(req);
+	kfree(section_le);
+	kfree(sg);
 	memset(result, 0, JOURNAL_MAC_SIZE);
 }
 
@@ -1637,53 +1674,65 @@ static void integrity_end_io(struct bio *bio)
 static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
 				      const char *data, char *result)
 {
-	__le64 sector_le = cpu_to_le64(sector);
-	SHASH_DESC_ON_STACK(req, ic->internal_hash);
-	int r;
+	struct ahash_request *req = NULL;
+	struct scatterlist sg[3], *s;
+	DECLARE_CRYPTO_WAIT(wait);
+	__le64 *sector_le = NULL;
 	unsigned int digest_size;
+	unsigned int nbytes = 0;
+	int r;
 
-	req->tfm = ic->internal_hash;
-
-	r = crypto_shash_init(req);
-	if (unlikely(r < 0)) {
-		dm_integrity_io_error(ic, "crypto_shash_init", r);
+	req = ahash_request_alloc(ic->internal_hash, GFP_KERNEL);
+	if (unlikely(!req)) {
+		dm_integrity_io_error(ic, "ahash_request_alloc", -ENOMEM);
 		goto failed;
 	}
+	ahash_request_set_callback(req, 0, crypto_req_done, &wait);
 
+	s = sg;
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
-		r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE);
-		if (unlikely(r < 0)) {
-			dm_integrity_io_error(ic, "crypto_shash_update", r);
-			goto failed;
-		}
+		sg_init_table(sg, 3);
+		sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE);
+		nbytes += SALT_SIZE;
+		s++;
+	} else {
+		sg_init_table(sg, 2);
 	}
 
-	r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof(sector_le));
-	if (unlikely(r < 0)) {
-		dm_integrity_io_error(ic, "crypto_shash_update", r);
+	sector_le = kmalloc(sizeof(__le64), GFP_KERNEL);
+	if (unlikely(!sector_le)) {
+		dm_integrity_io_error(ic, "kmalloc(sizeof(__le64))", -ENOMEM);
 		goto failed;
 	}
+	*sector_le = cpu_to_le64(sector);
+	sg_set_buf(s, (const __u8 *)sector_le, sizeof(*sector_le));
+	nbytes += sizeof(*sector_le);
+	s++;
 
-	r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
-	if (unlikely(r < 0)) {
-		dm_integrity_io_error(ic, "crypto_shash_update", r);
-		goto failed;
-	}
+	sg_set_buf(s, data, ic->sectors_per_block << SECTOR_SHIFT);
+	nbytes += ic->sectors_per_block << SECTOR_SHIFT;
+
+	ahash_request_set_crypt(req, sg, result, nbytes);
 
-	r = crypto_shash_final(req, result);
-	if (unlikely(r < 0)) {
-		dm_integrity_io_error(ic, "crypto_shash_final", r);
+	r = crypto_wait_req(crypto_ahash_digest(req), &wait);
+	if (r) {
+		dm_integrity_io_error(ic, "crypto_ahash_digest", r);
 		goto failed;
 	}
 
-	digest_size = crypto_shash_digestsize(ic->internal_hash);
+	digest_size = crypto_ahash_digestsize(ic->internal_hash);
 	if (unlikely(digest_size < ic->tag_size))
 		memset(result + digest_size, 0, ic->tag_size - digest_size);
 
+	ahash_request_free(req);
+	kfree(sector_le);
+
 	return;
 
 failed:
 	/* this shouldn't happen anyway, the hash functions have no reason to fail */
+	ahash_request_free(req);
+	kfree(sector_le);
 	get_random_bytes(result, ic->tag_size);
 }
 
@@ -1776,7 +1825,7 @@ static void integrity_metadata(struct work_struct *w)
 	if (ic->internal_hash) {
 		struct bvec_iter iter;
 		struct bio_vec bv;
-		unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
+		unsigned int digest_size = crypto_ahash_digestsize(ic->internal_hash);
 		struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 		char *checksums;
 		unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
@@ -2124,7 +2173,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 		} while (++s < ic->sectors_per_block);
 
 		if (ic->internal_hash) {
-			unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
+			unsigned int digest_size = crypto_ahash_digestsize(ic->internal_hash);
 
 			if (unlikely(digest_size > ic->tag_size)) {
 				char checksums_onstack[HASH_MAX_DIGESTSIZE];
@@ -2428,7 +2477,7 @@ static int dm_integrity_map_inline(struct dm_integrity_io *dio, bool from_map)
 	if (!dio->integrity_payload) {
 		unsigned digest_size, extra_size;
 		dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
-		digest_size = crypto_shash_digestsize(ic->internal_hash);
+		digest_size = crypto_ahash_digestsize(ic->internal_hash);
 		extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
 		dio->payload_len += extra_size;
 		dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -2595,7 +2644,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
 		bio_put(outgoing_bio);
 
 		integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
-		if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+		if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_ahash_digestsize(ic->internal_hash), ic->tag_size)))) {
 			DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
 				    ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
 			atomic64_inc(&ic->number_of_mismatches);
@@ -2635,7 +2684,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
 			//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
 			integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
 			if (unlikely(memcmp(digest, dio->integrity_payload + pos,
-					    min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+					    min(crypto_ahash_digestsize(ic->internal_hash), ic->tag_size)))) {
 				kunmap_local(mem);
 				dm_integrity_free_payload(dio);
 				INIT_WORK(&dio->work, dm_integrity_inline_recheck);
@@ -3017,8 +3066,8 @@ static void integrity_recalc(struct work_struct *w)
 		goto free_ret;
 	}
 	recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
-	if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
-		recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
+	if (crypto_ahash_digestsize(ic->internal_hash) > ic->tag_size)
+		recalc_tags_size += crypto_ahash_digestsize(ic->internal_hash) - ic->tag_size;
 	recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
 	if (!recalc_tags) {
 		vfree(recalc_buffer);
@@ -3177,8 +3226,8 @@ static void integrity_recalc_inline(struct work_struct *w)
 	}
 
 	recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
-	if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
-		recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
+	if (crypto_ahash_digestsize(ic->internal_hash) > ic->tuple_size)
+		recalc_tags_size += crypto_ahash_digestsize(ic->internal_hash) - ic->tuple_size;
 	recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
 	if (!recalc_tags) {
 		kfree(recalc_buffer);
@@ -4187,6 +4236,8 @@ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, ch
 	if (!a->alg_string)
 		goto nomem;
 
+	DEBUG_print("%s: alg_string=%s\n", __func__, a->alg_string);
+
 	k = strchr(a->alg_string, ':');
 	if (k) {
 		*k = 0;
@@ -4198,6 +4249,9 @@ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, ch
 		a->key = kmalloc(a->key_size, GFP_KERNEL);
 		if (!a->key)
 			goto nomem;
+
+		DEBUG_print("%s: key=%s\n", __func__, a->key_string);
+
 		if (hex2bin(a->key, a->key_string, a->key_size))
 			goto inval;
 	}
@@ -4211,13 +4265,15 @@ static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, ch
 	return -ENOMEM;
 }
 
-static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
+static int get_mac(struct crypto_ahash **hash, struct alg_spec *a, char **error,
 		   char *error_alg, char *error_key)
 {
 	int r;
 
+	DEBUG_print(">%s\n", __func__);
+
 	if (a->alg_string) {
-		*hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
+		*hash = crypto_alloc_ahash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
 		if (IS_ERR(*hash)) {
 			*error = error_alg;
 			r = PTR_ERR(*hash);
@@ -4226,12 +4282,12 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
 	}
 
 	if (a->key) {
-		r = crypto_shash_setkey(*hash, a->key, a->key_size);
+		r = crypto_ahash_setkey(*hash, a->key, a->key_size);
 		if (r) {
 			*error = error_key;
 			return r;
 		}
-	} else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
+	} else if (crypto_ahash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
 		*error = error_key;
 		return -ENOKEY;
 	}
@@ -4707,7 +4763,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
 			r = -EINVAL;
 			goto bad;
 		}
-		ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
+		ic->tag_size = crypto_ahash_digestsize(ic->internal_hash);
 	}
 	if (ic->tag_size > MAX_TAG_SIZE) {
 		ti->error = "Too big tag size";
@@ -5226,7 +5282,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 		free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
 
 	if (ic->internal_hash)
-		crypto_free_shash(ic->internal_hash);
+		crypto_free_ahash(ic->internal_hash);
 	free_alg(&ic->internal_hash_alg);
 
 	if (ic->journal_crypt)
@@ -5234,7 +5290,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 	free_alg(&ic->journal_crypt_alg);
 
 	if (ic->journal_mac)
-		crypto_free_shash(ic->journal_mac);
+		crypto_free_ahash(ic->journal_mac);
	free_alg(&ic->journal_mac_alg);
 
 	kfree(ic);
Use the async digest in-kernel crypto API instead of the
synchronous digest API. This has the advantage of being able
to use synchronous as well as asynchronous digest implementations,
as the in-kernel API has an automatic wrapping mechanism
to provide all synchronous digests via the async API.

Tested with crc32, sha256, hmac-sha256 and the s390-specific
implementations for hmac-sha256 and protected-key phmac-sha256.

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
---
 drivers/md/dm-integrity.c | 220 ++++++++++++++++++++++++--------------
 1 file changed, 138 insertions(+), 82 deletions(-)