diff mbox series

[v3,6/8] fsverity: improve performance by using multibuffer hashing

Message ID 20240507002343.239552-7-ebiggers@kernel.org (mailing list archive)
State New, archived
Headers show
Series Optimize dm-verity and fsverity using multibuffer hashing | expand

Commit Message

Eric Biggers May 7, 2024, 12:23 a.m. UTC
From: Eric Biggers <ebiggers@google.com>

When supported by the hash algorithm, use crypto_shash_finup_mb() to
interleave the hashing of pairs of data blocks.  On some CPUs this
nearly doubles hashing performance.  The increase in overall throughput
of cold-cache fsverity reads that I'm seeing on arm64 and x86_64 is
roughly 35% (though this metric is hard to measure as it jumps around a
lot).

For now this is only done on the verification path, and only for data
blocks, not Merkle tree blocks.  We could use finup_mb on Merkle tree
blocks too, but that is less important as there aren't as many Merkle
tree blocks as data blocks, and that would require some additional code
restructuring.  We could also use finup_mb to accelerate building the
Merkle tree, but verification performance is more important.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/verity/fsverity_private.h |   5 +
 fs/verity/hash_algs.c        |  32 ++++++-
 fs/verity/open.c             |   6 ++
 fs/verity/verify.c           | 177 +++++++++++++++++++++++++++++------
 4 files changed, 187 insertions(+), 33 deletions(-)

Comments

Herbert Xu May 31, 2024, 4:50 a.m. UTC | #1
Eric Biggers <ebiggers@kernel.org> wrote:
>
> +               if (multibuffer) {
> +                       if (ctx->pending_data) {
> +                               /* Hash and verify two data blocks. */
> +                               err = fsverity_hash_2_blocks(params,
> +                                                            inode,
> +                                                            ctx->pending_data,
> +                                                            data,
> +                                                            ctx->hash1,
> +                                                            ctx->hash2);
> +                               kunmap_local(data);
> +                               kunmap_local(ctx->pending_data);
> +                               ctx->pending_data = NULL;
> +                               if (err != 0 ||
> +                                   !verify_data_block(inode, vi, ctx->hash1,
> +                                                      ctx->pending_pos,
> +                                                      ctx->max_ra_pages) ||
> +                                   !verify_data_block(inode, vi, ctx->hash2,
> +                                                      pos, ctx->max_ra_pages))
> +                                       return false;
> +                       } else {
> +                               /* Wait and see if there's another block. */
> +                               ctx->pending_data = data;
> +                               ctx->pending_pos = pos;
> +                       }
> +               } else {
> +                       /* Hash and verify one data block. */
> +                       err = fsverity_hash_block(params, inode, data,
> +                                                 ctx->hash1);
> +                       kunmap_local(data);
> +                       if (err != 0 ||
> +                           !verify_data_block(inode, vi, ctx->hash1,
> +                                              pos, ctx->max_ra_pages))
> +                               return false;
> +               }
> +               pos += block_size;

I think this complexity is gross.  Look at how we did GSO in
networking.  There should be a unified code-path for aggregated
data and simple data, not an aggregated path versus a simple path.

I think ultimately it stems from the fact that this code went from
ahash to shash.  What were the issues back then? If it's just vmalloc
we should fix ahash to support that, rather than making users of the
Crypto API go through contortions like this.

Cheers,
Eric Biggers May 31, 2024, 6:13 a.m. UTC | #2
On Fri, May 31, 2024 at 12:50:20PM +0800, Herbert Xu wrote:
> Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > +               if (multibuffer) {
> > +                       if (ctx->pending_data) {
> > +                               /* Hash and verify two data blocks. */
> > +                               err = fsverity_hash_2_blocks(params,
> > +                                                            inode,
> > +                                                            ctx->pending_data,
> > +                                                            data,
> > +                                                            ctx->hash1,
> > +                                                            ctx->hash2);
> > +                               kunmap_local(data);
> > +                               kunmap_local(ctx->pending_data);
> > +                               ctx->pending_data = NULL;
> > +                               if (err != 0 ||
> > +                                   !verify_data_block(inode, vi, ctx->hash1,
> > +                                                      ctx->pending_pos,
> > +                                                      ctx->max_ra_pages) ||
> > +                                   !verify_data_block(inode, vi, ctx->hash2,
> > +                                                      pos, ctx->max_ra_pages))
> > +                                       return false;
> > +                       } else {
> > +                               /* Wait and see if there's another block. */
> > +                               ctx->pending_data = data;
> > +                               ctx->pending_pos = pos;
> > +                       }
> > +               } else {
> > +                       /* Hash and verify one data block. */
> > +                       err = fsverity_hash_block(params, inode, data,
> > +                                                 ctx->hash1);
> > +                       kunmap_local(data);
> > +                       if (err != 0 ||
> > +                           !verify_data_block(inode, vi, ctx->hash1,
> > +                                              pos, ctx->max_ra_pages))
> > +                               return false;
> > +               }
> > +               pos += block_size;
> 
> I think this complexity is gross.  Look at how we did GSO in
> networking.  There should be a unified code-path for aggregated
> data and simple data, not an aggregated path versus a simple path.
> 
> I think ultimately it stems from the fact that this code went from
> ahash to shash.  What were the issues back then? If it's just vmalloc
> we should fix ahash to support that, rather than making users of the
> Crypto API go through contortions like this.

It can't be asynchronous, period.  As I've explained, that would be far too
complex, and it would also defeat the purpose because it would make performance
worse.  Messages *must* be queued up and hashed in the caller's context.

What could make sense would be some helper functions and an associated struct
for queueing up messages for a particular crypto_shash, up to its mb_max_msgs
value, and then flushing them and retrieving the digests.  These would be
provided by the crypto API.

I think this would address your concern, in that the users (fsverity and
dm-verity) would have a unified code path for multiple vs. single blocks.

I didn't think it would be worthwhile to go there yet, given that fsverity and
dm-verity just want 2x or 1x, and it ends up being simpler and more efficient to
handle those cases directly.  But we could go with the more general queueing
helper functions instead if you feel they should be included from the start.

- Eric
Eric Biggers May 31, 2024, 6:52 a.m. UTC | #3
On Thu, May 30, 2024 at 11:13:48PM -0700, Eric Biggers wrote:
> On Fri, May 31, 2024 at 12:50:20PM +0800, Herbert Xu wrote:
> > Eric Biggers <ebiggers@kernel.org> wrote:
> > >
> > > +               if (multibuffer) {
> > > +                       if (ctx->pending_data) {
> > > +                               /* Hash and verify two data blocks. */
> > > +                               err = fsverity_hash_2_blocks(params,
> > > +                                                            inode,
> > > +                                                            ctx->pending_data,
> > > +                                                            data,
> > > +                                                            ctx->hash1,
> > > +                                                            ctx->hash2);
> > > +                               kunmap_local(data);
> > > +                               kunmap_local(ctx->pending_data);
> > > +                               ctx->pending_data = NULL;
> > > +                               if (err != 0 ||
> > > +                                   !verify_data_block(inode, vi, ctx->hash1,
> > > +                                                      ctx->pending_pos,
> > > +                                                      ctx->max_ra_pages) ||
> > > +                                   !verify_data_block(inode, vi, ctx->hash2,
> > > +                                                      pos, ctx->max_ra_pages))
> > > +                                       return false;
> > > +                       } else {
> > > +                               /* Wait and see if there's another block. */
> > > +                               ctx->pending_data = data;
> > > +                               ctx->pending_pos = pos;
> > > +                       }
> > > +               } else {
> > > +                       /* Hash and verify one data block. */
> > > +                       err = fsverity_hash_block(params, inode, data,
> > > +                                                 ctx->hash1);
> > > +                       kunmap_local(data);
> > > +                       if (err != 0 ||
> > > +                           !verify_data_block(inode, vi, ctx->hash1,
> > > +                                              pos, ctx->max_ra_pages))
> > > +                               return false;
> > > +               }
> > > +               pos += block_size;
> > 
> > I think this complexity is gross.  Look at how we did GSO in
> > networking.  There should be a unified code-path for aggregated
> > data and simple data, not an aggregated path versus a simple path.
> > 
> > I think ultimately it stems from the fact that this code went from
> > ahash to shash.  What were the issues back then? If it's just vmalloc
> > we should fix ahash to support that, rather than making users of the
> > Crypto API go through contortions like this.
> 
> It can't be asynchronous, period.  As I've explained, that would be far too
> complex, and it would also defeat the purpose because it would make performance
> worse.  Messages *must* be queued up and hashed in the caller's context.
> 
> What could make sense would be some helper functions and an associated struct
> for queueing up messages for a particular crypto_shash, up to its mb_max_msgs
> value, and then flushing them and retrieving the digests.  These would be
> provided by the crypto API.
> 
> I think this would address your concern, in that the users (fsverity and
> dm-verity) would have a unified code path for multiple vs. single blocks.
> 
> I didn't think it would be worthwhile to go there yet, given that fsverity and
> dm-verity just want 2x or 1x, and it ends up being simpler and more efficient to
> handle those cases directly.  But we could go with the more general queueing
> helper functions instead if you feel they should be included from the start.
> 

Looking at it again a bit more closely, both fsverity and dm-verity have
per-block information that they need to keep track of in the queue in addition
to the data buffers and hashes: the block number, and in dm-verity's case also a
bvec_iter pointing to that block.

So I think it really does make sense to have them both handle the queueing
themselves, and not have it split between them and some crypto API helper
functions (i.e. two queues that mirror each other).

It would be possible, though, to organize the code in dm-verity and fsverity to
represent the queue as an array and operate on it as such.  That would also
address your concern about the two code paths.  Again, things would end up being
a bit less efficient than my more optimized code that handles 1x and 2x (which
is all that's actually needed for now) specifically, but it would work, I think.

- Eric
Herbert Xu May 31, 2024, 8:38 a.m. UTC | #4
On Thu, May 30, 2024 at 11:13:48PM -0700, Eric Biggers wrote:
>
> It can't be asynchronous, period.  As I've explained, that would be far too
> complex, and it would also defeat the purpose because it would make performance
> worse.  Messages *must* be queued up and hashed in the caller's context.

When I say ahash I don't necessarily mean async.  I'm just referring
to the SG-list based API that is known as ahash.  As this user
appears to be doing kmap, it would appear that ahash is a better
fit rather than shash.

Cheers,
Herbert Xu May 31, 2024, 8:49 a.m. UTC | #5
On Thu, May 30, 2024 at 11:52:58PM -0700, Eric Biggers wrote:
>
> Looking at it again a bit more closely, both fsverity and dm-verity have
> per-block information that they need to keep track of in the queue in addition
> to the data buffers and hashes: the block number, and in dm-verity's case also a
> bvec_iter pointing to that block.

Again I'm not asking you to make this API asynchronous at all.

I was just commenting on the added complexity in fsverify due to
the use of the linear shash API instead of the page-based ahash API.

This complexity was then compounded by the multi-buffer support.

I think this would look a lot simpler if it moved back to ahash.

The original commit mentioned that ahash was bad for fsverify
because of vmalloc.  But the only use of linear pointers in fsverify
seems to be from kmalloc.  Where is the vmalloc coming from?

Cheers,
Eric Biggers May 31, 2024, 6:51 p.m. UTC | #6
On Fri, May 31, 2024 at 04:49:28PM +0800, Herbert Xu wrote:
> On Thu, May 30, 2024 at 11:52:58PM -0700, Eric Biggers wrote:
> >
> > Looking at it again a bit more closely, both fsverity and dm-verity have
> > per-block information that they need to keep track of in the queue in addition
> > to the data buffers and hashes: the block number, and in dm-verity's case also a
> > bvec_iter pointing to that block.
> 
> Again I'm not asking you to make this API asynchronous at all.

What exactly are you suggesting, then?  It seems that you want multibuffer
hashing to be supported by the existing ahash API.  However, that only works if
it's made asynchronous, given how the messages would have to be queued up on a
global queue.  That has a huge number of issues which I've already explained.
(And it was even tried before, and it failed.)

> I was just commenting on the added complexity in fsverify due to
> the use of the linear shash API instead of the page-based ahash API.

It's the other way around.  The shash version is much simpler.  Just look at the
diff of commit 8fcd94add6c5 that changed from ahash to shash:

    4 files changed, 71 insertions(+), 200 deletions(-)

> This complexity was then compounded by the multi-buffer support.

fsverity and dm-verity will have to be updated to use multibuffer hashing
anyway, given that transparently supporting it in the existing API is not
viable.  If your concern is that in my current patchset fsverity and dm-verity
have separate code paths for multibuffer vs. single-buffer, as I mentioned I can
address that by restructuring them to operate on arrays (similar to what I
already did with the crypto API part).

> I think this would look a lot simpler if it moved back to ahash.
> 
> The original commit mentioned that ahash was bad for fsverify
> because of vmalloc.  But the only use of linear pointers in fsverify
> seems to be from kmalloc.  Where is the vmalloc coming from?

XFS is working on adding support for fsverity, and XFS was planning to provide
their Merkle tree blocks in vmalloced buffers.  Their plans have shifted several
times, and I think their latest plan no longer uses vmalloced buffers.  But in
any case it's still very convenient and much simpler to be able to just use
virtual addresses without having to worry about what type of memory it is.

- Eric
Eric Biggers June 3, 2024, 6:50 p.m. UTC | #7
On Fri, May 31, 2024 at 11:51:26AM -0700, Eric Biggers wrote:
> On Fri, May 31, 2024 at 04:49:28PM +0800, Herbert Xu wrote:
> > On Thu, May 30, 2024 at 11:52:58PM -0700, Eric Biggers wrote:
> > >
> > > Looking at it again a bit more closely, both fsverity and dm-verity have
> > > per-block information that they need to keep track of in the queue in addition
> > > to the data buffers and hashes: the block number, and in dm-verity's case also a
> > > bvec_iter pointing to that block.
> > 
> > Again I'm not asking you to make this API asynchronous at all.
> 
> What exactly are you suggesting, then?  It seems that you want multibuffer
> hashing to be supported by the existing ahash API.  However, that only works if
> it's made asynchronous, given how the messages would have to be queued up on a
> global queue.  That has a huge number of issues which I've already explained.
> (And it was even tried before, and it failed.)
> 
> > I was just commenting on the added complexity in fsverify due to
> > the use of the linear shash API instead of the page-based ahash API.
> 
> It's the other way around.  The shash version is much simpler.  Just look at the
> diff of commit 8fcd94add6c5 that changed from ahash to shash:
> 
>     4 files changed, 71 insertions(+), 200 deletions(-)
> 
> > This complexity was then compounded by the multi-buffer support.
> 
> fsverity and dm-verity will have to be updated to use multibuffer hashing
> anyway, given that transparently supporting it in the existing API is not
> viable.  If your concern is that in my current patchset fsverity and dm-verity
> have separate code paths for multibuffer vs. single-buffer, as I mentioned I can
> address that by restructuring them to operate on arrays (similar to what I
> already did with the crypto API part).
> 
> > I think this would look a lot simpler if it moved back to ahash.
> > 
> > The original commit mentioned that ahash was bad for fsverify
> > because of vmalloc.  But the only use of linear pointers in fsverify
> > seems to be from kmalloc.  Where is the vmalloc coming from?
> 
> XFS is working on adding support for fsverity, and XFS was planning to provide
> their Merkle tree blocks in vmalloced buffers.  Their plans have shifted several
> times, and I think their latest plan no longer uses vmalloced buffers.  But in
> any case it's still very convenient and much simpler to be able to just use
> virtual addresses without having to worry about what type of memory it is.
> 

I've now unified the code paths for single-block and multi-block processing in
fsverity and dm-verity, which I think addresses your remaining concern that's
feasible to address.  Your feedback has been conflating different issues, and it
only comes in once every few weeks, so it's a bit hard to follow.  But I think
the remaining thing that you're asking for is to make the API to be part of
"ahash" and use the existing async completion callback.  That's not actually
feasible, for the reasons that I've explained in detail.  See in particular
https://lore.kernel.org/linux-crypto/20240503152810.GA1132@sol.localdomain/.

Thanks,

- Eric
Herbert Xu June 4, 2024, 9:39 a.m. UTC | #8
On Mon, Jun 03, 2024 at 11:50:02AM -0700, Eric Biggers wrote:
>
> I've now unified the code paths for single-block and multi-block processing in
> fsverity and dm-verity, which I think addresses your remaining concern that's
> feasible to address.  Your feedback has been conflating different issues, and it
> only comes in once every few weeks, so it's a bit hard to follow.  But I think
> the remaining thing that you're asking for is to make the API to be part of
> "ahash" and use the existing async completion callback.  That's not actually
> feasible, for the reasons that I've explained in detail.  See in particular
> https://lore.kernel.org/linux-crypto/20240503152810.GA1132@sol.localdomain/.

I've tried to dedicate more time to your patches, but sending
them in just before a merge window is a sure way to add extra
delay before I get a chance to review them.

Cheers,
diff mbox series

Patch

diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index b3506f56e180..9fe1633f15d6 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -27,10 +27,11 @@  struct fsverity_hash_alg {
 	/*
 	 * The HASH_ALGO_* constant for this algorithm.  This is different from
 	 * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
 	 */
 	enum hash_algo algo_id;
+	bool supports_multibuffer;  /* crypto_shash_mb_max_msgs(tfm) >= 2 */
 };
 
 /* Merkle tree parameters: hash algorithm, initial hash state, and topology */
 struct merkle_tree_params {
 	const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
@@ -65,10 +66,11 @@  struct merkle_tree_params {
  */
 struct fsverity_info {
 	struct merkle_tree_params tree_params;
 	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 zero_block_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	const struct inode *inode;
 	unsigned long *hash_block_verified;
 };
 
 #define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
@@ -82,10 +84,13 @@  const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
 						      unsigned int num);
 const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
 				      const u8 *salt, size_t salt_size);
 int fsverity_hash_block(const struct merkle_tree_params *params,
 			const struct inode *inode, const void *data, u8 *out);
+int fsverity_hash_2_blocks(const struct merkle_tree_params *params,
+			   const struct inode *inode, const void *data1,
+			   const void *data2, u8 *out1, u8 *out2);
 int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
 			 const void *data, size_t size, u8 *out);
 void __init fsverity_check_hash_algs(void);
 
 /* init.c */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 6b08b1d9a7d7..71b6f74aaacd 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -82,12 +82,15 @@  const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
 	if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm)))
 		goto err_free_tfm;
 	if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm)))
 		goto err_free_tfm;
 
-	pr_info("%s using implementation \"%s\"\n",
-		alg->name, crypto_shash_driver_name(tfm));
+	alg->supports_multibuffer = (crypto_shash_mb_max_msgs(tfm) >= 2);
+
+	pr_info("%s using implementation \"%s\"%s\n",
+		alg->name, crypto_shash_driver_name(tfm),
+		alg->supports_multibuffer ? " (multibuffer)" : "");
 
 	/* pairs with smp_load_acquire() above */
 	smp_store_release(&alg->tfm, tfm);
 	goto out_unlock;
 
@@ -195,10 +198,35 @@  int fsverity_hash_block(const struct merkle_tree_params *params,
 	if (err)
 		fsverity_err(inode, "Error %d computing block hash", err);
 	return err;
 }
 
+int fsverity_hash_2_blocks(const struct merkle_tree_params *params,
+			   const struct inode *inode, const void *data1,
+			   const void *data2, u8 *out1, u8 *out2)
+{
+	const u8 *data[2] = { data1, data2 };
+	u8 *outs[2] = { out1, out2 };
+	SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm);
+	int err;
+
+	desc->tfm = params->hash_alg->tfm;
+
+	if (params->hashstate)
+		err = crypto_shash_import(desc, params->hashstate);
+	else
+		err = crypto_shash_init(desc);
+	if (err) {
+		fsverity_err(inode, "Error %d importing hash state", err);
+		return err;
+	}
+	err = crypto_shash_finup_mb(desc, data, params->block_size, outs, 2);
+	if (err)
+		fsverity_err(inode, "Error %d computing block hashes", err);
+	return err;
+}
+
 /**
  * fsverity_hash_buffer() - hash some data
  * @alg: the hash algorithm to use
  * @data: the data to hash
  * @size: size of data to hash, in bytes
diff --git a/fs/verity/open.c b/fs/verity/open.c
index fdeb95eca3af..4ae07c689c56 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -206,10 +206,16 @@  struct fsverity_info *fsverity_create_info(const struct inode *inode,
 	if (err) {
 		fsverity_err(inode, "Error %d computing file digest", err);
 		goto fail;
 	}
 
+	err = fsverity_hash_block(&vi->tree_params, inode,
+				  page_address(ZERO_PAGE(0)),
+				  vi->zero_block_hash);
+	if (err)
+		goto fail;
+
 	err = fsverity_verify_signature(vi, desc->signature,
 					le32_to_cpu(desc->sig_size));
 	if (err)
 		goto fail;
 
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 4fcad0825a12..38f1eb3dbd8e 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -77,29 +77,33 @@  static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 	SetPageChecked(hpage);
 	return false;
 }
 
 /*
- * Verify a single data block against the file's Merkle tree.
+ * Verify the hash of a single data block against the file's Merkle tree.
+ *
+ * @real_dblock_hash specifies the hash of the data block, and @data_pos
+ * specifies the byte position of the data block within the file.
  *
  * In principle, we need to verify the entire path to the root node.  However,
  * for efficiency the filesystem may cache the hash blocks.  Therefore we need
  * only ascend the tree until an already-verified hash block is seen, and then
  * verify the path to that block.
  *
  * Return: %true if the data block is valid, else %false.
  */
 static bool
 verify_data_block(struct inode *inode, struct fsverity_info *vi,
-		  const void *data, u64 data_pos, unsigned long max_ra_pages)
+		  const u8 *real_dblock_hash, u64 data_pos,
+		  unsigned long max_ra_pages)
 {
 	const struct merkle_tree_params *params = &vi->tree_params;
 	const unsigned int hsize = params->digest_size;
 	int level;
 	u8 _want_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	const u8 *want_hash;
-	u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 real_hblock_hash[FS_VERITY_MAX_DIGEST_SIZE];
 	/* The hash blocks that are traversed, indexed by level */
 	struct {
 		/* Page containing the hash block */
 		struct page *page;
 		/* Mapped address of the hash block (will be within @page) */
@@ -125,11 +129,12 @@  verify_data_block(struct inode *inode, struct fsverity_info *vi,
 		 * doesn't cover data blocks fully past EOF.  But the entire
 		 * page spanning EOF can be visible to userspace via a mmap, and
 		 * any part past EOF should be all zeroes.  Therefore, we need
 		 * to verify that any data blocks fully past EOF are all zeroes.
 		 */
-		if (memchr_inv(data, 0, params->block_size)) {
+		if (memcmp(vi->zero_block_hash, real_dblock_hash,
+			   params->block_size) != 0) {
 			fsverity_err(inode,
 				     "FILE CORRUPTED!  Data past EOF is not zeroed");
 			return false;
 		}
 		return true;
@@ -200,13 +205,14 @@  verify_data_block(struct inode *inode, struct fsverity_info *vi,
 		struct page *hpage = hblocks[level - 1].page;
 		const void *haddr = hblocks[level - 1].addr;
 		unsigned long hblock_idx = hblocks[level - 1].index;
 		unsigned int hoffset = hblocks[level - 1].hoffset;
 
-		if (fsverity_hash_block(params, inode, haddr, real_hash) != 0)
+		if (fsverity_hash_block(params, inode, haddr,
+					real_hblock_hash) != 0)
 			goto error;
-		if (memcmp(want_hash, real_hash, hsize) != 0)
+		if (memcmp(want_hash, real_hblock_hash, hsize) != 0)
 			goto corrupted;
 		/*
 		 * Mark the hash block as verified.  This must be atomic and
 		 * idempotent, as the same hash block might be verified by
 		 * multiple threads concurrently.
@@ -219,55 +225,145 @@  verify_data_block(struct inode *inode, struct fsverity_info *vi,
 		want_hash = _want_hash;
 		kunmap_local(haddr);
 		put_page(hpage);
 	}
 
-	/* Finally, verify the data block. */
-	if (fsverity_hash_block(params, inode, data, real_hash) != 0)
-		goto error;
-	if (memcmp(want_hash, real_hash, hsize) != 0)
+	/* Finally, verify the hash of the data block. */
+	if (memcmp(want_hash, real_dblock_hash, hsize) != 0)
 		goto corrupted;
 	return true;
 
 corrupted:
 	fsverity_err(inode,
 		     "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
 		     data_pos, level - 1,
 		     params->hash_alg->name, hsize, want_hash,
-		     params->hash_alg->name, hsize, real_hash);
+		     params->hash_alg->name, hsize,
+		     level == 0 ? real_dblock_hash : real_hblock_hash);
 error:
 	for (; level > 0; level--) {
 		kunmap_local(hblocks[level - 1].addr);
 		put_page(hblocks[level - 1].page);
 	}
 	return false;
 }
 
+struct fsverity_verification_context {
+	struct inode *inode;
+	struct fsverity_info *vi;
+	unsigned long max_ra_pages;
+
+	/*
+	 * pending_data and pending_pos are used when the selected hash
+	 * algorithm supports multibuffer hashing.  They're used to temporarily
+	 * store the virtual address and position of a mapped data block that
+	 * needs to be verified.  If we then see another data block, we hash the
+	 * two blocks simultaneously using the fast multibuffer hashing method.
+	 */
+	const void *pending_data;
+	u64 pending_pos;
+
+	/* Buffers to temporarily store the calculated data block hashes */
+	u8 hash1[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 hash2[FS_VERITY_MAX_DIGEST_SIZE];
+};
+
+static inline void
+fsverity_init_verification_context(struct fsverity_verification_context *ctx,
+				   struct inode *inode,
+				   unsigned long max_ra_pages)
+{
+	ctx->inode = inode;
+	ctx->vi = inode->i_verity_info;
+	ctx->max_ra_pages = max_ra_pages;
+	ctx->pending_data = NULL;
+}
+
+static bool
+fsverity_finish_verification(struct fsverity_verification_context *ctx)
+{
+	int err;
+
+	if (ctx->pending_data == NULL)
+		return true;
+	/*
+	 * Multibuffer hashing is enabled but there was an odd number of data
+	 * blocks.  Hash and verify the last block by itself.
+	 */
+	err = fsverity_hash_block(&ctx->vi->tree_params, ctx->inode,
+				  ctx->pending_data, ctx->hash1);
+	kunmap_local(ctx->pending_data);
+	ctx->pending_data = NULL;
+	return err == 0 &&
+	       verify_data_block(ctx->inode, ctx->vi, ctx->hash1,
+				 ctx->pending_pos, ctx->max_ra_pages);
+}
+
+static inline void
+fsverity_abort_verification(struct fsverity_verification_context *ctx)
+{
+	if (ctx->pending_data) {
+		kunmap_local(ctx->pending_data);
+		ctx->pending_data = NULL;
+	}
+}
+
 static bool
-verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
-		   unsigned long max_ra_pages)
+fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
+			 struct folio *data_folio, size_t len, size_t offset)
 {
-	struct inode *inode = data_folio->mapping->host;
-	struct fsverity_info *vi = inode->i_verity_info;
-	const unsigned int block_size = vi->tree_params.block_size;
-	u64 pos = (u64)data_folio->index << PAGE_SHIFT;
+	struct inode *inode = ctx->inode;
+	struct fsverity_info *vi = ctx->vi;
+	const struct merkle_tree_params *params = &vi->tree_params;
+	const unsigned int block_size = params->block_size;
+	const bool multibuffer = params->hash_alg->supports_multibuffer;
+	u64 pos = ((u64)data_folio->index << PAGE_SHIFT) + offset;
 
 	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
 		return false;
 	if (WARN_ON_ONCE(!folio_test_locked(data_folio) ||
 			 folio_test_uptodate(data_folio)))
 		return false;
 	do {
-		void *data;
-		bool valid;
-
-		data = kmap_local_folio(data_folio, offset);
-		valid = verify_data_block(inode, vi, data, pos + offset,
-					  max_ra_pages);
-		kunmap_local(data);
-		if (!valid)
-			return false;
+		const void *data = kmap_local_folio(data_folio, offset);
+		int err;
+
+		if (multibuffer) {
+			if (ctx->pending_data) {
+				/* Hash and verify two data blocks. */
+				err = fsverity_hash_2_blocks(params,
+							     inode,
+							     ctx->pending_data,
+							     data,
+							     ctx->hash1,
+							     ctx->hash2);
+				kunmap_local(data);
+				kunmap_local(ctx->pending_data);
+				ctx->pending_data = NULL;
+				if (err != 0 ||
+				    !verify_data_block(inode, vi, ctx->hash1,
+						       ctx->pending_pos,
+						       ctx->max_ra_pages) ||
+				    !verify_data_block(inode, vi, ctx->hash2,
+						       pos, ctx->max_ra_pages))
+					return false;
+			} else {
+				/* Wait and see if there's another block. */
+				ctx->pending_data = data;
+				ctx->pending_pos = pos;
+			}
+		} else {
+			/* Hash and verify one data block. */
+			err = fsverity_hash_block(params, inode, data,
+						  ctx->hash1);
+			kunmap_local(data);
+			if (err != 0 ||
+			    !verify_data_block(inode, vi, ctx->hash1,
+					       pos, ctx->max_ra_pages))
+				return false;
+		}
+		pos += block_size;
 		offset += block_size;
 		len -= block_size;
 	} while (len);
 	return true;
 }
@@ -284,11 +380,19 @@  verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
  *
  * Return: %true if the data is valid, else %false.
  */
 bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
 {
-	return verify_data_blocks(folio, len, offset, 0);
+	struct fsverity_verification_context ctx;
+
+	fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+
+	if (!fsverity_add_data_blocks(&ctx, folio, len, offset)) {
+		fsverity_abort_verification(&ctx);
+		return false;
+	}
+	return fsverity_finish_verification(&ctx);
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
 
 #ifdef CONFIG_BLOCK
 /**
@@ -305,10 +409,12 @@  EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
  * filesystems) must instead call fsverity_verify_page() directly on each page.
  * All filesystems must also call fsverity_verify_page() on holes.
  */
 void fsverity_verify_bio(struct bio *bio)
 {
+	struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+	struct fsverity_verification_context ctx;
 	struct folio_iter fi;
 	unsigned long max_ra_pages = 0;
 
 	if (bio->bi_opf & REQ_RAHEAD) {
 		/*
@@ -321,17 +427,26 @@  void fsverity_verify_bio(struct bio *bio)
 		 * reduces the number of I/O requests made to the Merkle tree.
 		 */
 		max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
 	}
 
+	fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
 	bio_for_each_folio_all(fi, bio) {
-		if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
-					max_ra_pages)) {
-			bio->bi_status = BLK_STS_IOERR;
-			break;
+		if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+					      fi.offset)) {
+			fsverity_abort_verification(&ctx);
+			goto ioerr;
 		}
 	}
+
+	if (!fsverity_finish_verification(&ctx))
+		goto ioerr;
+	return;
+
+ioerr:
+	bio->bi_status = BLK_STS_IOERR;
 }
 EXPORT_SYMBOL_GPL(fsverity_verify_bio);
 #endif /* CONFIG_BLOCK */
 
 /**