
[v2,21/23] xfs: handle merkle tree block size != fs blocksize != PAGE_SIZE

Message ID 20230404145319.2057051-22-aalbersh@redhat.com (mailing list archive)
State Deferred, archived
Series fs-verity support for XFS

Commit Message

Andrey Albershteyn April 4, 2023, 2:53 p.m. UTC
When the Merkle tree block size differs from the page size, fs-verity
expects ->read_merkle_tree_page() to return a page filled with multiple
Merkle tree blocks. XFS stores each Merkle tree block under an
extended attribute; the attributes are keyed by the block's byte
offset into the Merkle tree.

This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
blocks based on the block-to-page size ratio. The reference to each
xfs_buf is also passed via page->private to ->drop_page().

Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
---
 fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_verity.h |  8 +++++
 2 files changed, 66 insertions(+), 16 deletions(-)

Comments

Darrick J. Wong April 4, 2023, 4:36 p.m. UTC | #1
On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> When the Merkle tree block size differs from the page size, fs-verity
> expects ->read_merkle_tree_page() to return a page filled with multiple
> Merkle tree blocks. XFS stores each Merkle tree block under an
> extended attribute; the attributes are keyed by the block's byte
> offset into the Merkle tree.
>
> This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> blocks based on the block-to-page size ratio. The reference to each
> xfs_buf is also passed via page->private to ->drop_page().
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
>  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
>  fs/xfs/xfs_verity.h |  8 +++++
>  2 files changed, 66 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> index a9874ff4efcd..ef0aff216f06 100644
> --- a/fs/xfs/xfs_verity.c
> +++ b/fs/xfs/xfs_verity.c
> @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
>  	struct page		*page = NULL;
>  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
>  	uint32_t		bs = 1 << log_blocksize;
> +	int			blocks_per_page =
> +		(1 << (PAGE_SHIFT - log_blocksize));
> +	int			n = 0;
> +	int			offset = 0;
>  	struct xfs_da_args	args = {
>  		.dp		= ip,
>  		.attr_filter	= XFS_ATTR_VERITY,
> @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
>  		.valuelen	= bs,
>  	};
>  	int			error = 0;
> +	bool			is_checked = true;
> +	struct xfs_verity_buf_list	*buf_list;
>  
>  	page = alloc_page(GFP_KERNEL);
>  	if (!page)
>  		return ERR_PTR(-ENOMEM);
>  
> -	error = xfs_attr_get(&args);
> -	if (error) {
> -		kmem_free(args.value);
> -		xfs_buf_rele(args.bp);
> +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> +	if (!buf_list) {
>  		put_page(page);
> -		return ERR_PTR(-EFAULT);
> +		return ERR_PTR(-ENOMEM);
>  	}
>  
> -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> +	/*
> +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> +	 * PAGE_SIZE
> +	 */
> +	for (n = 0; n < blocks_per_page; n++) {

Ahah, ok, that's why we can't pass the xfs_buf pages up to fsverity.

> +		offset = bs * n;
> +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));

Really this ought to be a typechecked helper...

struct xfs_fsverity_merkle_key {
	__be64	merkleoff;
};

static inline void
xfs_fsverity_merkle_key_to_disk(struct xfs_fsverity_merkle_key *k, loff_t pos)
{
	k->merkleoff = cpu_to_be64(pos);
}
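
Then the loop body becomes something like (sketch; error handling and
the rest of the args setup unchanged):

	struct xfs_fsverity_merkle_key	key;

	xfs_fsverity_merkle_key_to_disk(&key, (index << PAGE_SHIFT) + offset);
	args.name = (const uint8_t *)&key;
	args.namelen = sizeof(key);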



> +		args.name = (const uint8_t *)&name;
> +
> +		error = xfs_attr_get(&args);
> +		if (error) {
> +			kmem_free(args.value);
> +			/*
> +			 * No more Merkle tree blocks (e.g. this was the last
> +			 * block of the tree)
> +			 */
> +			if (error == -ENOATTR)
> +				break;
> +			xfs_buf_rele(args.bp);
> +			put_page(page);
> +			kmem_free(buf_list);
> +			return ERR_PTR(-EFAULT);
> +		}
> +
> +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> +
> +		/* One of the buffers was dropped */
> +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> +			is_checked = false;

If there's enough memory pressure to cause the merkle tree pages to get
evicted, what are the chances that the xfs_bufs survive the eviction?

> +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> +		kmem_free(args.value);
> +		args.value = NULL;
> +	}
> +
> +	if (is_checked)
>  		SetPageChecked(page);
> +	page->private = (unsigned long)buf_list;
>  
> -	page->private = (unsigned long)args.bp;
> -	memcpy(page_address(page), args.value, args.valuelen);
> -
> -	kmem_free(args.value);
>  	return page;
>  }
>  
> @@ -191,16 +228,21 @@ xfs_write_merkle_tree_block(
>  
>  static void
>  xfs_drop_page(
> -	struct page	*page)
> +	struct page			*page)
>  {
> -	struct xfs_buf *buf = (struct xfs_buf *)page->private;
> +	int				i = 0;
> +	struct xfs_verity_buf_list	*buf_list =
> +		(struct xfs_verity_buf_list *)page->private;
>  
> -	ASSERT(buf != NULL);
> +	ASSERT(buf_list != NULL);
>  
> -	if (PageChecked(page))
> -		buf->b_flags |= XBF_VERITY_CHECKED;
> +	for (i = 0; i < buf_list->buf_count; i++) {
> +		if (PageChecked(page))
> +			buf_list->bufs[i]->b_flags |= XBF_VERITY_CHECKED;
> +		xfs_buf_rele(buf_list->bufs[i]);
> +	}
>  
> -	xfs_buf_rele(buf);
> +	kmem_free(buf_list);
>  	put_page(page);
>  }
>  
> diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
> index ae5d87ca32a8..433b2f4ae3bc 100644
> --- a/fs/xfs/xfs_verity.h
> +++ b/fs/xfs/xfs_verity.h
> @@ -16,4 +16,12 @@ extern const struct fsverity_operations xfs_verity_ops;
>  #define xfs_verity_ops NULL
>  #endif	/* CONFIG_FS_VERITY */
>  
> +/* Minimum Merkle tree block size is 1024 bytes */
> +#define XFS_VERITY_MAX_MBLOCKS_PER_PAGE (1 << (PAGE_SHIFT - 10))
> +
> +struct xfs_verity_buf_list {
> +	unsigned int	buf_count;
> +	struct xfs_buf	*bufs[XFS_VERITY_MAX_MBLOCKS_PER_PAGE];

So... this is going to be a 520-byte allocation on arm64 with 64k pages?
Even if the merkle tree block size is the same as the page size?  Ouch.
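
(For reference: 1 << (16 - 10) = 64 buffer pointers at 8 bytes each,
plus buf_count and padding, i.e. 512 + 8 = 520 bytes.)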

--D

> +};
> +
>  #endif	/* __XFS_VERITY_H__ */
> -- 
> 2.38.4
>
Eric Biggers April 4, 2023, 11:32 p.m. UTC | #2
Hi Andrey,

On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> When the Merkle tree block size differs from the page size, fs-verity
> expects ->read_merkle_tree_page() to return a page filled with multiple
> Merkle tree blocks. XFS stores each Merkle tree block under an
> extended attribute; the attributes are keyed by the block's byte
> offset into the Merkle tree.
>
> This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> blocks based on the block-to-page size ratio. The reference to each
> xfs_buf is also passed via page->private to ->drop_page().
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> ---
>  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
>  fs/xfs/xfs_verity.h |  8 +++++
>  2 files changed, 66 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> index a9874ff4efcd..ef0aff216f06 100644
> --- a/fs/xfs/xfs_verity.c
> +++ b/fs/xfs/xfs_verity.c
> @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
>  	struct page		*page = NULL;
>  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
>  	uint32_t		bs = 1 << log_blocksize;
> +	int			blocks_per_page =
> +		(1 << (PAGE_SHIFT - log_blocksize));
> +	int			n = 0;
> +	int			offset = 0;
>  	struct xfs_da_args	args = {
>  		.dp		= ip,
>  		.attr_filter	= XFS_ATTR_VERITY,
> @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
>  		.valuelen	= bs,
>  	};
>  	int			error = 0;
> +	bool			is_checked = true;
> +	struct xfs_verity_buf_list	*buf_list;
>  
>  	page = alloc_page(GFP_KERNEL);
>  	if (!page)
>  		return ERR_PTR(-ENOMEM);
>  
> -	error = xfs_attr_get(&args);
> -	if (error) {
> -		kmem_free(args.value);
> -		xfs_buf_rele(args.bp);
> +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> +	if (!buf_list) {
>  		put_page(page);
> -		return ERR_PTR(-EFAULT);
> +		return ERR_PTR(-ENOMEM);
>  	}
>  
> -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> +	/*
> +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> +	 * PAGE_SIZE
> +	 */
> +	for (n = 0; n < blocks_per_page; n++) {
> +		offset = bs * n;
> +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
> +		args.name = (const uint8_t *)&name;
> +
> +		error = xfs_attr_get(&args);
> +		if (error) {
> +			kmem_free(args.value);
> +			/*
> +			 * No more Merkle tree blocks (e.g. this was the last
> +			 * block of the tree)
> +			 */
> +			if (error == -ENOATTR)
> +				break;
> +			xfs_buf_rele(args.bp);
> +			put_page(page);
> +			kmem_free(buf_list);
> +			return ERR_PTR(-EFAULT);
> +		}
> +
> +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> +
> +		/* One of the buffers was dropped */
> +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> +			is_checked = false;
> +
> +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> +		kmem_free(args.value);
> +		args.value = NULL;
> +	}

I was really hoping for a solution where the cached data can be used directly,
instead of allocating a temporary page and copying the cached data into it every
time the cache is accessed.  The problem with what you have now is that every
time a single 32-byte hash is accessed, a full page (potentially 64KB!) will be
allocated and filled.  That's not very efficient.  The need to allocate the
temporary page can also cause ENOMEM (which will get reported as EIO).

Did you consider alternatives that would work more efficiently?  I think it
would be worth designing something that works properly with how XFS is planned
to cache the Merkle tree, instead of designing a workaround.
->read_merkle_tree_page was not really designed for what you are doing here.

How about replacing ->read_merkle_tree_page with a function that takes in a
Merkle tree block index (not a page index!) and hands back a (page, offset) pair
that identifies where the Merkle tree block's data is located?  Or (folio,
offset), I suppose.

With that, would it be possible to directly return the XFS cache?
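
As a rough sketch (every name here is made up; this is not an existing
fs/verity API):

	/* Where a Merkle tree block's data lives in the fs's cache. */
	struct fsverity_blockbuf {
		struct folio	*folio;
		size_t		offset;	/* byte offset of the block */
	};

	/* 'index' is a Merkle tree *block* index, not a page index. */
	int (*read_merkle_tree_block)(struct inode *inode, u64 index,
				      struct fsverity_blockbuf *buf);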

- Eric
Andrey Albershteyn April 5, 2023, 3:12 p.m. UTC | #3
Hi Eric,

On Tue, Apr 04, 2023 at 04:32:24PM -0700, Eric Biggers wrote:
> Hi Andrey,
> 
> On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> > When the Merkle tree block size differs from the page size, fs-verity
> > expects ->read_merkle_tree_page() to return a page filled with multiple
> > Merkle tree blocks. XFS stores each Merkle tree block under an
> > extended attribute; the attributes are keyed by the block's byte
> > offset into the Merkle tree.
> >
> > This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> > blocks based on the block-to-page size ratio. The reference to each
> > xfs_buf is also passed via page->private to ->drop_page().
> > 
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
> >  fs/xfs/xfs_verity.h |  8 +++++
> >  2 files changed, 66 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> > index a9874ff4efcd..ef0aff216f06 100644
> > --- a/fs/xfs/xfs_verity.c
> > +++ b/fs/xfs/xfs_verity.c
> > @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
> >  	struct page		*page = NULL;
> >  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
> >  	uint32_t		bs = 1 << log_blocksize;
> > +	int			blocks_per_page =
> > +		(1 << (PAGE_SHIFT - log_blocksize));
> > +	int			n = 0;
> > +	int			offset = 0;
> >  	struct xfs_da_args	args = {
> >  		.dp		= ip,
> >  		.attr_filter	= XFS_ATTR_VERITY,
> > @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
> >  		.valuelen	= bs,
> >  	};
> >  	int			error = 0;
> > +	bool			is_checked = true;
> > +	struct xfs_verity_buf_list	*buf_list;
> >  
> >  	page = alloc_page(GFP_KERNEL);
> >  	if (!page)
> >  		return ERR_PTR(-ENOMEM);
> >  
> > -	error = xfs_attr_get(&args);
> > -	if (error) {
> > -		kmem_free(args.value);
> > -		xfs_buf_rele(args.bp);
> > +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> > +	if (!buf_list) {
> >  		put_page(page);
> > -		return ERR_PTR(-EFAULT);
> > +		return ERR_PTR(-ENOMEM);
> >  	}
> >  
> > -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> > +	/*
> > +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> > +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> > +	 * PAGE_SIZE
> > +	 */
> > +	for (n = 0; n < blocks_per_page; n++) {
> > +		offset = bs * n;
> > +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
> > +		args.name = (const uint8_t *)&name;
> > +
> > +		error = xfs_attr_get(&args);
> > +		if (error) {
> > +			kmem_free(args.value);
> > +			/*
> > +			 * No more Merkle tree blocks (e.g. this was the last
> > +			 * block of the tree)
> > +			 */
> > +			if (error == -ENOATTR)
> > +				break;
> > +			xfs_buf_rele(args.bp);
> > +			put_page(page);
> > +			kmem_free(buf_list);
> > +			return ERR_PTR(-EFAULT);
> > +		}
> > +
> > +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> > +
> > +		/* One of the buffers was dropped */
> > +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> > +			is_checked = false;
> > +
> > +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> > +		kmem_free(args.value);
> > +		args.value = NULL;
> > +	}
> 
> I was really hoping for a solution where the cached data can be used directly,
> instead of allocating a temporary page and copying the cached data into it every
> time the cache is accessed.  The problem with what you have now is that every
> time a single 32-byte hash is accessed, a full page (potentially 64KB!) will be
> allocated and filled.  That's not very efficient.  The need to allocate the
> temporary page can also cause ENOMEM (which will get reported as EIO).
> 
> Did you consider alternatives that would work more efficiently?  I think it
> would be worth designing something that works properly with how XFS is planned
> to cache the Merkle tree, instead of designing a workaround.
> ->read_merkle_tree_page was not really designed for what you are doing here.
> 
> How about replacing ->read_merkle_tree_page with a function that takes in a
> Merkle tree block index (not a page index!) and hands back a (page, offset) pair
> that identifies where the Merkle tree block's data is located?  Or (folio,
> offset), I suppose.
> 
> With that, would it be possible to directly return the XFS cache?
> 
> - Eric
> 

Yeah, I also don't like it; I didn't want to change fs-verity much,
so I went with this workaround. But as that's OK, I will look into
passing xfs_bufs to fs-verity without direct use of
->read_merkle_tree_page(). I think it's possible with (folio,
offset); the xattr values aren't page-aligned within the xfs_bufs,
so a 4k Merkle tree block can end up stored across two pages.

Thanks for suggestion!
Andrey Albershteyn April 5, 2023, 4:02 p.m. UTC | #4
Hi Darrick,

On Tue, Apr 04, 2023 at 09:36:02AM -0700, Darrick J. Wong wrote:
> On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> > When the Merkle tree block size differs from the page size, fs-verity
> > expects ->read_merkle_tree_page() to return a page filled with multiple
> > Merkle tree blocks. XFS stores each Merkle tree block under an
> > extended attribute; the attributes are keyed by the block's byte
> > offset into the Merkle tree.
> >
> > This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> > blocks based on the block-to-page size ratio. The reference to each
> > xfs_buf is also passed via page->private to ->drop_page().
> > 
> > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > ---
> >  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
> >  fs/xfs/xfs_verity.h |  8 +++++
> >  2 files changed, 66 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> > index a9874ff4efcd..ef0aff216f06 100644
> > --- a/fs/xfs/xfs_verity.c
> > +++ b/fs/xfs/xfs_verity.c
> > @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
> >  	struct page		*page = NULL;
> >  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
> >  	uint32_t		bs = 1 << log_blocksize;
> > +	int			blocks_per_page =
> > +		(1 << (PAGE_SHIFT - log_blocksize));
> > +	int			n = 0;
> > +	int			offset = 0;
> >  	struct xfs_da_args	args = {
> >  		.dp		= ip,
> >  		.attr_filter	= XFS_ATTR_VERITY,
> > @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
> >  		.valuelen	= bs,
> >  	};
> >  	int			error = 0;
> > +	bool			is_checked = true;
> > +	struct xfs_verity_buf_list	*buf_list;
> >  
> >  	page = alloc_page(GFP_KERNEL);
> >  	if (!page)
> >  		return ERR_PTR(-ENOMEM);
> >  
> > -	error = xfs_attr_get(&args);
> > -	if (error) {
> > -		kmem_free(args.value);
> > -		xfs_buf_rele(args.bp);
> > +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> > +	if (!buf_list) {
> >  		put_page(page);
> > -		return ERR_PTR(-EFAULT);
> > +		return ERR_PTR(-ENOMEM);
> >  	}
> >  
> > -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> > +	/*
> > +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> > +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> > +	 * PAGE_SIZE
> > +	 */
> > +	for (n = 0; n < blocks_per_page; n++) {
> 
> Ahah, ok, that's why we can't pass the xfs_buf pages up to fsverity.
> 
> > +		offset = bs * n;
> > +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
> 
> Really this ought to be a typechecked helper...
> 
> struct xfs_fsverity_merkle_key {
> 	__be64	merkleoff;

Sure, thanks, will change this

> };
> 
> static inline void
> xfs_fsverity_merkle_key_to_disk(struct xfs_fsverity_merkle_key *k, loff_t pos)
> {
> 	k->merkleoff = cpu_to_be64(pos);
> }
> 
> 
> 
> > +		args.name = (const uint8_t *)&name;
> > +
> > +		error = xfs_attr_get(&args);
> > +		if (error) {
> > +			kmem_free(args.value);
> > +			/*
> > +			 * No more Merkle tree blocks (e.g. this was the last
> > +			 * block of the tree)
> > +			 */
> > +			if (error == -ENOATTR)
> > +				break;
> > +			xfs_buf_rele(args.bp);
> > +			put_page(page);
> > +			kmem_free(buf_list);
> > +			return ERR_PTR(-EFAULT);
> > +		}
> > +
> > +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> > +
> > +		/* One of the buffers was dropped */
> > +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> > +			is_checked = false;
> 
> If there's enough memory pressure to cause the merkle tree pages to get
> evicted, what are the chances that the xfs_bufs survive the eviction?

The Merkle tree pages are dropped after verification. When a page is
dropped, the xfs_buf is marked as verified. If fs-verity wants to
verify again, it will get the same verified buffer. If the buffer is
evicted, it no longer has the verified state.

So, with enough memory pressure, buffers will be dropped and will
need to be reverified.
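
I.e. the round trip with the code above is (sketch):

	->drop_page():   bp->b_flags |= XBF_VERITY_CHECKED (if PageChecked)
	next read:       SetPageChecked() only if every buf has the flag
	buffer eviction: flag is lost, so the block gets reverified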

> 
> > +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> > +		kmem_free(args.value);
> > +		args.value = NULL;
> > +	}
> > +
> > +	if (is_checked)
> >  		SetPageChecked(page);
> > +	page->private = (unsigned long)buf_list;
> >  
> > -	page->private = (unsigned long)args.bp;
> > -	memcpy(page_address(page), args.value, args.valuelen);
> > -
> > -	kmem_free(args.value);
> >  	return page;
> >  }
> >  
> > @@ -191,16 +228,21 @@ xfs_write_merkle_tree_block(
> >  
> >  static void
> >  xfs_drop_page(
> > -	struct page	*page)
> > +	struct page			*page)
> >  {
> > -	struct xfs_buf *buf = (struct xfs_buf *)page->private;
> > +	int				i = 0;
> > +	struct xfs_verity_buf_list	*buf_list =
> > +		(struct xfs_verity_buf_list *)page->private;
> >  
> > -	ASSERT(buf != NULL);
> > +	ASSERT(buf_list != NULL);
> >  
> > -	if (PageChecked(page))
> > -		buf->b_flags |= XBF_VERITY_CHECKED;
> > +	for (i = 0; i < buf_list->buf_count; i++) {
> > +		if (PageChecked(page))
> > +			buf_list->bufs[i]->b_flags |= XBF_VERITY_CHECKED;
> > +		xfs_buf_rele(buf_list->bufs[i]);
> > +	}
> >  
> > -	xfs_buf_rele(buf);
> > +	kmem_free(buf_list);
> >  	put_page(page);
> >  }
> >  
> > diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
> > index ae5d87ca32a8..433b2f4ae3bc 100644
> > --- a/fs/xfs/xfs_verity.h
> > +++ b/fs/xfs/xfs_verity.h
> > @@ -16,4 +16,12 @@ extern const struct fsverity_operations xfs_verity_ops;
> >  #define xfs_verity_ops NULL
> >  #endif	/* CONFIG_FS_VERITY */
> >  
> > +/* Minimum Merkle tree block size is 1024 bytes */
> > +#define XFS_VERITY_MAX_MBLOCKS_PER_PAGE (1 << (PAGE_SHIFT - 10))
> > +
> > +struct xfs_verity_buf_list {
> > +	unsigned int	buf_count;
> > +	struct xfs_buf	*bufs[XFS_VERITY_MAX_MBLOCKS_PER_PAGE];
> 
> So... this is going to be a 520-byte allocation on arm64 with 64k pages?
> Even if the merkle tree block size is the same as the page size?  Ouch.

Yeah, it also allocates a page and is dropped with the page, so
I took it as an addition to an already big chunk of memory. But I
will probably change it, per Eric's comment on this patch.
Darrick J. Wong April 5, 2023, 4:38 p.m. UTC | #5
On Wed, Apr 05, 2023 at 06:02:21PM +0200, Andrey Albershteyn wrote:
> Hi Darrick,
> 
> On Tue, Apr 04, 2023 at 09:36:02AM -0700, Darrick J. Wong wrote:
> > On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> > > When the Merkle tree block size differs from the page size, fs-verity
> > > expects ->read_merkle_tree_page() to return a page filled with multiple
> > > Merkle tree blocks. XFS stores each Merkle tree block under an
> > > extended attribute; the attributes are keyed by the block's byte
> > > offset into the Merkle tree.
> > >
> > > This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> > > blocks based on the block-to-page size ratio. The reference to each
> > > xfs_buf is also passed via page->private to ->drop_page().
> > > 
> > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > ---
> > >  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
> > >  fs/xfs/xfs_verity.h |  8 +++++
> > >  2 files changed, 66 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> > > index a9874ff4efcd..ef0aff216f06 100644
> > > --- a/fs/xfs/xfs_verity.c
> > > +++ b/fs/xfs/xfs_verity.c
> > > @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
> > >  	struct page		*page = NULL;
> > >  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
> > >  	uint32_t		bs = 1 << log_blocksize;
> > > +	int			blocks_per_page =
> > > +		(1 << (PAGE_SHIFT - log_blocksize));
> > > +	int			n = 0;
> > > +	int			offset = 0;
> > >  	struct xfs_da_args	args = {
> > >  		.dp		= ip,
> > >  		.attr_filter	= XFS_ATTR_VERITY,
> > > @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
> > >  		.valuelen	= bs,
> > >  	};
> > >  	int			error = 0;
> > > +	bool			is_checked = true;
> > > +	struct xfs_verity_buf_list	*buf_list;
> > >  
> > >  	page = alloc_page(GFP_KERNEL);
> > >  	if (!page)
> > >  		return ERR_PTR(-ENOMEM);
> > >  
> > > -	error = xfs_attr_get(&args);
> > > -	if (error) {
> > > -		kmem_free(args.value);
> > > -		xfs_buf_rele(args.bp);
> > > +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> > > +	if (!buf_list) {
> > >  		put_page(page);
> > > -		return ERR_PTR(-EFAULT);
> > > +		return ERR_PTR(-ENOMEM);
> > >  	}
> > >  
> > > -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> > > +	/*
> > > +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> > > +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> > > +	 * PAGE_SIZE
> > > +	 */
> > > +	for (n = 0; n < blocks_per_page; n++) {
> > 
> > Ahah, ok, that's why we can't pass the xfs_buf pages up to fsverity.
> > 
> > > +		offset = bs * n;
> > > +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
> > 
> > Really this ought to be a typechecked helper...
> > 
> > struct xfs_fsverity_merkle_key {
> > 	__be64	merkleoff;
> 
> Sure, thanks, will change this
> 
> > };
> > 
> > static inline void
> > xfs_fsverity_merkle_key_to_disk(struct xfs_fsverity_merkle_key *k, loff_t pos)
> > {
> > 	k->merkleoff = cpu_to_be64(pos);
> > }
> > 
> > 
> > 
> > > +		args.name = (const uint8_t *)&name;
> > > +
> > > +		error = xfs_attr_get(&args);
> > > +		if (error) {
> > > +			kmem_free(args.value);
> > > +			/*
> > > +			 * No more Merkle tree blocks (e.g. this was the last
> > > +			 * block of the tree)
> > > +			 */
> > > +			if (error == -ENOATTR)
> > > +				break;
> > > +			xfs_buf_rele(args.bp);
> > > +			put_page(page);
> > > +			kmem_free(buf_list);
> > > +			return ERR_PTR(-EFAULT);
> > > +		}
> > > +
> > > +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> > > +
> > > +		/* One of the buffers was dropped */
> > > +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> > > +			is_checked = false;
> > 
> > If there's enough memory pressure to cause the merkle tree pages to get
> > evicted, what are the chances that the xfs_bufs survive the eviction?
> 
> The Merkle tree pages are dropped after verification. When a page is
> dropped, the xfs_buf is marked as verified. If fs-verity wants to
> verify again, it will get the same verified buffer. If the buffer is
> evicted, it no longer has the verified state.
>
> So, with enough memory pressure, buffers will be dropped and will
> need to be reverified.

Please excuse me if this was discussed and rejected long ago, but
perhaps fsverity should try to hang on to the merkle tree pages that
this function returns for as long as possible until reclaim comes for
them?

With the merkle tree page lifetimes extended, you then don't need to
attach the xfs_buf to page->private, nor does xfs have to extend the
buffer cache to stash XBF_VERITY_CHECKED.

Also kinda wondering why you don't allocate the page, kmap it, and then
pass that address into args->value to avoid the third memory allocation
that gets done inside xfs_attr_get?
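
I.e. something like this (sketch, error handling elided):

	args.value = page_address(page) + offset;
	args.valuelen = bs;

	error = xfs_attr_get(&args);
	...
	/* no kmem_free(args.value) -- it points into the page */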

--D

> > 
> > > +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> > > +		kmem_free(args.value);
> > > +		args.value = NULL;
> > > +	}
> > > +
> > > +	if (is_checked)
> > >  		SetPageChecked(page);
> > > +	page->private = (unsigned long)buf_list;
> > >  
> > > -	page->private = (unsigned long)args.bp;
> > > -	memcpy(page_address(page), args.value, args.valuelen);
> > > -
> > > -	kmem_free(args.value);
> > >  	return page;
> > >  }
> > >  
> > > @@ -191,16 +228,21 @@ xfs_write_merkle_tree_block(
> > >  
> > >  static void
> > >  xfs_drop_page(
> > > -	struct page	*page)
> > > +	struct page			*page)
> > >  {
> > > -	struct xfs_buf *buf = (struct xfs_buf *)page->private;
> > > +	int				i = 0;
> > > +	struct xfs_verity_buf_list	*buf_list =
> > > +		(struct xfs_verity_buf_list *)page->private;
> > >  
> > > -	ASSERT(buf != NULL);
> > > +	ASSERT(buf_list != NULL);
> > >  
> > > -	if (PageChecked(page))
> > > -		buf->b_flags |= XBF_VERITY_CHECKED;
> > > +	for (i = 0; i < buf_list->buf_count; i++) {
> > > +		if (PageChecked(page))
> > > +			buf_list->bufs[i]->b_flags |= XBF_VERITY_CHECKED;
> > > +		xfs_buf_rele(buf_list->bufs[i]);
> > > +	}
> > >  
> > > -	xfs_buf_rele(buf);
> > > +	kmem_free(buf_list);
> > >  	put_page(page);
> > >  }
> > >  
> > > diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
> > > index ae5d87ca32a8..433b2f4ae3bc 100644
> > > --- a/fs/xfs/xfs_verity.h
> > > +++ b/fs/xfs/xfs_verity.h
> > > @@ -16,4 +16,12 @@ extern const struct fsverity_operations xfs_verity_ops;
> > >  #define xfs_verity_ops NULL
> > >  #endif	/* CONFIG_FS_VERITY */
> > >  
> > > +/* Minimum Merkle tree block size is 1024 bytes */
> > > +#define XFS_VERITY_MAX_MBLOCKS_PER_PAGE (1 << (PAGE_SHIFT - 10))
> > > +
> > > +struct xfs_verity_buf_list {
> > > +	unsigned int	buf_count;
> > > +	struct xfs_buf	*bufs[XFS_VERITY_MAX_MBLOCKS_PER_PAGE];
> > 
> > So... this is going to be a 520-byte allocation on arm64 with 64k pages?
> > Even if the merkle tree block size is the same as the page size?  Ouch.
> 
> Yeah, it also allocates a page and is dropped with the page, so
> I took it as an addition to an already big chunk of memory. But I
> will probably change it, per Eric's comment on this patch.
> 
> -- 
> - Andrey
>
Eric Biggers April 5, 2023, 6:16 p.m. UTC | #6
On Wed, Apr 05, 2023 at 09:38:47AM -0700, Darrick J. Wong wrote:
> > The Merkle tree pages are dropped after verification. When a page is
> > dropped, the xfs_buf is marked as verified. If fs-verity wants to
> > verify again, it will get the same verified buffer. If the buffer is
> > evicted, it no longer has the verified state.
> >
> > So, with enough memory pressure, buffers will be dropped and will
> > need to be reverified.
> 
> Please excuse me if this was discussed and rejected long ago, but
> perhaps fsverity should try to hang on to the merkle tree pages that
> this function returns for as long as possible until reclaim comes for
> them?
> 
> With the merkle tree page lifetimes extended, you then don't need to
> attach the xfs_buf to page->private, nor does xfs have to extend the
> buffer cache to stash XBF_VERITY_CHECKED.

Well, all the other filesystems that support fsverity (ext4, f2fs, and btrfs)
just cache the Merkle tree pages in the inode's page cache.  It's an approach
that I know some people aren't a fan of, but it's efficient and it works.
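
Roughly, the tree blocks live in i_mapping past i_size, so reading a
tree page reduces to something like this (simplified; the real
implementations also do readahead, and merkle_tree_pos() here stands
in for the fs-specific tree offset):

	pgoff_t tree_pgoff = merkle_tree_pos(inode) >> PAGE_SHIFT;

	return read_mapping_page(inode->i_mapping, tree_pgoff + index, NULL);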

We could certainly think about moving to a design where fs/verity/ asks the
filesystem to just *read* a Merkle tree block, without adding it to a cache, and
then fs/verity/ implements the caching itself.  That would require some large
changes to each filesystem, though, unless we were to double-cache the Merkle
tree blocks which would be inefficient.

So it feels like continuing to have the filesystem (not fs/verity/) be
responsible for the cache is the best way to allow XFS to do things a bit
differently, without regressing the other filesystems.

I'm interested in hearing any other proposals, though.

- Eric
Dave Chinner April 5, 2023, 10:26 p.m. UTC | #7
On Wed, Apr 05, 2023 at 06:16:00PM +0000, Eric Biggers wrote:
> On Wed, Apr 05, 2023 at 09:38:47AM -0700, Darrick J. Wong wrote:
> > > The Merkle tree pages are dropped after verification. When a page is
> > > dropped, the xfs_buf is marked as verified. If fs-verity wants to
> > > verify again, it will get the same verified buffer. If the buffer is
> > > evicted, it no longer has the verified state.
> > >
> > > So, with enough memory pressure, buffers will be dropped and will
> > > need to be reverified.
> > 
> > Please excuse me if this was discussed and rejected long ago, but
> > perhaps fsverity should try to hang on to the merkle tree pages that
> > this function returns for as long as possible until reclaim comes for
> > them?
> > 
> > With the merkle tree page lifetimes extended, you then don't need to
> > attach the xfs_buf to page->private, nor does xfs have to extend the
> > buffer cache to stash XBF_VERITY_CHECKED.
> 
> Well, all the other filesystems that support fsverity (ext4, f2fs, and btrfs)
> just cache the Merkle tree pages in the inode's page cache.  It's an approach
> that I know some people aren't a fan of, but it's efficient and it works.

Which puts pages beyond EOF in the page cache. Given that XFS also
allows persistent block allocation beyond EOF, having both data in the page
cache and blocks beyond EOF that contain unrelated information is a
Real Bad Idea.

Just because putting metadata in the file data address space works
for one filesystem, it doesn't mean it's a good idea or that it works
for every filesystem.

> We could certainly think about moving to a design where fs/verity/ asks the
> filesystem to just *read* a Merkle tree block, without adding it to a cache, and
> then fs/verity/ implements the caching itself.  That would require some large
> changes to each filesystem, though, unless we were to double-cache the Merkle
> tree blocks which would be inefficient.

No, that's unnecessary.

All we need is for fsverity to require filesystems to pass it byte
addressable data buffers that are externally reference counted. The
filesystem can take a page reference before mapping the page and
passing the kaddr to fsverity, then unmap and drop the reference
when the merkle tree walk is done as per Andrey's new drop callout.

fsverity doesn't need to care what the buffer is made from, how it
is cached, what it's life cycle is, etc. The caching mechanism and
reference counting is entirely controlled by the filesystem callout
implementations, and fsverity only needs to deal with memory buffers
that are guaranteed to live for the entire walk of the merkle
tree....
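
I.e. the callouts could reduce to something like this (sketch, all
names made up):

	struct fsverity_blockbuf {
		void		*kaddr;		/* byte addressable data */
		unsigned int	len;		/* merkle tree block size */
		void		*fs_private;	/* cookie for the drop callout */
	};

	/* fs pins its buffer, maps it, fills in the kaddr */
	int (*read_merkle_tree_block)(struct inode *inode, u64 pos,
				      struct fsverity_blockbuf *block);

	/* fsverity hands the buffer back when the walk is done */
	void (*drop_merkle_tree_block)(struct inode *inode,
				       struct fsverity_blockbuf *block);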

Cheers,

Dave.
Dave Chinner April 5, 2023, 10:51 p.m. UTC | #8
On Wed, Apr 05, 2023 at 05:12:34PM +0200, Andrey Albershteyn wrote:
> Hi Eric,
> 
> On Tue, Apr 04, 2023 at 04:32:24PM -0700, Eric Biggers wrote:
> > Hi Andrey,
> > 
> > On Tue, Apr 04, 2023 at 04:53:17PM +0200, Andrey Albershteyn wrote:
> > > When the Merkle tree block size differs from the page size, fs-verity
> > > expects ->read_merkle_tree_page() to return a page filled with multiple
> > > Merkle tree blocks. XFS stores each Merkle tree block under an
> > > extended attribute; the attributes are keyed by the block's byte
> > > offset into the Merkle tree.
> > >
> > > This patch makes ->read_merkle_tree_page() fetch multiple Merkle tree
> > > blocks based on the block-to-page size ratio. The reference to each
> > > xfs_buf is also passed via page->private to ->drop_page().
> > > 
> > > Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
> > > ---
> > >  fs/xfs/xfs_verity.c | 74 +++++++++++++++++++++++++++++++++++----------
> > >  fs/xfs/xfs_verity.h |  8 +++++
> > >  2 files changed, 66 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
> > > index a9874ff4efcd..ef0aff216f06 100644
> > > --- a/fs/xfs/xfs_verity.c
> > > +++ b/fs/xfs/xfs_verity.c
> > > @@ -134,6 +134,10 @@ xfs_read_merkle_tree_page(
> > >  	struct page		*page = NULL;
> > >  	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
> > >  	uint32_t		bs = 1 << log_blocksize;
> > > +	int			blocks_per_page =
> > > +		(1 << (PAGE_SHIFT - log_blocksize));
> > > +	int			n = 0;
> > > +	int			offset = 0;
> > >  	struct xfs_da_args	args = {
> > >  		.dp		= ip,
> > >  		.attr_filter	= XFS_ATTR_VERITY,
> > > @@ -143,26 +147,59 @@ xfs_read_merkle_tree_page(
> > >  		.valuelen	= bs,
> > >  	};
> > >  	int			error = 0;
> > > +	bool			is_checked = true;
> > > +	struct xfs_verity_buf_list	*buf_list;
> > >  
> > >  	page = alloc_page(GFP_KERNEL);
> > >  	if (!page)
> > >  		return ERR_PTR(-ENOMEM);
> > >  
> > > -	error = xfs_attr_get(&args);
> > > -	if (error) {
> > > -		kmem_free(args.value);
> > > -		xfs_buf_rele(args.bp);
> > > +	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
> > > +	if (!buf_list) {
> > >  		put_page(page);
> > > -		return ERR_PTR(-EFAULT);
> > > +		return ERR_PTR(-ENOMEM);
> > >  	}
> > >  
> > > -	if (args.bp->b_flags & XBF_VERITY_CHECKED)
> > > +	/*
> > > +	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
> > > +	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
> > > +	 * PAGE_SIZE
> > > +	 */
> > > +	for (n = 0; n < blocks_per_page; n++) {
> > > +		offset = bs * n;
> > > +		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
> > > +		args.name = (const uint8_t *)&name;
> > > +
> > > +		error = xfs_attr_get(&args);
> > > +		if (error) {
> > > +			kmem_free(args.value);
> > > +			/*
> > > +			 * No more Merkle tree blocks (e.g. this was the last
> > > +			 * block of the tree)
> > > +			 */
> > > +			if (error == -ENOATTR)
> > > +				break;
> > > +			xfs_buf_rele(args.bp);
> > > +			put_page(page);
> > > +			kmem_free(buf_list);
> > > +			return ERR_PTR(-EFAULT);
> > > +		}
> > > +
> > > +		buf_list->bufs[buf_list->buf_count++] = args.bp;
> > > +
> > > +		/* One of the buffers was dropped */
> > > +		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
> > > +			is_checked = false;
> > > +
> > > +		memcpy(page_address(page) + offset, args.value, args.valuelen);
> > > +		kmem_free(args.value);
> > > +		args.value = NULL;
> > > +	}
> > 
> > I was really hoping for a solution where the cached data can be used directly,
> > instead of allocating a temporary page and copying the cached data into it every
> > time the cache is accessed.  The problem with what you have now is that every
> > time a single 32-byte hash is accessed, a full page (potentially 64KB!) will be
> > allocated and filled.  That's not very efficient.  The need to allocate the
> > temporary page can also cause ENOMEM (which will get reported as EIO).
> > 
> > Did you consider alternatives that would work more efficiently?  I think it
> > would be worth designing something that works properly with how XFS is planned
> > to cache the Merkle tree, instead of designing a workaround.
> > ->read_merkle_tree_page was not really designed for what you are doing here.
> > 
> > How about replacing ->read_merkle_tree_page with a function that takes in a
> > Merkle tree block index (not a page index!) and hands back a (page, offset) pair
> > that identifies where the Merkle tree block's data is located?  Or (folio,
> > offset), I suppose.

{kaddr, len}, please.

> > 
> > With that, would it be possible to directly return the XFS cache?
> > 
> > - Eric
> > 
> 
> Yeah, I also don't like it; I didn't want to change fs-verity much,
> so I went with this workaround. But as that's OK, I will look into
> passing xfs_bufs to fs-verity without direct use of
> ->read_merkle_tree_page(). I think it's possible with (folio,
> offset); the xattr values aren't page-aligned within the xfs_bufs,
> so a 4k Merkle tree block can end up stored across two pages.

I don't think this is necessary to actually merge the code. We want
it to work correctly as the primary concern; performance is a
secondary concern.

Regardless, as you mention, the xfs_buf is not made up of contiguous
pages so the merkle tree block data will be split across two
(or more) pages.  AFAICT, the fsverity code doesn't work with data
structures that span multiple disjoint pages...

Another problem is that the xfs-buf might be backed by heap memory
(e.g. 4kB fs block size on 64kB PAGE_SIZE) and so it cannot be
treated like a page cache page by the fsverity merkle tree code. We
most definitely do not want to be passing pages containing heap
memory to functions expecting to be passed lru-resident page cache
pages....

That said, xfs-bufs do have a stable method of addressing the data
in the buffers, and all the XFS code uses this to access and
manipulate data directly in the buffers.

That is, xfs_buf_offset() returns a mapped kaddr that points to the
contiguous memory region containing the metadata in the buffer.  If
the xfs_buf spans multiple pages, it will return a kaddr pointing
into the contiguous vmapped memory address that maps the entire
buffer data range. If it is heap memory, it simply returns a pointer
into that heap memory. If it's a single page, then it returns the
kaddr for the data within the page.
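
E.g. a sketch of handing a block to fsverity this way (the fsverity
side is hypothetical, and value_offset stands in for wherever the
xattr value starts in the buffer):

	struct xfs_buf	*bp = args.bp;
	void		*kaddr;

	kaddr = xfs_buf_offset(bp, value_offset);	/* contiguous kaddr */
	/* ... pass {kaddr, bs} to fsverity; buffer reference stays held ... */
	xfs_buf_rele(bp);				/* from the drop callout */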

If you move all the assumptions about how the merkle tree data is
managed out of fsverity and require the filesystems to do the
mapping to kaddrs and reference counting to guarantee lifetimes,
then the need for multiple different methods for reading merkle tree
data goes away...

Cheers,

Dave.
Eric Biggers April 5, 2023, 10:54 p.m. UTC | #9
On Thu, Apr 06, 2023 at 08:26:46AM +1000, Dave Chinner wrote:
> > We could certainly think about moving to a design where fs/verity/ asks the
> > filesystem to just *read* a Merkle tree block, without adding it to a cache, and
> > then fs/verity/ implements the caching itself.  That would require some large
> > changes to each filesystem, though, unless we were to double-cache the Merkle
> > tree blocks which would be inefficient.
> 
> No, that's unnecessary.
> 
> All we need is for fsverity to require filesystems to pass it byte
> addressable data buffers that are externally reference counted. The
> filesystem can take a page reference before mapping the page and
> passing the kaddr to fsverity, then unmap and drop the reference
> when the merkle tree walk is done as per Andrey's new drop callout.
> 
> fsverity doesn't need to care what the buffer is made from, how it
> is cached, what it's life cycle is, etc. The caching mechanism and
> reference counting is entirely controlled by the filesystem callout
> implementations, and fsverity only needs to deal with memory buffers
> that are guaranteed to live for the entire walk of the merkle
> tree....

Sure.  Just a couple notes:

First, fs/verity/ does still need to be able to tell whether the buffer is newly
instantiated or not.

Second, fs/verity/ uses the ahash API to do the hashing.  ahash is a
scatterlist-based API.  Virtual addresses can still be used (see sg_set_buf()),
but the memory cannot be vmalloc'ed memory, since virt_to_page() needs to work.
Does XFS use vmalloc'ed memory for these buffers?
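
(Concretely, the constraint is that setting up the scatterlist does
the equivalent of:

	sg_init_one(&sg, buf, len);	/* uses virt_to_page(buf) */

which is only valid for linearly-mapped memory, not vmalloc/vmap
addresses.)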

BTW, converting fs/verity/ from ahash to shash is an option; I've really never
been a fan of the scatterlist-based crypto APIs!  The disadvantage of doing
this, though, would be that it would remove support for all the hardware crypto
drivers.  That *might* actually be okay, as that approach to crypto acceleration
has mostly fallen out of favor, in favor of CPU-based acceleration.  But I do
worry about e.g. someone coming out of the woodwork and saying they need to use
fsverity on a low-powered ARM board that has a crypto accelerator like CAAM, and
they MUST use their crypto accelerator to get acceptable performance.

- Eric
Dave Chinner April 5, 2023, 11:37 p.m. UTC | #10
On Wed, Apr 05, 2023 at 10:54:06PM +0000, Eric Biggers wrote:
> On Thu, Apr 06, 2023 at 08:26:46AM +1000, Dave Chinner wrote:
> > > We could certainly think about moving to a design where fs/verity/ asks the
> > > filesystem to just *read* a Merkle tree block, without adding it to a cache, and
> > > then fs/verity/ implements the caching itself.  That would require some large
> > > changes to each filesystem, though, unless we were to double-cache the Merkle
> > > tree blocks which would be inefficient.
> > 
> > No, that's unnecessary.
> > 
> > All we need is for fsverity to require filesystems to pass it byte
> > addressable data buffers that are externally reference counted. The
> > filesystem can take a page reference before mapping the page and
> > passing the kaddr to fsverity, then unmap and drop the reference
> > when the merkle tree walk is done as per Andrey's new drop callout.
> > 
> > fsverity doesn't need to care what the buffer is made from, how it
> > is cached, what it's life cycle is, etc. The caching mechanism and
> > reference counting is entirely controlled by the filesystem callout
> > implementations, and fsverity only needs to deal with memory buffers
> > that are guaranteed to live for the entire walk of the merkle
> > tree....
> 
> Sure.  Just a couple notes:
> 
> First, fs/verity/ does still need to be able to tell whether the buffer is newly
> instantiated or not.

Boolean flag from the caller.

> Second, fs/verity/ uses the ahash API to do the hashing.  ahash is a
> scatterlist-based API.  Virtual addresses can still be used (see sg_set_buf()),
> but the memory cannot be vmalloc'ed memory, since virt_to_page() needs to work.
> Does XFS use vmalloc'ed memory for these buffers?

Not vmalloc'ed, but vmapped. We allocate the pages individually, but
then call vm_map_ram() to present the higher level code with a
single contiguous memory range if it is a multi-page buffer.
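
I.e. roughly what _xfs_buf_map_pages() does for a multi-page buffer:

	bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1);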

We do have the backing info held in the buffer, and that's what we
use for IO. If fsverity needs a page based scatter/gather list
for hardware offload, it could ask the filesystem to provide it
for that given buffer...

> BTW, converting fs/verity/ from ahash to shash is an option; I've really never
> been a fan of the scatterlist-based crypto APIs!  The disadvantage of doing
> this, though, would be that it would remove support for all the hardware crypto
> drivers.
>
> That *might* actually be okay, as that approach to crypto acceleration
> has mostly fallen out of favor, in favor of CPU-based acceleration.  But I do
> worry about e.g. someone coming out of the woodwork and saying they need to use
> fsverity on a low-powered ARM board that has a crypto accelerator like CAAM, and
> they MUST use their crypto accelerator to get acceptable performance.

True, but we are very unlikely to be using XFS on such small
systems and I don't think we really care about XFS performance on
android sized systems, either.

Cheers,

Dave.
Eric Biggers April 6, 2023, 12:44 a.m. UTC | #11
On Thu, Apr 06, 2023 at 09:37:53AM +1000, Dave Chinner wrote:
> On Wed, Apr 05, 2023 at 10:54:06PM +0000, Eric Biggers wrote:
> > On Thu, Apr 06, 2023 at 08:26:46AM +1000, Dave Chinner wrote:
> > > > We could certainly think about moving to a design where fs/verity/ asks the
> > > > filesystem to just *read* a Merkle tree block, without adding it to a cache, and
> > > > then fs/verity/ implements the caching itself.  That would require some large
> > > > changes to each filesystem, though, unless we were to double-cache the Merkle
> > > > tree blocks which would be inefficient.
> > > 
> > > No, that's unnecessary.
> > > 
> > > All we need is for fsverity to require filesystems to pass it byte
> > > addressable data buffers that are externally reference counted. The
> > > filesystem can take a page reference before mapping the page and
> > > passing the kaddr to fsverity, then unmap and drop the reference
> > > when the merkle tree walk is done as per Andrey's new drop callout.
> > > 
> > > fsverity doesn't need to care what the buffer is made from, how it
> > > is cached, what it's life cycle is, etc. The caching mechanism and
> > > reference counting is entirely controlled by the filesystem callout
> > > implementations, and fsverity only needs to deal with memory buffers
> > > that are guaranteed to live for the entire walk of the merkle
> > > tree....
> > 
> > Sure.  Just a couple notes:
> > 
> > First, fs/verity/ does still need to be able to tell whether the buffer is newly
> > instantiated or not.
> 
> Boolean flag from the caller.
> 
> > Second, fs/verity/ uses the ahash API to do the hashing.  ahash is a
> > scatterlist-based API.  Virtual addresses can still be used (see sg_set_buf()),
> > but the memory cannot be vmalloc'ed memory, since virt_to_page() needs to work.
> > Does XFS use vmalloc'ed memory for these buffers?
> 
> Not vmalloc'ed, but vmapped. We allocate the pages individually, but
> then call vm_map_ram() to present the higher level code with a
> single contiguous memory range if it is a multi-page buffer.
> 
> We do have the backing info held in the buffer, and that's what we
> use for IO. If fsverity needs a page based scatter/gather list
> for hardware offload, it could ask the filesystem to provide it
> for that given buffer...
> 
> > BTW, converting fs/verity/ from ahash to shash is an option; I've really never
> > been a fan of the scatterlist-based crypto APIs!  The disadvantage of doing
> > this, though, would be that it would remove support for all the hardware crypto
> > drivers.
> >
> > That *might* actually be okay, as that approach to crypto acceleration
> > has mostly fallen out of favor, in favor of CPU-based acceleration.  But I do
> > worry about e.g. someone coming out of the woodwork and saying they need to use
> > fsverity on a low-powered ARM board that has a crypto accelerator like CAAM, and
> > they MUST use their crypto accelerator to get acceptable performance.
> 
> True, but we are very unlikely to be using XFS on such small
> systems and I don't think we really care about XFS performance on
> android sized systems, either.
> 

FYI, I've sent an RFC patch that converts fs/verity/ from ahash to shash:
https://lore.kernel.org/r/20230406003714.94580-1-ebiggers@kernel.org

It would be great if we could do that.  But I need to get a better sense for
whether anyone will complain...

- Eric
Eric Biggers April 7, 2023, 7:56 p.m. UTC | #12
On Wed, Apr 05, 2023 at 05:44:36PM -0700, Eric Biggers wrote:
> > Not vmalloc'ed, but vmapped. We allocate the pages individually, but
> > then call vm_map_ram() to present the higher level code with a
> > single contiguous memory range if it is a multi-page buffer.
> > 
> > We do have the backing info held in the buffer, and that's what we
> > use for IO. If fsverity needs a page based scatter/gather list
> > for hardware offload, it could ask the filesystem to provide it
> > for that given buffer...
> > 
> > > BTW, converting fs/verity/ from ahash to shash is an option; I've really never
> > > been a fan of the scatterlist-based crypto APIs!  The disadvantage of doing
> > > this, though, would be that it would remove support for all the hardware crypto
> > > drivers.
> > >
> > > That *might* actually be okay, as that approach to crypto acceleration
> > > has mostly fallen out of favor, in favor of CPU-based acceleration.  But I do
> > > worry about e.g. someone coming out of the woodwork and saying they need to use
> > > fsverity on a low-powered ARM board that has a crypto accelerator like CAAM, and
> > > they MUST use their crypto accelerator to get acceptable performance.
> > 
> > True, but we are very unlikely to be using XFS on such small
> > systems and I don't think we really care about XFS performance on
> > android sized systems, either.
> > 
> 
> FYI, I've sent an RFC patch that converts fs/verity/ from ahash to shash:
> https://lore.kernel.org/r/20230406003714.94580-1-ebiggers@kernel.org
> 
> It would be great if we could do that.  But I need to get a better sense for
> whether anyone will complain...

FWIW, dm-verity went in the other direction.  It started with shash, and then in
2017 it was switched to ahash by https://git.kernel.org/linus/d1ac3ff008fb9a48
("dm verity: switch to using asynchronous hash crypto API").

I think that was part of my motivation for using ahash in fsverity from the
beginning.

Still, it does seem that ahash is more trouble than it's worth these days...

- Eric

Patch

diff --git a/fs/xfs/xfs_verity.c b/fs/xfs/xfs_verity.c
index a9874ff4efcd..ef0aff216f06 100644
--- a/fs/xfs/xfs_verity.c
+++ b/fs/xfs/xfs_verity.c
@@ -134,6 +134,10 @@  xfs_read_merkle_tree_page(
 	struct page		*page = NULL;
 	__be64			name = cpu_to_be64(index << PAGE_SHIFT);
 	uint32_t		bs = 1 << log_blocksize;
+	int			blocks_per_page =
+		(1 << (PAGE_SHIFT - log_blocksize));
+	int			n = 0;
+	int			offset = 0;
 	struct xfs_da_args	args = {
 		.dp		= ip,
 		.attr_filter	= XFS_ATTR_VERITY,
@@ -143,26 +147,59 @@  xfs_read_merkle_tree_page(
 		.valuelen	= bs,
 	};
 	int			error = 0;
+	bool			is_checked = true;
+	struct xfs_verity_buf_list	*buf_list;
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	error = xfs_attr_get(&args);
-	if (error) {
-		kmem_free(args.value);
-		xfs_buf_rele(args.bp);
+	buf_list = kzalloc(sizeof(struct xfs_verity_buf_list), GFP_KERNEL);
+	if (!buf_list) {
 		put_page(page);
-		return ERR_PTR(-EFAULT);
+		return ERR_PTR(-ENOMEM);
 	}
 
-	if (args.bp->b_flags & XBF_VERITY_CHECKED)
+	/*
+	 * Fill the page with Merkle tree blocks. blocks_per_page is greater
+	 * than 1 when fs block size != PAGE_SIZE or Merkle tree block size !=
+	 * PAGE_SIZE
+	 */
+	for (n = 0; n < blocks_per_page; n++) {
+		offset = bs * n;
+		name = cpu_to_be64(((index << PAGE_SHIFT) + offset));
+		args.name = (const uint8_t *)&name;
+
+		error = xfs_attr_get(&args);
+		if (error) {
+			kmem_free(args.value);
+			/*
+			 * No more Merkle tree blocks (e.g. this was the last
+			 * block of the tree)
+			 */
+			if (error == -ENOATTR)
+				break;
+			xfs_buf_rele(args.bp);
+			put_page(page);
+			kmem_free(buf_list);
+			return ERR_PTR(-EFAULT);
+		}
+
+		buf_list->bufs[buf_list->buf_count++] = args.bp;
+
+		/* One of the buffers was dropped */
+		if (!(args.bp->b_flags & XBF_VERITY_CHECKED))
+			is_checked = false;
+
+		memcpy(page_address(page) + offset, args.value, args.valuelen);
+		kmem_free(args.value);
+		args.value = NULL;
+	}
+
+	if (is_checked)
 		SetPageChecked(page);
+	page->private = (unsigned long)buf_list;
 
-	page->private = (unsigned long)args.bp;
-	memcpy(page_address(page), args.value, args.valuelen);
-
-	kmem_free(args.value);
 	return page;
 }
 
@@ -191,16 +228,21 @@  xfs_write_merkle_tree_block(
 
 static void
 xfs_drop_page(
-	struct page	*page)
+	struct page			*page)
 {
-	struct xfs_buf *buf = (struct xfs_buf *)page->private;
+	int				i = 0;
+	struct xfs_verity_buf_list	*buf_list =
+		(struct xfs_verity_buf_list *)page->private;
 
-	ASSERT(buf != NULL);
+	ASSERT(buf_list != NULL);
 
-	if (PageChecked(page))
-		buf->b_flags |= XBF_VERITY_CHECKED;
+	for (i = 0; i < buf_list->buf_count; i++) {
+		if (PageChecked(page))
+			buf_list->bufs[i]->b_flags |= XBF_VERITY_CHECKED;
+		xfs_buf_rele(buf_list->bufs[i]);
+	}
 
-	xfs_buf_rele(buf);
+	kmem_free(buf_list);
 	put_page(page);
 }
 
diff --git a/fs/xfs/xfs_verity.h b/fs/xfs/xfs_verity.h
index ae5d87ca32a8..433b2f4ae3bc 100644
--- a/fs/xfs/xfs_verity.h
+++ b/fs/xfs/xfs_verity.h
@@ -16,4 +16,12 @@  extern const struct fsverity_operations xfs_verity_ops;
 #define xfs_verity_ops NULL
 #endif	/* CONFIG_FS_VERITY */
 
+/* Minimum Merkle tree block size is 1024 bytes */
+#define XFS_VERITY_MAX_MBLOCKS_PER_PAGE (1 << (PAGE_SHIFT - 10))
+
+struct xfs_verity_buf_list {
+	unsigned int	buf_count;
+	struct xfs_buf	*bufs[XFS_VERITY_MAX_MBLOCKS_PER_PAGE];
+};
+
 #endif	/* __XFS_VERITY_H__ */