diff mbox series

[RFC,v2,14/14] xfs: enable block size larger than page size support

Message ID 20240213093713.1753368-15-kernel@pankajraghav.com (mailing list archive)
State New, archived
Headers show
Series enable bs > ps in XFS | expand

Commit Message

Pankaj Raghav (Samsung) Feb. 13, 2024, 9:37 a.m. UTC
From: Pankaj Raghav <p.raghav@samsung.com>

Page cache now has the ability to have a minimum order when allocating
a folio which is a prerequisite to add support for block size > page
size. Enable it in XFS under CONFIG_XFS_LBS.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
 fs/xfs/xfs_icache.c | 8 ++++++--
 fs/xfs/xfs_super.c  | 8 +++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

Comments

Darrick J. Wong Feb. 13, 2024, 4:20 p.m. UTC | #1
On Tue, Feb 13, 2024 at 10:37:13AM +0100, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> Page cache now has the ability to have a minimum order when allocating
> a folio which is a prerequisite to add support for block size > page
> size. Enable it in XFS under CONFIG_XFS_LBS.
> 
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
>  fs/xfs/xfs_icache.c | 8 ++++++--
>  fs/xfs/xfs_super.c  | 8 +++-----
>  2 files changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index dba514a2c84d..9de81caf7ad4 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -73,6 +73,7 @@ xfs_inode_alloc(
>  	xfs_ino_t		ino)
>  {
>  	struct xfs_inode	*ip;
> +	int			min_order = 0;
>  
>  	/*
>  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
> @@ -88,7 +89,8 @@ xfs_inode_alloc(
>  	/* VFS doesn't initialise i_mode or i_state! */
>  	VFS_I(ip)->i_mode = 0;
>  	VFS_I(ip)->i_state = 0;
> -	mapping_set_large_folios(VFS_I(ip)->i_mapping);
> +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> +	mapping_set_folio_orders(VFS_I(ip)->i_mapping, min_order, MAX_PAGECACHE_ORDER);
>  
>  	XFS_STATS_INC(mp, vn_active);
>  	ASSERT(atomic_read(&ip->i_pincount) == 0);
> @@ -313,6 +315,7 @@ xfs_reinit_inode(
>  	dev_t			dev = inode->i_rdev;
>  	kuid_t			uid = inode->i_uid;
>  	kgid_t			gid = inode->i_gid;
> +	int			min_order = 0;
>  
>  	error = inode_init_always(mp->m_super, inode);
>  
> @@ -323,7 +326,8 @@ xfs_reinit_inode(
>  	inode->i_rdev = dev;
>  	inode->i_uid = uid;
>  	inode->i_gid = gid;
> -	mapping_set_large_folios(inode->i_mapping);
> +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> +	mapping_set_folio_orders(inode->i_mapping, min_order, MAX_PAGECACHE_ORDER);

Twice now I've seen this, which makes me think "refactor this into a
single function."

But then, this is really just:

	mapping_set_folio_orders(inode->i_mapping,
			max(0, inode->i_sb->s_blocksize_bits - PAGE_SHIFT),
			MAX_PAGECACHE_ORDER);

Can we make that a generic inode_set_pagecache_orders helper?

>  	return error;
>  }
>  
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 5a2512d20bd0..6a3f0f6727eb 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1625,13 +1625,11 @@ xfs_fs_fill_super(
>  		goto out_free_sb;
>  	}
>  
> -	/*
> -	 * Until this is fixed only page-sized or smaller data blocks work.
> -	 */
> -	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
> +	if (!IS_ENABLED(CONFIG_XFS_LBS) && mp->m_sb.sb_blocksize > PAGE_SIZE) {
>  		xfs_warn(mp,
>  		"File system with blocksize %d bytes. "
> -		"Only pagesize (%ld) or less will currently work.",
> +		"Only pagesize (%ld) or less will currently work. "
> +		"Enable Experimental CONFIG_XFS_LBS for this support",
>  				mp->m_sb.sb_blocksize, PAGE_SIZE);

Please log a warning about the EXPERIMENTAL bs>ps feature being used
on this mount for the CONFIG_XFS_LBS=y case.

--D

>  		error = -ENOSYS;
>  		goto out_free_sb;
> -- 
> 2.43.0
> 
>
Dave Chinner Feb. 13, 2024, 9:34 p.m. UTC | #2
On Tue, Feb 13, 2024 at 10:37:13AM +0100, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> Page cache now has the ability to have a minimum order when allocating
> a folio which is a prerequisite to add support for block size > page
> size. Enable it in XFS under CONFIG_XFS_LBS.
> 
> Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
>  fs/xfs/xfs_icache.c | 8 ++++++--
>  fs/xfs/xfs_super.c  | 8 +++-----
>  2 files changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index dba514a2c84d..9de81caf7ad4 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -73,6 +73,7 @@ xfs_inode_alloc(
>  	xfs_ino_t		ino)
>  {
>  	struct xfs_inode	*ip;
> +	int			min_order = 0;
>  
>  	/*
>  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
> @@ -88,7 +89,8 @@ xfs_inode_alloc(
>  	/* VFS doesn't initialise i_mode or i_state! */
>  	VFS_I(ip)->i_mode = 0;
>  	VFS_I(ip)->i_state = 0;
> -	mapping_set_large_folios(VFS_I(ip)->i_mapping);
> +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> +	mapping_set_folio_orders(VFS_I(ip)->i_mapping, min_order, MAX_PAGECACHE_ORDER);

That's pretty nasty. You're using max() to hide underflow in the
subtraction to clamp the value to zero. And you don't need ilog2()
because we have the log of the block size in the superblock already.

	int			min_order = 0;
	.....
	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
		min_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;

But, really why recalculate this -constant- on every inode
allocation?  That's a very hot path, so this should be set in the
M_IGEO(mp) structure (mp->m_ino_geo) at mount time and then the code
is simply:

	mapping_set_folio_orders(VFS_I(ip)->i_mapping,
			M_IGEO(mp)->min_folio_order, MAX_PAGECACHE_ORDER);

We already access the M_IGEO(mp) structure every inode allocation,
so there's little in way of additional cost here....

> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 5a2512d20bd0..6a3f0f6727eb 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1625,13 +1625,11 @@ xfs_fs_fill_super(
>  		goto out_free_sb;
>  	}
>  
> -	/*
> -	 * Until this is fixed only page-sized or smaller data blocks work.
> -	 */
> -	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
> +	if (!IS_ENABLED(CONFIG_XFS_LBS) && mp->m_sb.sb_blocksize > PAGE_SIZE) {
>  		xfs_warn(mp,
>  		"File system with blocksize %d bytes. "
> -		"Only pagesize (%ld) or less will currently work.",
> +		"Only pagesize (%ld) or less will currently work. "
> +		"Enable Experimental CONFIG_XFS_LBS for this support",
>  				mp->m_sb.sb_blocksize, PAGE_SIZE);
>  		error = -ENOSYS;
>  		goto out_free_sb;

This should just issue a warning if bs > ps.

	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
  		xfs_warn(mp,
"EXPERIMENTAL: Filesystem with Large Block Size (%d bytes) enabled.",
			mp->m_sb.sb_blocksize);
	}

-Dave.
Pankaj Raghav (Samsung) Feb. 14, 2024, 4:35 p.m. UTC | #3
> >  	struct xfs_inode	*ip;
> > +	int			min_order = 0;
> >  
> >  	/*
> >  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
> > @@ -88,7 +89,8 @@ xfs_inode_alloc(
> >  	/* VFS doesn't initialise i_mode or i_state! */
> >  	VFS_I(ip)->i_mode = 0;
> >  	VFS_I(ip)->i_state = 0;
> > -	mapping_set_large_folios(VFS_I(ip)->i_mapping);
> > +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> > +	mapping_set_folio_orders(VFS_I(ip)->i_mapping, min_order, MAX_PAGECACHE_ORDER);
> 
> That's pretty nasty. You're using max() to hide underflow in the
> subtraction to clamp the value to zero. And you don't need ilog2()
> because we have the log of the block size in the superblock already.
> 
> 	int			min_order = 0;
> 	.....
> 	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
> 		min_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
how is it underflowing if I am comparing two values of type int?

> 
> But, really why recalculate this -constant- on every inode
> allocation?  That's a very hot path, so this should be set in the
> M_IGEO(mp) structure (mp->m_ino_geo) at mount time and then the code
> is simply:
> 
> 	mapping_set_folio_orders(VFS_I(ip)->i_mapping,
> 			M_IGEO(mp)->min_folio_order, MAX_PAGECACHE_ORDER);
> 

That is a good idea. I will add this change in the next revision.

> We already access the M_IGEO(mp) structure every inode allocation,
> so there's little in way of additional cost here....
> 
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 5a2512d20bd0..6a3f0f6727eb 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -1625,13 +1625,11 @@ xfs_fs_fill_super(
> >  		goto out_free_sb;
> >  	}
> >  
> > -	/*
> > -	 * Until this is fixed only page-sized or smaller data blocks work.
> > -	 */
> > -	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
> > +	if (!IS_ENABLED(CONFIG_XFS_LBS) && mp->m_sb.sb_blocksize > PAGE_SIZE) {
> >  		xfs_warn(mp,
> >  		"File system with blocksize %d bytes. "
> > -		"Only pagesize (%ld) or less will currently work.",
> > +		"Only pagesize (%ld) or less will currently work. "
> > +		"Enable Experimental CONFIG_XFS_LBS for this support",
> >  				mp->m_sb.sb_blocksize, PAGE_SIZE);
> >  		error = -ENOSYS;
> >  		goto out_free_sb;
> 
> This should just issue a warning if bs > ps.
> 
> 	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
>   		xfs_warn(mp,
> "EXPERIMENTAL: Filesystem with Large Block Size (%d bytes) enabled.",
> 			mp->m_sb.sb_blocksize);
> 	}

Yes! Luis already told me to add a warning here but I missed it before
sending the patches out.

> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Pankaj Raghav (Samsung) Feb. 14, 2024, 4:40 p.m. UTC | #4
> > @@ -323,7 +326,8 @@ xfs_reinit_inode(
> >  	inode->i_rdev = dev;
> >  	inode->i_uid = uid;
> >  	inode->i_gid = gid;
> > -	mapping_set_large_folios(inode->i_mapping);
> > +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> > +	mapping_set_folio_orders(inode->i_mapping, min_order, MAX_PAGECACHE_ORDER);
> 
> Twice now I've seen this, which makes me think "refactor this into a
> single function."
> 
> But then, this is really just:
> 
> 	mapping_set_folio_orders(inode->i_mapping,
> 			max(0, inode->i_sb->s_blocksize_bits - PAGE_SHIFT),
> 			MAX_PAGECACHE_ORDER);
> 
> Can we make that a generic inode_set_pagecache_orders helper?

Chinner suggested an alternative to stuff the min_order value in
mp->m_ino_geo. Then it will just be a call to:

mapping_set_folio_orders(VFS_I(ip)->i_mapping,
			M_IGEO(mp)->min_folio_order, MAX_PAGECACHE_ORDER);
> 
> >  	return error;
> >  }
> >  
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 5a2512d20bd0..6a3f0f6727eb 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -1625,13 +1625,11 @@ xfs_fs_fill_super(
> >  		goto out_free_sb;
> >  	}
> >  
> > -	/*
> > -	 * Until this is fixed only page-sized or smaller data blocks work.
> > -	 */
> > -	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
> > +	if (!IS_ENABLED(CONFIG_XFS_LBS) && mp->m_sb.sb_blocksize > PAGE_SIZE) {
> >  		xfs_warn(mp,
> >  		"File system with blocksize %d bytes. "
> > -		"Only pagesize (%ld) or less will currently work.",
> > +		"Only pagesize (%ld) or less will currently work. "
> > +		"Enable Experimental CONFIG_XFS_LBS for this support",
> >  				mp->m_sb.sb_blocksize, PAGE_SIZE);
> 
> Please log a warning about the EXPERIMENTAL bs>ps feature being used
> on this mount for the CONFIG_XFS_LBS=y case.
> 
Yes! I will do it as a part of the next revision.
Dave Chinner Feb. 15, 2024, 10:17 p.m. UTC | #5
On Wed, Feb 14, 2024 at 05:35:49PM +0100, Pankaj Raghav (Samsung) wrote:
> > >  	struct xfs_inode	*ip;
> > > +	int			min_order = 0;
> > >  
> > >  	/*
> > >  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
> > > @@ -88,7 +89,8 @@ xfs_inode_alloc(
> > >  	/* VFS doesn't initialise i_mode or i_state! */
> > >  	VFS_I(ip)->i_mode = 0;
> > >  	VFS_I(ip)->i_state = 0;
> > > -	mapping_set_large_folios(VFS_I(ip)->i_mapping);
> > > +	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
> > > +	mapping_set_folio_orders(VFS_I(ip)->i_mapping, min_order, MAX_PAGECACHE_ORDER);
> > 
> > That's pretty nasty. You're using max() to hide underflow in the
> > subtraction to clamp the value to zero. And you don't need ilog2()
> > because we have the log of the block size in the superblock already.
> > 
> > 	int			min_order = 0;
> > 	.....
> > 	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
> > 		min_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
> how is it underflowing if I am comparing two values of type int?

Folio order is supposed to be unsigned. Negative orders are not
valid values.  So you're hacking around an unsigned underflow by
using signed ints, then hiding the fact that unsigned subtraction
would underflow check behind a max(0, underflowing calc) construct
that works only because you're using signed ints rather than
unsigned ints for the order.

It also implicitly relies on the max_order being zero at that point
in time, so if we change the value of max order in future before
this check, this check may not fuction correctly in future.

Please: use unsigned ints for order, and explicitly write the
code so it doesn't ever need negative values that could underflow.

-Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index dba514a2c84d..9de81caf7ad4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -73,6 +73,7 @@  xfs_inode_alloc(
 	xfs_ino_t		ino)
 {
 	struct xfs_inode	*ip;
+	int			min_order = 0;
 
 	/*
 	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
@@ -88,7 +89,8 @@  xfs_inode_alloc(
 	/* VFS doesn't initialise i_mode or i_state! */
 	VFS_I(ip)->i_mode = 0;
 	VFS_I(ip)->i_state = 0;
-	mapping_set_large_folios(VFS_I(ip)->i_mapping);
+	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
+	mapping_set_folio_orders(VFS_I(ip)->i_mapping, min_order, MAX_PAGECACHE_ORDER);
 
 	XFS_STATS_INC(mp, vn_active);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -313,6 +315,7 @@  xfs_reinit_inode(
 	dev_t			dev = inode->i_rdev;
 	kuid_t			uid = inode->i_uid;
 	kgid_t			gid = inode->i_gid;
+	int			min_order = 0;
 
 	error = inode_init_always(mp->m_super, inode);
 
@@ -323,7 +326,8 @@  xfs_reinit_inode(
 	inode->i_rdev = dev;
 	inode->i_uid = uid;
 	inode->i_gid = gid;
-	mapping_set_large_folios(inode->i_mapping);
+	min_order = max(min_order, ilog2(mp->m_sb.sb_blocksize) - PAGE_SHIFT);
+	mapping_set_folio_orders(inode->i_mapping, min_order, MAX_PAGECACHE_ORDER);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5a2512d20bd0..6a3f0f6727eb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1625,13 +1625,11 @@  xfs_fs_fill_super(
 		goto out_free_sb;
 	}
 
-	/*
-	 * Until this is fixed only page-sized or smaller data blocks work.
-	 */
-	if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
+	if (!IS_ENABLED(CONFIG_XFS_LBS) && mp->m_sb.sb_blocksize > PAGE_SIZE) {
 		xfs_warn(mp,
 		"File system with blocksize %d bytes. "
-		"Only pagesize (%ld) or less will currently work.",
+		"Only pagesize (%ld) or less will currently work. "
+		"Enable Experimental CONFIG_XFS_LBS for this support",
 				mp->m_sb.sb_blocksize, PAGE_SIZE);
 		error = -ENOSYS;
 		goto out_free_sb;