
[1/3] xfs: fix sparse inode limits on runt AG

Message ID 20241112221920.1105007-2-david@fromorbit.com (mailing list archive)
State Under Review
Series xfs: miscellaneous bug fixes

Commit Message

Dave Chinner Nov. 12, 2024, 10:05 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

The runt AG at the end of a filesystem is almost always smaller
than mp->m_sb.sb_agblocks. Unfortunately, when setting the
max_agbno limit for the inode chunk allocation, we do not take this
into account. This means we can allocate a sparse inode chunk that
extends beyond the end of the AG. When we go to allocate an inode
from that sparse chunk, the irec fails validation because the agbno
of the start of the irec is beyond valid limits for the runt AG.

Prevent this from happening by taking into account the size of the
runt AG when allocating inode chunks. Also convert the various
checks for valid inode chunk agbnos to use xfs_ag_block_count()
so that they will also catch such issues in the future.

Fixes: 56d1115c9bc7 ("xfs: allocate sparse inode chunks on full chunk allocation failure")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_ialloc.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
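
To make the overshoot concrete, here is a small standalone sketch with
hypothetical geometry (it is not the kernel code, and the numbers are made
up): it compares a max_agbno derived from the nominal sb_agblocks with one
derived from the actual length of the runt AG.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the kernel's round_down(): round x down to a multiple of y. */
#define round_down(x, y)	(((x) / (y)) * (y))

int main(void)
{
	/* Hypothetical geometry; real values come from the superblock. */
	uint32_t sb_agblocks   = 65536;	/* nominal AG size (sb_agblocks) */
	uint32_t runt_agblocks = 40000;	/* actual length of the runt AG */
	uint32_t inoalignmt    = 4;	/* inode alignment (sb_inoalignmt) */
	uint32_t ialloc_blks   = 16;	/* blocks per inode chunk */

	/* Old calculation: bounded by the nominal AG size. */
	uint32_t old_max = round_down(sb_agblocks, inoalignmt) - ialloc_blks;

	/* Fixed calculation: bounded by the real length of this AG. */
	uint32_t new_max = round_down(runt_agblocks, inoalignmt) - ialloc_blks;

	printf("old max_agbno %u (runt AG ends at block %u)\n",
			old_max, runt_agblocks);
	printf("new max_agbno %u\n", new_max);
	return 0;
}

With these numbers the old limit is 65520, well past the 40000-block runt
AG, so the allocator may place a sparse chunk whose agbno later fails
validation; the fixed limit is 39984.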

Comments

Darrick J. Wong Nov. 12, 2024, 11:15 p.m. UTC | #1
On Wed, Nov 13, 2024 at 09:05:14AM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> The runt AG at the end of a filesystem is almost always smaller
> than mp->m_sb.sb_agblocks. Unfortunately, when setting the
> max_agbno limit for the inode chunk allocation, we do not take this
> into account. This means we can allocate a sparse inode chunk that
> extends beyond the end of the AG. When we go to allocate an inode
> from that sparse chunk, the irec fails validation because the agbno
> of the start of the irec is beyond valid limits for the runt AG.
> 
> Prevent this from happening by taking into account the size of the
> runt AG when allocating inode chunks. Also convert the various
> checks for valid inode chunk agbnos to use xfs_ag_block_count()
> so that they will also catch such issues in the future.
> 
> Fixes: 56d1115c9bc7 ("xfs: allocate sparse inode chunks on full chunk allocation failure")

Cc: <stable@vger.kernel.org> # v4.2

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_ialloc.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> index 271855227514..6258527315f2 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.c
> +++ b/fs/xfs/libxfs/xfs_ialloc.c
> @@ -855,7 +855,8 @@ xfs_ialloc_ag_alloc(
>  		 * the end of the AG.
>  		 */
>  		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
> -		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
> +		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
> +							pag->pag_agno),

So if I'm reading this right, the inode cluster allocation checks now
enforce that we cannot search for free space beyond the actual end of
the AG, rounded down per inode alignment rules?

In that case, can this use the cached ag block count:

		args.max_agbno = round_down(
					pag_group(pag)->xg_block_count,
					args.mp->m_sb.sb_inoalignmt);

rather than recomputing the block count every time?

>  					    args.mp->m_sb.sb_inoalignmt) -
>  				 igeo->ialloc_blks;
>  
> @@ -2332,9 +2333,9 @@ xfs_difree(
>  		return -EINVAL;
>  	}
>  	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
> -	if (agbno >= mp->m_sb.sb_agblocks)  {
> -		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
> -			__func__, agbno, mp->m_sb.sb_agblocks);
> +	if (agbno >= xfs_ag_block_count(mp, pag->pag_agno)) {
> +		xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
> +			__func__, agbno, xfs_ag_block_count(mp, pag->pag_agno));

and everywhere else too?

(The logic itself looks ok)

--D

>  		ASSERT(0);
>  		return -EINVAL;
>  	}
> @@ -2457,7 +2458,7 @@ xfs_imap(
>  	 */
>  	agino = XFS_INO_TO_AGINO(mp, ino);
>  	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
> -	if (agbno >= mp->m_sb.sb_agblocks ||
> +	if (agbno >= xfs_ag_block_count(mp, pag->pag_agno) ||
>  	    ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
>  		error = -EINVAL;
>  #ifdef DEBUG
> @@ -2467,11 +2468,12 @@ xfs_imap(
>  		 */
>  		if (flags & XFS_IGET_UNTRUSTED)
>  			return error;
> -		if (agbno >= mp->m_sb.sb_agblocks) {
> +		if (agbno >= xfs_ag_block_count(mp, pag->pag_agno)) {
>  			xfs_alert(mp,
>  		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
>  				__func__, (unsigned long long)agbno,
> -				(unsigned long)mp->m_sb.sb_agblocks);
> +				(unsigned long)xfs_ag_block_count(mp,
> +							pag->pag_agno));
>  		}
>  		if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
>  			xfs_alert(mp,
> -- 
> 2.45.2
> 
>
Dave Chinner Nov. 13, 2024, 12:12 a.m. UTC | #2
On Tue, Nov 12, 2024 at 03:15:39PM -0800, Darrick J. Wong wrote:
> On Wed, Nov 13, 2024 at 09:05:14AM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > The runt AG at the end of a filesystem is almost always smaller
> > than mp->m_sb.sb_agblocks. Unfortunately, when setting the
> > max_agbno limit for the inode chunk allocation, we do not take this
> > into account. This means we can allocate a sparse inode chunk that
> > extends beyond the end of the AG. When we go to allocate an inode
> > from that sparse chunk, the irec fails validation because the agbno
> > of the start of the irec is beyond valid limits for the runt AG.
> > 
> > Prevent this from happening by taking into account the size of the
> > runt AG when allocating inode chunks. Also convert the various
> > checks for valid inode chunk agbnos to use xfs_ag_block_count()
> > so that they will also catch such issues in the future.
> > 
> > Fixes: 56d1115c9bc7 ("xfs: allocate sparse inode chunks on full chunk allocation failure")
> 
> Cc: <stable@vger.kernel.org> # v4.2
> 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/libxfs/xfs_ialloc.c | 16 +++++++++-------
> >  1 file changed, 9 insertions(+), 7 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> > index 271855227514..6258527315f2 100644
> > --- a/fs/xfs/libxfs/xfs_ialloc.c
> > +++ b/fs/xfs/libxfs/xfs_ialloc.c
> > @@ -855,7 +855,8 @@ xfs_ialloc_ag_alloc(
> >  		 * the end of the AG.
> >  		 */
> >  		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
> > -		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
> > +		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
> > +							pag->pag_agno),
> 
> So if I'm reading this right, the inode cluster allocation checks now
> enforce that we cannot search for free space beyond the actual end of
> the AG, rounded down per inode alignment rules?
> 
> In that case, can this use the cached ag block count:
> 
> 		args.max_agbno = round_down(
> 					pag_group(pag)->xg_block_count,
> 					args.mp->m_sb.sb_inoalignmt);
> 
> rather than recomputing the block count every time?

Eventually, yes. I have another series that pushes the pag further
into these AG size checks all over the code to try to avoid this
entire class of bug in the future (i.e. blindly using mp->m_sb ag
parameters without considering the last AG is a runt).

I am waiting for the group rework to land before doing any more
work on that conversion. However, it is not yet in the for-next
branch, so I'm assuming that it isn't due to be merged in the
upcoming merge window because that window is about to open and
none of that code has seen any time in linux-next or fs-next...

I want this fix to land sooner rather than later, so I used
xfs_ag_block_count() to avoid conflicts with pending work, as well
as not requiring me to chase random dev branches to submit what is
a relatively simple bug fix....

-Dave.
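
The distinction Dave is drawing is between the superblock's nominal AG
size and an AG's real length, which differ only for the runt AG at the
end of the filesystem. A rough standalone model of that per-AG length
(illustrative only, not the kernel's xfs_ag_block_count() implementation)
looks like this:

#include <stdint.h>

typedef uint32_t xfs_agblock_t;
typedef uint32_t xfs_agnumber_t;

/*
 * Simplified model of a per-AG length: every AG is sb_agblocks long
 * except the last one, which gets whatever blocks remain in the
 * filesystem. Illustrative only; the real helper is xfs_ag_block_count().
 */
static xfs_agblock_t
ag_block_count(
	uint64_t	sb_dblocks,	/* filesystem size in blocks */
	uint32_t	sb_agblocks,	/* nominal AG size */
	xfs_agnumber_t	agcount,	/* number of AGs */
	xfs_agnumber_t	agno)		/* AG of interest */
{
	if (agno < agcount - 1)
		return sb_agblocks;
	return (xfs_agblock_t)(sb_dblocks - (uint64_t)agno * sb_agblocks);
}
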
Darrick J. Wong Nov. 13, 2024, 12:24 a.m. UTC | #3
On Wed, Nov 13, 2024 at 11:12:02AM +1100, Dave Chinner wrote:
> On Tue, Nov 12, 2024 at 03:15:39PM -0800, Darrick J. Wong wrote:
> > On Wed, Nov 13, 2024 at 09:05:14AM +1100, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > The runt AG at the end of a filesystem is almost always smaller
> > > than mp->m_sb.sb_agblocks. Unfortunately, when setting the
> > > max_agbno limit for the inode chunk allocation, we do not take this
> > > into account. This means we can allocate a sparse inode chunk that
> > > extends beyond the end of the AG. When we go to allocate an inode
> > > from that sparse chunk, the irec fails validation because the agbno
> > > of the start of the irec is beyond valid limits for the runt AG.
> > > 
> > > Prevent this from happening by taking into account the size of the
> > > runt AG when allocating inode chunks. Also convert the various
> > > checks for valid inode chunk agbnos to use xfs_ag_block_count()
> > > so that they will also catch such issues in the future.
> > > 
> > > Fixes: 56d1115c9bc7 ("xfs: allocate sparse inode chunks on full chunk allocation failure")
> > 
> > Cc: <stable@vger.kernel.org> # v4.2
> > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_ialloc.c | 16 +++++++++-------
> > >  1 file changed, 9 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> > > index 271855227514..6258527315f2 100644
> > > --- a/fs/xfs/libxfs/xfs_ialloc.c
> > > +++ b/fs/xfs/libxfs/xfs_ialloc.c
> > > @@ -855,7 +855,8 @@ xfs_ialloc_ag_alloc(
> > >  		 * the end of the AG.
> > >  		 */
> > >  		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
> > > -		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
> > > +		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
> > > +							pag->pag_agno),
> > 
> > So if I'm reading this right, the inode cluster allocation checks now
> > enforce that we cannot search for free space beyond the actual end of
> > the AG, rounded down per inode alignment rules?
> > 
> > In that case, can this use the cached ag block count:
> > 
> > 		args.max_agbno = round_down(
> > 					pag_group(pag)->xg_block_count,
> > 					args.mp->m_sb.sb_inoalignmt);
> > 
> > rather than recomputing the block count every time?
> 
> Eventually, yes. I have another series that pushes the pag further
> into these AG size checks all over the code to try to avoid this
> entire class of bug in the future (i.e. blindly using mp->m_sb ag
> parameters without considering the last AG is a runt).
> 
> I am waiting for the group rework to land before doing any more
> work on that conversion. However, it is not yet in the for-next
> branch, so I'm assuming that it isn't due to be merged in the
> upcoming merge window because that window is about to open and
> none of that code has seen any time in linux-next or fs-next...

...let's hope that slip doesn't happen. :(

> I want this fix to land sooner rather than later, so I used
> xfs_ag_block_count() to avoid conflicts with pending work, as well
> as not requiring me to chase random dev branches to submit what is
> a relatively simple bug fix....

Aha, I wondered if that was why you were asking if cem was planning to
push things to for-next.  He said during office hours that he'd push the
metadir/rtgroups stuff Wednesday morning.

With the xfs_ag_block_count calls replaced if the generic groups rework
*does* land (or as it is now if it doesn't),

Reviewed-by: Darrick J. Wong <djwong@kernel.org>

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>

Patch

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 271855227514..6258527315f2 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -855,7 +855,8 @@  xfs_ialloc_ag_alloc(
 		 * the end of the AG.
 		 */
 		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
-		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
+							pag->pag_agno),
 					    args.mp->m_sb.sb_inoalignmt) -
 				 igeo->ialloc_blks;
 
@@ -2332,9 +2333,9 @@  xfs_difree(
 		return -EINVAL;
 	}
 	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-	if (agbno >= mp->m_sb.sb_agblocks)  {
-		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
-			__func__, agbno, mp->m_sb.sb_agblocks);
+	if (agbno >= xfs_ag_block_count(mp, pag->pag_agno)) {
+		xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
+			__func__, agbno, xfs_ag_block_count(mp, pag->pag_agno));
 		ASSERT(0);
 		return -EINVAL;
 	}
@@ -2457,7 +2458,7 @@  xfs_imap(
 	 */
 	agino = XFS_INO_TO_AGINO(mp, ino);
 	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-	if (agbno >= mp->m_sb.sb_agblocks ||
+	if (agbno >= xfs_ag_block_count(mp, pag->pag_agno) ||
 	    ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
 		error = -EINVAL;
 #ifdef DEBUG
@@ -2467,11 +2468,12 @@  xfs_imap(
 		 */
 		if (flags & XFS_IGET_UNTRUSTED)
 			return error;
-		if (agbno >= mp->m_sb.sb_agblocks) {
+		if (agbno >= xfs_ag_block_count(mp, pag->pag_agno)) {
 			xfs_alert(mp,
 		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
 				__func__, (unsigned long long)agbno,
-				(unsigned long)mp->m_sb.sb_agblocks);
+				(unsigned long)xfs_ag_block_count(mp,
+							pag->pag_agno));
 		}
 		if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
 			xfs_alert(mp,