diff mbox series

xfs: ensure inobt record walks always make forward progress

Message ID 20201114191446.GR9695@magnolia (mailing list archive)
State Superseded, archived
Headers show
Series xfs: ensure inobt record walks always make forward progress | expand

Commit Message

Darrick J. Wong Nov. 14, 2020, 7:14 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

The aim of the inode btree record iterator function is to call a
callback on every record in the btree.  To avoid having to tear down and
recreate the inode btree cursor around every callback, it caches a
certain number of records in a memory buffer.  After each batch of
callback invocations, we have to perform a btree lookup to find the
next record after where we left off.

However, if the keys of the inode btree are corrupt, the lookup might
put us in the wrong part of the inode btree, causing the walk function
to loop forever.  Therefore, we add extra cursor tracking to make sure
that we never go backwards neither when performing the lookup nor when
jumping to the next inobt record.  This also fixes an off by one error
where upon resume the lookup should have been for the inode /after/ the
point at which we stopped.

Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat
and quotacheck to hang.

Fixes: a211432c27ff ("xfs: create simplified inode walk function")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_iwalk.c |   26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

Comments

Chandan Babu R Nov. 17, 2020, 7:13 a.m. UTC | #1
On Sunday 15 November 2020 12:44:46 AM IST Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> The aim of the inode btree record iterator function is to call a
> callback on every record in the btree.  To avoid having to tear down and
> recreate the inode btree cursor around every callback, it caches a
> certain number of records in a memory buffer.  After each batch of
> callback invocations, we have to perform a btree lookup to find the
> next record after where we left off.
> 
> However, if the keys of the inode btree are corrupt, the lookup might
> put us in the wrong part of the inode btree, causing the walk function
> to loop forever.  Therefore, we add extra cursor tracking to make sure
> that we never go backwards neither when performing the lookup nor when
> jumping to the next inobt record.  This also fixes an off by one error
> where upon resume the lookup should have been for the inode /after/ the
> point at which we stopped.
> 
> Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat
> and quotacheck to hang.
> 
> Fixes: a211432c27ff ("xfs: create simplified inode walk function")
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_iwalk.c |   26 +++++++++++++++++++++++---
>  1 file changed, 23 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> index 233dcc8784db..889ed867670c 100644
> --- a/fs/xfs/xfs_iwalk.c
> +++ b/fs/xfs/xfs_iwalk.c
> @@ -55,6 +55,9 @@ struct xfs_iwalk_ag {
>  	/* Where do we start the traversal? */
>  	xfs_ino_t			startino;
>  
> +	/* What was the last inode number we saw when iterating the inobt? */
> +	xfs_ino_t			lastino;
> +
>  	/* Array of inobt records we cache. */
>  	struct xfs_inobt_rec_incore	*recs;
>  
> @@ -214,6 +217,8 @@ xfs_iwalk_ag_recs(
>  				return error;
>  		}
>  	}
> +	iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
> +				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);

The above is not required since lastino is already updated by xfs_iwalk_ag()
for each inobt record it comes across. Also, 'irec' is being used outside of
the scope of its declaration resulting in a compilation error.

>  
>  	return 0;
>  }
> @@ -347,15 +352,17 @@ xfs_iwalk_run_callbacks(
>  	struct xfs_mount		*mp = iwag->mp;
>  	struct xfs_trans		*tp = iwag->tp;
>  	struct xfs_inobt_rec_incore	*irec;
> -	xfs_agino_t			restart;
> +	xfs_agino_t			next_agino;
>  	int				error;
>  
> +	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
> +
>  	ASSERT(iwag->nr_recs > 0);
>  
>  	/* Delete cursor but remember the last record we cached... */
>  	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
>  	irec = &iwag->recs[iwag->nr_recs - 1];
> -	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
> +	ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
>  
>  	error = xfs_iwalk_ag_recs(iwag);
>  	if (error)
> @@ -372,7 +379,7 @@ xfs_iwalk_run_callbacks(
>  	if (error)
>  		return error;
>  
> -	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
> +	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
>  }
>  
>  /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
> @@ -396,6 +403,7 @@ xfs_iwalk_ag(
>  
>  	while (!error && has_more) {
>  		struct xfs_inobt_rec_incore	*irec;
> +		xfs_ino_t			rec_fsino;
>  
>  		cond_resched();
>  		if (xfs_pwork_want_abort(&iwag->pwork))
> @@ -407,6 +415,15 @@ xfs_iwalk_ag(
>  		if (error || !has_more)
>  			break;
>  
> +		/* Make sure that we always move forward. */
> +		rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
> +		if (iwag->lastino != NULLFSINO &&
> +		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
> +			error = -EFSCORRUPTED;
> +			goto out;
> +		}
> +		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
> +
>  		/* No allocated inodes in this chunk; skip it. */
>  		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
>  			error = xfs_btree_increment(cur, 0, &has_more);
> @@ -535,6 +552,7 @@ xfs_iwalk(
>  		.trim_start	= 1,
>  		.skip_empty	= 1,
>  		.pwork		= XFS_PWORK_SINGLE_THREADED,
> +		.lastino	= NULLFSINO,
>  	};
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
>  	int			error;
> @@ -623,6 +641,7 @@ xfs_iwalk_threaded(
>  		iwag->data = data;
>  		iwag->startino = startino;
>  		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
> +		iwag->lastino = NULLFSINO;
>  		xfs_pwork_queue(&pctl, &iwag->pwork);
>  		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
>  		if (flags & XFS_INOBT_WALK_SAME_AG)
> @@ -696,6 +715,7 @@ xfs_inobt_walk(
>  		.startino	= startino,
>  		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
>  		.pwork		= XFS_PWORK_SINGLE_THREADED,
> +		.lastino	= NULLFSINO,
>  	};
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
>  	int			error;
>
Darrick J. Wong Nov. 17, 2020, 5:23 p.m. UTC | #2
On Tue, Nov 17, 2020 at 12:43:53PM +0530, Chandan Babu R wrote:
> On Sunday 15 November 2020 12:44:46 AM IST Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > The aim of the inode btree record iterator function is to call a
> > callback on every record in the btree.  To avoid having to tear down and
> > recreate the inode btree cursor around every callback, it caches a
> > certain number of records in a memory buffer.  After each batch of
> > callback invocations, we have to perform a btree lookup to find the
> > next record after where we left off.
> > 
> > However, if the keys of the inode btree are corrupt, the lookup might
> > put us in the wrong part of the inode btree, causing the walk function
> > to loop forever.  Therefore, we add extra cursor tracking to make sure
> > that we never go backwards neither when performing the lookup nor when
> > jumping to the next inobt record.  This also fixes an off by one error
> > where upon resume the lookup should have been for the inode /after/ the
> > point at which we stopped.
> > 
> > Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat
> > and quotacheck to hang.
> > 
> > Fixes: a211432c27ff ("xfs: create simplified inode walk function")
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_iwalk.c |   26 +++++++++++++++++++++++---
> >  1 file changed, 23 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > index 233dcc8784db..889ed867670c 100644
> > --- a/fs/xfs/xfs_iwalk.c
> > +++ b/fs/xfs/xfs_iwalk.c
> > @@ -55,6 +55,9 @@ struct xfs_iwalk_ag {
> >  	/* Where do we start the traversal? */
> >  	xfs_ino_t			startino;
> >  
> > +	/* What was the last inode number we saw when iterating the inobt? */
> > +	xfs_ino_t			lastino;
> > +
> >  	/* Array of inobt records we cache. */
> >  	struct xfs_inobt_rec_incore	*recs;
> >  
> > @@ -214,6 +217,8 @@ xfs_iwalk_ag_recs(
> >  				return error;
> >  		}
> >  	}
> > +	iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
> > +				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
> 
> The above is not required since lastino is already updated by xfs_iwalk_ag()
> for each inobt record it comes across. Also, 'irec' is being used outside of
> the scope of its declaration resulting in a compilation error.

<groan> This chunk got mismerged by git when I dragged the patch from my
dev tree into my stable tree, and then I lost situational awareness,
built the dev tree, and sent the (stale) stable tree kernel out to
fstests, which ofc didn't notice.

I'll send a fixed v2, sorry for the noise....

--D

> >  
> >  	return 0;
> >  }
> > @@ -347,15 +352,17 @@ xfs_iwalk_run_callbacks(
> >  	struct xfs_mount		*mp = iwag->mp;
> >  	struct xfs_trans		*tp = iwag->tp;
> >  	struct xfs_inobt_rec_incore	*irec;
> > -	xfs_agino_t			restart;
> > +	xfs_agino_t			next_agino;
> >  	int				error;
> >  
> > +	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
> > +
> >  	ASSERT(iwag->nr_recs > 0);
> >  
> >  	/* Delete cursor but remember the last record we cached... */
> >  	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
> >  	irec = &iwag->recs[iwag->nr_recs - 1];
> > -	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
> > +	ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
> >  
> >  	error = xfs_iwalk_ag_recs(iwag);
> >  	if (error)
> > @@ -372,7 +379,7 @@ xfs_iwalk_run_callbacks(
> >  	if (error)
> >  		return error;
> >  
> > -	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
> > +	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
> >  }
> >  
> >  /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
> > @@ -396,6 +403,7 @@ xfs_iwalk_ag(
> >  
> >  	while (!error && has_more) {
> >  		struct xfs_inobt_rec_incore	*irec;
> > +		xfs_ino_t			rec_fsino;
> >  
> >  		cond_resched();
> >  		if (xfs_pwork_want_abort(&iwag->pwork))
> > @@ -407,6 +415,15 @@ xfs_iwalk_ag(
> >  		if (error || !has_more)
> >  			break;
> >  
> > +		/* Make sure that we always move forward. */
> > +		rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
> > +		if (iwag->lastino != NULLFSINO &&
> > +		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
> > +			error = -EFSCORRUPTED;
> > +			goto out;
> > +		}
> > +		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
> > +
> >  		/* No allocated inodes in this chunk; skip it. */
> >  		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
> >  			error = xfs_btree_increment(cur, 0, &has_more);
> > @@ -535,6 +552,7 @@ xfs_iwalk(
> >  		.trim_start	= 1,
> >  		.skip_empty	= 1,
> >  		.pwork		= XFS_PWORK_SINGLE_THREADED,
> > +		.lastino	= NULLFSINO,
> >  	};
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> >  	int			error;
> > @@ -623,6 +641,7 @@ xfs_iwalk_threaded(
> >  		iwag->data = data;
> >  		iwag->startino = startino;
> >  		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
> > +		iwag->lastino = NULLFSINO;
> >  		xfs_pwork_queue(&pctl, &iwag->pwork);
> >  		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> >  		if (flags & XFS_INOBT_WALK_SAME_AG)
> > @@ -696,6 +715,7 @@ xfs_inobt_walk(
> >  		.startino	= startino,
> >  		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
> >  		.pwork		= XFS_PWORK_SINGLE_THREADED,
> > +		.lastino	= NULLFSINO,
> >  	};
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> >  	int			error;
> > 
> 
> 
> -- 
> chandan
> 
> 
>
diff mbox series

Patch

diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 233dcc8784db..889ed867670c 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -55,6 +55,9 @@  struct xfs_iwalk_ag {
 	/* Where do we start the traversal? */
 	xfs_ino_t			startino;
 
+	/* What was the last inode number we saw when iterating the inobt? */
+	xfs_ino_t			lastino;
+
 	/* Array of inobt records we cache. */
 	struct xfs_inobt_rec_incore	*recs;
 
@@ -214,6 +217,8 @@  xfs_iwalk_ag_recs(
 				return error;
 		}
 	}
+	iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
+				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
 
 	return 0;
 }
@@ -347,15 +352,17 @@  xfs_iwalk_run_callbacks(
 	struct xfs_mount		*mp = iwag->mp;
 	struct xfs_trans		*tp = iwag->tp;
 	struct xfs_inobt_rec_incore	*irec;
-	xfs_agino_t			restart;
+	xfs_agino_t			next_agino;
 	int				error;
 
+	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
 	ASSERT(iwag->nr_recs > 0);
 
 	/* Delete cursor but remember the last record we cached... */
 	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
 	irec = &iwag->recs[iwag->nr_recs - 1];
-	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+	ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
 
 	error = xfs_iwalk_ag_recs(iwag);
 	if (error)
@@ -372,7 +379,7 @@  xfs_iwalk_run_callbacks(
 	if (error)
 		return error;
 
-	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
 }
 
 /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
@@ -396,6 +403,7 @@  xfs_iwalk_ag(
 
 	while (!error && has_more) {
 		struct xfs_inobt_rec_incore	*irec;
+		xfs_ino_t			rec_fsino;
 
 		cond_resched();
 		if (xfs_pwork_want_abort(&iwag->pwork))
@@ -407,6 +415,15 @@  xfs_iwalk_ag(
 		if (error || !has_more)
 			break;
 
+		/* Make sure that we always move forward. */
+		rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+		if (iwag->lastino != NULLFSINO &&
+		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+			error = -EFSCORRUPTED;
+			goto out;
+		}
+		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
 		/* No allocated inodes in this chunk; skip it. */
 		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
 			error = xfs_btree_increment(cur, 0, &has_more);
@@ -535,6 +552,7 @@  xfs_iwalk(
 		.trim_start	= 1,
 		.skip_empty	= 1,
 		.pwork		= XFS_PWORK_SINGLE_THREADED,
+		.lastino	= NULLFSINO,
 	};
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
 	int			error;
@@ -623,6 +641,7 @@  xfs_iwalk_threaded(
 		iwag->data = data;
 		iwag->startino = startino;
 		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+		iwag->lastino = NULLFSINO;
 		xfs_pwork_queue(&pctl, &iwag->pwork);
 		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
 		if (flags & XFS_INOBT_WALK_SAME_AG)
@@ -696,6 +715,7 @@  xfs_inobt_walk(
 		.startino	= startino,
 		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
 		.pwork		= XFS_PWORK_SINGLE_THREADED,
+		.lastino	= NULLFSINO,
 	};
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
 	int			error;