[2/3] xfs: reduce log recovery transaction block reservations
diff mbox series

Message ID 158752130035.2142108.11825776210575708747.stgit@magnolia
State Superseded
Headers show
Series
  • xfs: fix inode use-after-free during log recovery
Related show

Commit Message

Darrick J. Wong April 22, 2020, 2:08 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

On filesystems that support them, bmap intent log items can be used to
change mappings in inode data or attr forks.  However, if the bmbt must
expand, the enormous block reservations that we make for finishing
chains of deferred log items become a liability because the bmbt block
allocator sets minleft to the transaction reservation and there probably
aren't any AGs in the filesystem that have that much free space.

Whereas previously we would reserve 93% of the free blocks in the
filesystem, now we only want to reserve 7/8ths of the free space in the
least full AG, and no more than half of the usable blocks in an AG.  In
theory we shouldn't run out of space because (prior to the unclean
shutdown) all of the in-progress transactions successfully reserved the
worst case number of disk blocks.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_recover.c |   55 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 12 deletions(-)

Comments

Brian Foster April 24, 2020, 2:04 p.m. UTC | #1
On Tue, Apr 21, 2020 at 07:08:20PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> On filesystems that support them, bmap intent log items can be used to
> change mappings in inode data or attr forks.  However, if the bmbt must
> expand, the enormous block reservations that we make for finishing
> chains of deferred log items become a liability because the bmbt block
> allocator sets minleft to the transaction reservation and there probably
> aren't any AGs in the filesystem that have that much free space.
> 
> Whereas previously we would reserve 93% of the free blocks in the
> filesystem, now we only want to reserve 7/8ths of the free space in the
> least full AG, and no more than half of the usable blocks in an AG.  In
> theory we shouldn't run out of space because (prior to the unclean
> shutdown) all of the in-progress transactions successfully reserved the
> worst case number of disk blocks.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_log_recover.c |   55 ++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 43 insertions(+), 12 deletions(-)
> 
> 
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index e9b3e901d009..a416b028b320 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -2669,6 +2669,44 @@ xlog_recover_process_data(
>  	return 0;
>  }
>  
> +/*
> + * Estimate a block reservation for a log recovery transaction.  Since we run
> + * separate transactions for each chain of deferred ops that get created as a
> + * result of recovering unfinished log intent items, we must be careful not to
> + * reserve so many blocks that block allocations fail because we can't satisfy
> + * the minleft requirements (e.g. for bmbt blocks).
> + */
> +static int
> +xlog_estimate_recovery_resblks(
> +	struct xfs_mount	*mp,
> +	unsigned int		*resblks)
> +{
> +	struct xfs_perag	*pag;
> +	xfs_agnumber_t		agno;
> +	unsigned int		free = 0;
> +	int			error;
> +
> +	/* Don't use more than 7/8th of the free space in the least full AG. */
> +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> +		unsigned int	ag_free;
> +
> +		error = xfs_alloc_pagf_init(mp, NULL, agno, 0);
> +		if (error)
> +			return error;
> +		pag = xfs_perag_get(mp, agno);
> +		ag_free = pag->pagf_freeblks + pag->pagf_flcount;
> +		free = max(free, (ag_free * 7) / 8);
> +		xfs_perag_put(pag);
> +	}
> +

Somewhat unfortunate that we have to iterate all AGs for each chain. I'm
wondering if that has any effect on a large recovery on fs' with an
inordinate AG count. Have you tested under those particular conditions?
I suppose it's possible the recovery is slow enough that this won't
matter...

Also, perhaps not caused by this patch but does this
outsized/manufactured reservation have the effect of artificially
steering allocations to a particular AG if one happens to be notably
larger than the rest?

Brian

> +	/* Don't try to reserve more than half the usable AG blocks. */
> +	*resblks = min(free, xfs_alloc_ag_max_usable(mp) / 2);
> +	if (*resblks == 0)
> +		return -ENOSPC;
> +
> +	return 0;
> +}
> +
>  /* Take all the collected deferred ops and finish them in order. */
>  static int
>  xlog_finish_defer_ops(
> @@ -2677,27 +2715,20 @@ xlog_finish_defer_ops(
>  {
>  	struct xfs_defer_freezer *dff, *next;
>  	struct xfs_trans	*tp;
> -	int64_t			freeblks;
>  	uint			resblks;
>  	int			error = 0;
>  
>  	list_for_each_entry_safe(dff, next, dfops_freezers, dff_list) {
> +		error = xlog_estimate_recovery_resblks(mp, &resblks);
> +		if (error)
> +			break;
> +
>  		/*
>  		 * We're finishing the defer_ops that accumulated as a result
>  		 * of recovering unfinished intent items during log recovery.
>  		 * We reserve an itruncate transaction because it is the
> -		 * largest permanent transaction type.  Since we're the only
> -		 * user of the fs right now, take 93% (15/16) of the available
> -		 * free blocks.  Use weird math to avoid a 64-bit division.
> +		 * largest permanent transaction type.
>  		 */
> -		freeblks = percpu_counter_sum(&mp->m_fdblocks);
> -		if (freeblks <= 0) {
> -			error = -ENOSPC;
> -			break;
> -		}
> -
> -		resblks = min_t(int64_t, UINT_MAX, freeblks);
> -		resblks = (resblks * 15) >> 4;
>  		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
>  				0, XFS_TRANS_RESERVE, &tp);
>  		if (error)
>
Darrick J. Wong April 28, 2020, 10:22 p.m. UTC | #2
On Fri, Apr 24, 2020 at 10:04:08AM -0400, Brian Foster wrote:
> On Tue, Apr 21, 2020 at 07:08:20PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > On filesystems that support them, bmap intent log items can be used to
> > change mappings in inode data or attr forks.  However, if the bmbt must
> > expand, the enormous block reservations that we make for finishing
> > chains of deferred log items become a liability because the bmbt block
> > allocator sets minleft to the transaction reservation and there probably
> > aren't any AGs in the filesystem that have that much free space.
> > 
> > Whereas previously we would reserve 93% of the free blocks in the
> > filesystem, now we only want to reserve 7/8ths of the free space in the
> > least full AG, and no more than half of the usable blocks in an AG.  In
> > theory we shouldn't run out of space because (prior to the unclean
> > shutdown) all of the in-progress transactions successfully reserved the
> > worst case number of disk blocks.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/xfs_log_recover.c |   55 ++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index e9b3e901d009..a416b028b320 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -2669,6 +2669,44 @@ xlog_recover_process_data(
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Estimate a block reservation for a log recovery transaction.  Since we run
> > + * separate transactions for each chain of deferred ops that get created as a
> > + * result of recovering unfinished log intent items, we must be careful not to
> > + * reserve so many blocks that block allocations fail because we can't satisfy
> > + * the minleft requirements (e.g. for bmbt blocks).
> > + */
> > +static int
> > +xlog_estimate_recovery_resblks(
> > +	struct xfs_mount	*mp,
> > +	unsigned int		*resblks)
> > +{
> > +	struct xfs_perag	*pag;
> > +	xfs_agnumber_t		agno;
> > +	unsigned int		free = 0;
> > +	int			error;
> > +
> > +	/* Don't use more than 7/8th of the free space in the least full AG. */
> > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > +		unsigned int	ag_free;
> > +
> > +		error = xfs_alloc_pagf_init(mp, NULL, agno, 0);
> > +		if (error)
> > +			return error;
> > +		pag = xfs_perag_get(mp, agno);
> > +		ag_free = pag->pagf_freeblks + pag->pagf_flcount;
> > +		free = max(free, (ag_free * 7) / 8);
> > +		xfs_perag_put(pag);
> > +	}
> > +
> 
> Somewhat unfortunate that we have to iterate all AGs for each chain. I'm
> wondering if that has any effect on a large recovery on fs' with an
> inordinate AG count. Have you tested under those particular conditions?
> I suppose it's possible the recovery is slow enough that this won't
> matter...

I admit I haven't actually looked at that.  A more precise way to handle
this would be for each intent recovery function to store its own worst
case resblks estimation in the recovery freezer so that we'd be using
roughly the same space requirements as the pre-crash transaction, but
that's also a lot more complicated than this kludge.

> Also, perhaps not caused by this patch but does this
> outsized/manufactured reservation have the effect of artificially
> steering allocations to a particular AG if one happens to be notably
> larger than the rest?

It tends to steer allocations towards whichever AGs were less full at
the start of each transaction.  Were we to shift to scanning the AGs
once for the entire recovery cycle, I think I'd probably pick a smaller
ratio.

--D

> Brian
> 
> > +	/* Don't try to reserve more than half the usable AG blocks. */
> > +	*resblks = min(free, xfs_alloc_ag_max_usable(mp) / 2);
> > +	if (*resblks == 0)
> > +		return -ENOSPC;
> > +
> > +	return 0;
> > +}
> > +
> >  /* Take all the collected deferred ops and finish them in order. */
> >  static int
> >  xlog_finish_defer_ops(
> > @@ -2677,27 +2715,20 @@ xlog_finish_defer_ops(
> >  {
> >  	struct xfs_defer_freezer *dff, *next;
> >  	struct xfs_trans	*tp;
> > -	int64_t			freeblks;
> >  	uint			resblks;
> >  	int			error = 0;
> >  
> >  	list_for_each_entry_safe(dff, next, dfops_freezers, dff_list) {
> > +		error = xlog_estimate_recovery_resblks(mp, &resblks);
> > +		if (error)
> > +			break;
> > +
> >  		/*
> >  		 * We're finishing the defer_ops that accumulated as a result
> >  		 * of recovering unfinished intent items during log recovery.
> >  		 * We reserve an itruncate transaction because it is the
> > -		 * largest permanent transaction type.  Since we're the only
> > -		 * user of the fs right now, take 93% (15/16) of the available
> > -		 * free blocks.  Use weird math to avoid a 64-bit division.
> > +		 * largest permanent transaction type.
> >  		 */
> > -		freeblks = percpu_counter_sum(&mp->m_fdblocks);
> > -		if (freeblks <= 0) {
> > -			error = -ENOSPC;
> > -			break;
> > -		}
> > -
> > -		resblks = min_t(int64_t, UINT_MAX, freeblks);
> > -		resblks = (resblks * 15) >> 4;
> >  		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
> >  				0, XFS_TRANS_RESERVE, &tp);
> >  		if (error)
> > 
>
Darrick J. Wong May 27, 2020, 10:39 p.m. UTC | #3
On Tue, Apr 28, 2020 at 03:22:47PM -0700, Darrick J. Wong wrote:
> On Fri, Apr 24, 2020 at 10:04:08AM -0400, Brian Foster wrote:
> > On Tue, Apr 21, 2020 at 07:08:20PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > On filesystems that support them, bmap intent log items can be used to
> > > change mappings in inode data or attr forks.  However, if the bmbt must
> > > expand, the enormous block reservations that we make for finishing
> > > chains of deferred log items become a liability because the bmbt block
> > > allocator sets minleft to the transaction reservation and there probably
> > > aren't any AGs in the filesystem that have that much free space.
> > > 
> > > Whereas previously we would reserve 93% of the free blocks in the
> > > filesystem, now we only want to reserve 7/8ths of the free space in the
> > > least full AG, and no more than half of the usable blocks in an AG.  In
> > > theory we shouldn't run out of space because (prior to the unclean
> > > shutdown) all of the in-progress transactions successfully reserved the
> > > worst case number of disk blocks.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/xfs_log_recover.c |   55 ++++++++++++++++++++++++++++++++++++----------
> > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > > index e9b3e901d009..a416b028b320 100644
> > > --- a/fs/xfs/xfs_log_recover.c
> > > +++ b/fs/xfs/xfs_log_recover.c
> > > @@ -2669,6 +2669,44 @@ xlog_recover_process_data(
> > >  	return 0;
> > >  }
> > >  
> > > +/*
> > > + * Estimate a block reservation for a log recovery transaction.  Since we run
> > > + * separate transactions for each chain of deferred ops that get created as a
> > > + * result of recovering unfinished log intent items, we must be careful not to
> > > + * reserve so many blocks that block allocations fail because we can't satisfy
> > > + * the minleft requirements (e.g. for bmbt blocks).
> > > + */
> > > +static int
> > > +xlog_estimate_recovery_resblks(
> > > +	struct xfs_mount	*mp,
> > > +	unsigned int		*resblks)
> > > +{
> > > +	struct xfs_perag	*pag;
> > > +	xfs_agnumber_t		agno;
> > > +	unsigned int		free = 0;
> > > +	int			error;
> > > +
> > > +	/* Don't use more than 7/8th of the free space in the least full AG. */
> > > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > > +		unsigned int	ag_free;
> > > +
> > > +		error = xfs_alloc_pagf_init(mp, NULL, agno, 0);
> > > +		if (error)
> > > +			return error;
> > > +		pag = xfs_perag_get(mp, agno);
> > > +		ag_free = pag->pagf_freeblks + pag->pagf_flcount;
> > > +		free = max(free, (ag_free * 7) / 8);
> > > +		xfs_perag_put(pag);
> > > +	}
> > > +
> > 
> > Somewhat unfortunate that we have to iterate all AGs for each chain. I'm
> > wondering if that has any effect on a large recovery on fs' with an
> > inordinate AG count. Have you tested under those particular conditions?
> > I suppose it's possible the recovery is slow enough that this won't
> > matter...
> 
> I admit I haven't actually looked at that.  A more precise way to handle
> this would be for each intent recovery function to store its own worst
> case resblks estimation in the recovery freezer so that we'd be using
> roughly the same space requirements as the pre-crash transaction, but
> that's also a lot more complicated than this kludge.

I've been testing a new patch that rips out all of this in favor of
stealing the unused block reservation from the transaction that the log
item recovery function creates instead of this hokey AG iteration mess.
The results look promising, and I think I'll do this instead.

Granted, this now means that log intent item recovery must be very
careful to reserve enough blocks for the entire chain of operations that
could be created.  This shouldn't be too heavy a lift since we don't
have that many intent types yet, and at least at first glance the
resblks we ask for now looked ok to me.

--D

> > Also, perhaps not caused by this patch but does this
> > outsized/manufactured reservation have the effect of artificially
> > steering allocations to a particular AG if one happens to be notably
> > larger than the rest?
> 
> It tends to steer allocations towards whichever AGs were less full at
> the start of each transaction.  Were we to shift to scanning the AGs
> once for the entire recovery cycle, I think I'd probably pick a smaller
> ratio.
> 
> --D
> 
> > Brian
> > 
> > > +	/* Don't try to reserve more than half the usable AG blocks. */
> > > +	*resblks = min(free, xfs_alloc_ag_max_usable(mp) / 2);
> > > +	if (*resblks == 0)
> > > +		return -ENOSPC;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  /* Take all the collected deferred ops and finish them in order. */
> > >  static int
> > >  xlog_finish_defer_ops(
> > > @@ -2677,27 +2715,20 @@ xlog_finish_defer_ops(
> > >  {
> > >  	struct xfs_defer_freezer *dff, *next;
> > >  	struct xfs_trans	*tp;
> > > -	int64_t			freeblks;
> > >  	uint			resblks;
> > >  	int			error = 0;
> > >  
> > >  	list_for_each_entry_safe(dff, next, dfops_freezers, dff_list) {
> > > +		error = xlog_estimate_recovery_resblks(mp, &resblks);
> > > +		if (error)
> > > +			break;
> > > +
> > >  		/*
> > >  		 * We're finishing the defer_ops that accumulated as a result
> > >  		 * of recovering unfinished intent items during log recovery.
> > >  		 * We reserve an itruncate transaction because it is the
> > > -		 * largest permanent transaction type.  Since we're the only
> > > -		 * user of the fs right now, take 93% (15/16) of the available
> > > -		 * free blocks.  Use weird math to avoid a 64-bit division.
> > > +		 * largest permanent transaction type.
> > >  		 */
> > > -		freeblks = percpu_counter_sum(&mp->m_fdblocks);
> > > -		if (freeblks <= 0) {
> > > -			error = -ENOSPC;
> > > -			break;
> > > -		}
> > > -
> > > -		resblks = min_t(int64_t, UINT_MAX, freeblks);
> > > -		resblks = (resblks * 15) >> 4;
> > >  		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
> > >  				0, XFS_TRANS_RESERVE, &tp);
> > >  		if (error)
> > > 
> >

Patch
diff mbox series

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e9b3e901d009..a416b028b320 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2669,6 +2669,44 @@  xlog_recover_process_data(
 	return 0;
 }
 
+/*
+ * Estimate a block reservation for a log recovery transaction.  Since we run
+ * separate transactions for each chain of deferred ops that get created as a
+ * result of recovering unfinished log intent items, we must be careful not to
+ * reserve so many blocks that block allocations fail because we can't satisfy
+ * the minleft requirements (e.g. for bmbt blocks).
+ */
+static int
+xlog_estimate_recovery_resblks(
+	struct xfs_mount	*mp,
+	unsigned int		*resblks)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	unsigned int		free = 0;
+	int			error;
+
+	/* Don't use more than 7/8th of the free space in the least full AG. */
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		unsigned int	ag_free;
+
+		error = xfs_alloc_pagf_init(mp, NULL, agno, 0);
+		if (error)
+			return error;
+		pag = xfs_perag_get(mp, agno);
+		ag_free = pag->pagf_freeblks + pag->pagf_flcount;
+		free = max(free, (ag_free * 7) / 8);
+		xfs_perag_put(pag);
+	}
+
+	/* Don't try to reserve more than half the usable AG blocks. */
+	*resblks = min(free, xfs_alloc_ag_max_usable(mp) / 2);
+	if (*resblks == 0)
+		return -ENOSPC;
+
+	return 0;
+}
+
 /* Take all the collected deferred ops and finish them in order. */
 static int
 xlog_finish_defer_ops(
@@ -2677,27 +2715,20 @@  xlog_finish_defer_ops(
 {
 	struct xfs_defer_freezer *dff, *next;
 	struct xfs_trans	*tp;
-	int64_t			freeblks;
 	uint			resblks;
 	int			error = 0;
 
 	list_for_each_entry_safe(dff, next, dfops_freezers, dff_list) {
+		error = xlog_estimate_recovery_resblks(mp, &resblks);
+		if (error)
+			break;
+
 		/*
 		 * We're finishing the defer_ops that accumulated as a result
 		 * of recovering unfinished intent items during log recovery.
 		 * We reserve an itruncate transaction because it is the
-		 * largest permanent transaction type.  Since we're the only
-		 * user of the fs right now, take 93% (15/16) of the available
-		 * free blocks.  Use weird math to avoid a 64-bit division.
+		 * largest permanent transaction type.
 		 */
-		freeblks = percpu_counter_sum(&mp->m_fdblocks);
-		if (freeblks <= 0) {
-			error = -ENOSPC;
-			break;
-		}
-
-		resblks = min_t(int64_t, UINT_MAX, freeblks);
-		resblks = (resblks * 15) >> 4;
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
 				0, XFS_TRANS_RESERVE, &tp);
 		if (error)