diff mbox series

[3/4] xfs: create perag structures as soon as possible during log recovery

Message ID 20240910042855.3480387-4-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/4] xfs: pass the exact range to initialize to xfs_initialize_perag | expand

Commit Message

Christoph Hellwig Sept. 10, 2024, 4:28 a.m. UTC
An unclean log can contain both the transaction that created a new
allocation group and the first transaction that is freeing space from
it, in which case the extent free item recovery requires the perag
structure to be present.

Currently the perag structures are only created after log recovery
has completed, leading a warning and file system shutdown for the
above case.  Fix this by creating new perag structures and updating
the in-memory superblock fields as soon a buffer log item that covers
the primary super block is recovered.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_log_recover.h |  2 ++
 fs/xfs/xfs_buf_item_recover.c   | 16 +++++++++
 fs/xfs/xfs_log_recover.c        | 59 ++++++++++++++-------------------
 3 files changed, 43 insertions(+), 34 deletions(-)

Comments

Dave Chinner Sept. 16, 2024, 1:28 a.m. UTC | #1
On Tue, Sep 10, 2024 at 07:28:46AM +0300, Christoph Hellwig wrote:
> An unclean log can contain both the transaction that created a new
> allocation group and the first transaction that is freeing space from
> it, in which case the extent free item recovery requires the perag
> structure to be present.
>
> Currently the perag structures are only created after log recovery
> has completed, leading a warning and file system shutdown for the
> above case.

I'm missing something - the intents aren't processed until the log
has been recovered - queuing an intent to be processed does
not require the per-ag to be present. We don't take per-ag
references until we are recovering the intent. i.e. we've completed
journal recovery and haven't found the corresponding EFD.

That leaves the EFI in the log->r_dfops, and we then run
->recover_work in the second phase of recovery. It is
xfs_extent_free_recover_work() that creates the
new transaction and runs the EFI processing that requires the
perag references, isn't it?

IOWs, I don't see where the initial EFI/EFD recovery during the
checkpoint processing requires the newly created perags to be
present in memory for processing incomplete EFIs before the journal
recovery phase has completed.

> 
> Fix this by creating new perag structures and updating
> the in-memory superblock fields as soon a buffer log item that covers
> the primary super block is recovered.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_log_recover.h |  2 ++
>  fs/xfs/xfs_buf_item_recover.c   | 16 +++++++++
>  fs/xfs/xfs_log_recover.c        | 59 ++++++++++++++-------------------
>  3 files changed, 43 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
> index 521d327e4c89ed..d0e13c84422d0a 100644
> --- a/fs/xfs/libxfs/xfs_log_recover.h
> +++ b/fs/xfs/libxfs/xfs_log_recover.h
> @@ -165,4 +165,6 @@ void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
>  int xlog_recover_finish_intent(struct xfs_trans *tp,
>  		struct xfs_defer_pending *dfp);
>  
> +int xlog_recover_update_agcount(struct xfs_mount *mp, struct xfs_dsb *dsb);
> +
>  #endif	/* __XFS_LOG_RECOVER_H__ */
> diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
> index 09e893cf563cb9..033821a56b6ac6 100644
> --- a/fs/xfs/xfs_buf_item_recover.c
> +++ b/fs/xfs/xfs_buf_item_recover.c
> @@ -969,6 +969,22 @@ xlog_recover_buf_commit_pass2(
>  			goto out_release;
>  	} else {
>  		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
> +
> +		/*
> +		 * Update the in-memory superblock and perag structures from the
> +		 * primary SB buffer.
> +		 *
> +		 * This is required because transactions running after growf
> +		 * s may require in-memory structures like the perag right after
> +		 * committing the growfs transaction that created the underlying
> +		 * objects.
> +		 */
> +		if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
> +		    xfs_buf_daddr(bp) == 0) {
> +			error = xlog_recover_update_agcount(mp, bp->b_addr);
> +			if (error)
> +				goto out_release;
> +		}
>  	}

If we are going to keep this logic, can you do this as a separate
helper function? i.e.:

	if (inode buffer) {
                xlog_recover_do_inode_buffer();
        } else if (dquot buffer) {
                xlog_recover_do_dquot_buffer();
        } else if (superblock buffer) {
		xlog_recover_do_sb_buffer();
	} else {
                xlog_recover_do_reg_buffer();
        }

and

xlog_recover_do_sb_buffer()
{
	error = xlog_recover_do_reg_buffer()
	if (error || xfs_buf_daddr(bp) != XFS_SB_ADDR)
		return error;
	return xlog_recover_update_agcount();
}

>  
>  	/*
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 2af02b32f419c2..7d7ab146cae758 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -3334,6 +3334,30 @@ xlog_do_log_recovery(
>  	return error;
>  }
>  
> +int
> +xlog_recover_update_agcount(
> +	struct xfs_mount		*mp,
> +	struct xfs_dsb			*dsb)
> +{
> +	xfs_agnumber_t			old_agcount = mp->m_sb.sb_agcount;
> +	int				error;
> +
> +	xfs_sb_from_disk(&mp->m_sb, dsb);
> +	if (mp->m_sb.sb_agcount < old_agcount) {
> +		xfs_alert(mp, "Shrinking AG count in log recovery");
> +		return -EFSCORRUPTED;
> +	}
> +	mp->m_features |= xfs_sb_version_to_features(&mp->m_sb);

I'm not sure this is safe. The item order in the checkpoint recovery
isn't guaranteed to be exactly the same as when feature bits are
modified at runtime. Hence there could be items in the checkpoint
that haven't yet been recovered that are dependent on the original
sb feature mask being present.  It may be OK to do this at the end
of the checkpoint being recovered.

I'm also not sure why this feature update code is being changed
because it's not mentioned at all in the commit message.

> +	error = xfs_initialize_perag(mp, old_agcount, mp->m_sb.sb_agcount,
> +			mp->m_sb.sb_dblocks, &mp->m_maxagi);

Why do this if sb_agcount has not changed?  AFAICT it only iterates
the AGs already initialised and so skips them, then recalculates
inode32 and prealloc block parameters, which won't change. Hence
it's a total no-op for anything other than an actual ag count change
and should be skipped, right?

-Dave.
Christoph Hellwig Sept. 18, 2024, 6:11 a.m. UTC | #2
On Mon, Sep 16, 2024 at 11:28:26AM +1000, Dave Chinner wrote:
> I'm missing something - the intents aren't processed until the log
> has been recovered - queuing an intent to be processed does
> not require the per-ag to be present. We don't take per-ag
> references until we are recovering the intent. i.e. we've completed
> journal recovery and haven't found the corresponding EFD.
> 
> That leaves the EFI in the log->r_dfops, and we then run
> ->recover_work in the second phase of recovery. It is
> xfs_extent_free_recover_work() that creates the
> new transaction and runs the EFI processing that requires the
> perag references, isn't it?
> 
> IOWs, I don't see where the initial EFI/EFD recovery during the
> checkpoint processing requires the newly created perags to be
> present in memory for processing incomplete EFIs before the journal
> recovery phase has completed.

So my new test actually blows up before creating intents:

[   81.695529] XFS (nvme1n1): Mounting V5 Filesystem 07057234-4bec-4f17-97c5-420c71c83292
[   81.704541] XFS (nvme1n1): Starting recovery (logdev: internal)
[   81.707260] XFS (nvme1n1): xfs_buf_map_verify: daddr 0x40003 out of range, EOFS 0x40000
[   81.707974] ------------[ cut here ]------------
[   81.708376] WARNING: CPU: 1 PID: 5004 at fs/xfs/xfs_buf.c:553 xfs_buf_get_map+0x8b4/0xb70

Because sb_dblocks hasn't been updated yet.  I'd kinda assume we run
into the intents next, but maybe we don't.  I can try how far just
fixing the sb would get us, but that will potentially gets us into
more problems late the more we actually use the pag structure.

> If we are going to keep this logic, can you do this as a separate
> helper function? i.e.:

I actually did that earlier, and it turned out to create a bit more
boilerplate than I liked, but I can revert to it if there is a strong
preference.

> > +	xfs_sb_from_disk(&mp->m_sb, dsb);
> > +	if (mp->m_sb.sb_agcount < old_agcount) {
> > +		xfs_alert(mp, "Shrinking AG count in log recovery");
> > +		return -EFSCORRUPTED;
> > +	}
> > +	mp->m_features |= xfs_sb_version_to_features(&mp->m_sb);
> 
> I'm not sure this is safe. The item order in the checkpoint recovery
> isn't guaranteed to be exactly the same as when feature bits are
> modified at runtime. Hence there could be items in the checkpoint
> that haven't yet been recovered that are dependent on the original
> sb feature mask being present.  It may be OK to do this at the end
> of the checkpoint being recovered.
> 
> I'm also not sure why this feature update code is being changed
> because it's not mentioned at all in the commit message.

Mostly to keep the features in sync with the in-memory sb fields
updated above.  I'll switch to keep this as-is, but I fail to see how
updating features only after the entire reocvery is done will be safe
for all cases either.

Where would we depend on the old feature setting?

> 
> > +	error = xfs_initialize_perag(mp, old_agcount, mp->m_sb.sb_agcount,
> > +			mp->m_sb.sb_dblocks, &mp->m_maxagi);
> 
> Why do this if sb_agcount has not changed?  AFAICT it only iterates
> the AGs already initialised and so skips them, then recalculates
> inode32 and prealloc block parameters, which won't change. Hence
> it's a total no-op for anything other than an actual ag count change
> and should be skipped, right?

Yes, and the way how xfs_initialize_perag it is an entire no-op.
But I can add an extra explicit check to make that more clear.
Dave Chinner Sept. 19, 2024, 1:09 a.m. UTC | #3
On Wed, Sep 18, 2024 at 08:11:05AM +0200, Christoph Hellwig wrote:
> On Mon, Sep 16, 2024 at 11:28:26AM +1000, Dave Chinner wrote:
> > I'm missing something - the intents aren't processed until the log
> > has been recovered - queuing an intent to be processed does
> > not require the per-ag to be present. We don't take per-ag
> > references until we are recovering the intent. i.e. we've completed
> > journal recovery and haven't found the corresponding EFD.
> > 
> > That leaves the EFI in the log->r_dfops, and we then run
> > ->recover_work in the second phase of recovery. It is
> > xfs_extent_free_recover_work() that creates the
> > new transaction and runs the EFI processing that requires the
> > perag references, isn't it?
> > 
> > IOWs, I don't see where the initial EFI/EFD recovery during the
> > checkpoint processing requires the newly created perags to be
> > present in memory for processing incomplete EFIs before the journal
> > recovery phase has completed.
> 
> So my new test actually blows up before creating intents:
> 
> [   81.695529] XFS (nvme1n1): Mounting V5 Filesystem 07057234-4bec-4f17-97c5-420c71c83292
> [   81.704541] XFS (nvme1n1): Starting recovery (logdev: internal)
> [   81.707260] XFS (nvme1n1): xfs_buf_map_verify: daddr 0x40003 out of range, EOFS 0x40000
> [   81.707974] ------------[ cut here ]------------
> [   81.708376] WARNING: CPU: 1 PID: 5004 at fs/xfs/xfs_buf.c:553 xfs_buf_get_map+0x8b4/0xb70
> 
> Because sb_dblocks hasn't been updated yet.

Hmmmmm. Ok, I can see how this would be an issue, but it's not the
issue the commit message describes :)

....

Oh, this is a -much worse- problem that I thought.

This is a replay for a modification to a new AGFL, and that *must*
only be replayed after the superblock modifications have been
replayed.

Ideally, we should not be using the new AGs until *after* the growfs
transaction has hit stable storage (i.e. the journal has fully
commmitted the growfs transaction), not just committed to the CIL.

If anything can access the new AGs beyond the old EOFS *before* the
growfs transaction is stable in the journal, then we have a nasty
set of race condtions where we can be making modifications to
metadata that is beyond EOFS in the journal *before* we've replayed
the superblock growfs modification.

For example:

growfs task		allocating task			log worker

xfs_trans_set_sync
xfs_trans_commit
  xlog_cil_commit
    <items added to CIL>
    xfs_trans_unreserve_and_mod_sb
      mp->m_sb.sb_agcount += tp->t_agcount_delta;
    ->iop_committing()
      xfs_buf_item_committing
        xfs_buf_item_release
          <superblock buffer unlocked>

<preempt>

			xfs_bmap_btalloc
			  xfs_bmap_btalloc_best_length
			    for_each_perag_wrap(...)
			      <sees new, uncommitted mp->m_sb.sb_agcount>
			      <selects new AG beyond EOFS in journal>
			  <does allocation in new AG beyond EOFS in journal>
			  xfs_trans_commit()
			    xlog_cil_commit()
			      <items added to CIL>
			  ....

							<log state in XLOG_STATE_COVER_NEED>
							xfs_sync_sb(wait)
							  locks sb buffer
							  xfs_trans_set_sync
							  xfs_trans_commit
							    xlog_cil_commit
							      <sb already in CIL>
							      <updates sb item order id>
							    xfs_log_force(SYNC)
							    <pushes CIL>

<runs again>
  xfs_log_force(SYNC)
  <pushes CIL>

This growfs vs allocating task race results in a checkpoint in the
journal where the new AGs are modified in the same checkpoint as the
growfs transaction. This will work if the superblock item is placed
in the journal before the items for the new AGs with your new code.

However...

.... when we add in the log worker (or some other transaction)
updating the superblock again before the CIL is pushed, we now have
a reordering problem. The CIL push will order all the items in the
CIL according to the order id attached to the log item, and this
causes the superblock item to be placed -after- the new AG items.

That will result in the exact recovery error you quoted above,
regardless of the fixes in this series.

<thinks about how to fix it>

I think the first step in avoiding this is ensuring we can't relog
the superblock until the growfs transaction is on disk. We can
do that by explicitly grabbing the superblock buffer and holding it
before we commit the transaction so the superblock buffer remains
locked until xfs_trans_commit() returns. That will prevent races
with other sb updates by blocking them on the sb buffer lock.

The second step is preventing allocations that are running from
seeing the mp->m_sb.sb_agcount update until after the transaction is
stable.

Ok, xfs_trans_mod_sb(XFS_TRANS_SB_AGCOUNT) is only used by the
grwofs code and the first step above means we have explicitly locked
the sb buffer. Hence the whole xfs_trans_mod_sb() -> delta in trans
-> xfs_trans_commit -> xfs_trans_apply_sb_deltas() -> lock sb buffer,
modify sb and log it -> xlog_cil_commit() song and dance routine can
go away.

i.e. We can modify the superblock buffer directly in the growfs
code, and then when xfs_trans_commit() returns we directly update
the in-memory superblock before unlocking the superblock buffer.

This means that nothing can update the superblock before the growfs
transaction is stable in the journal, and nothing at runtime will
see that there are new AGs or space available until after the on
disk state has changed.

This then allows recovery to be able to update the superblock and
perag state after checkpoint processing is complete. All future
checkpoint recoveries will now be guaranteed to see the correct
superblock state, whilst the checkpoint the growfs update is in
is guaranteed to only be dependent on the old superblock state...

I suspect that we might have to move all superblock updates to this
model - we don't update the in-memory state until after the on-disk
update is on stable storage in the journal - that will avoid the
possibility of racing/reordered feature bit update being replayed,
too.

The only issue with this is that I have a nagging memory of holding
the superblock buffer locked across a synchronous transaction could
deadlock the log force in some way. I can't find a reference to that
situation anywhere - maybe it wasn't a real issue, or someone else
remembers that situation better than I do....

> I'd kinda assume we run
> into the intents next, but maybe we don't.

I don't think intents are the problem because they are run after
the superblock and perags are updated.

> > > +	xfs_sb_from_disk(&mp->m_sb, dsb);
> > > +	if (mp->m_sb.sb_agcount < old_agcount) {
> > > +		xfs_alert(mp, "Shrinking AG count in log recovery");
> > > +		return -EFSCORRUPTED;
> > > +	}
> > > +	mp->m_features |= xfs_sb_version_to_features(&mp->m_sb);
> > 
> > I'm not sure this is safe. The item order in the checkpoint recovery
> > isn't guaranteed to be exactly the same as when feature bits are
> > modified at runtime. Hence there could be items in the checkpoint
> > that haven't yet been recovered that are dependent on the original
> > sb feature mask being present.  It may be OK to do this at the end
> > of the checkpoint being recovered.
> > 
> > I'm also not sure why this feature update code is being changed
> > because it's not mentioned at all in the commit message.
> 
> Mostly to keep the features in sync with the in-memory sb fields
> updated above.  I'll switch to keep this as-is, but I fail to see how
> updating features only after the entire reocvery is done will be safe
> for all cases either.
> 
> Where would we depend on the old feature setting?

One possibility is a similar reordering race to what I describe
above; another is simply that the superblock feature update is not
serialised at runtime with anything that checks that feature. Hence
the order of modifications in the journal may not reflect the order
in which the checks and modifications were done at runtime....

Hence I'm not convinced that it is safe to update superblock state
in the middle of a checkpoint - between checkpoints (i.e. after
checkpoint commit record processing) can be made safe (as per
above), but the relogging of items in the CIL makes mid-checkpoint
updates somewhat ... problematic.

-Dave.
Christoph Hellwig Sept. 19, 2024, 7:46 a.m. UTC | #4
On Thu, Sep 19, 2024 at 11:09:40AM +1000, Dave Chinner wrote:
> Ideally, we should not be using the new AGs until *after* the growfs
> transaction has hit stable storage (i.e. the journal has fully
> commmitted the growfs transaction), not just committed to the CIL.

Yes.  A crude version of that - freeze/unfreeze before setting the
AG live was my other initial idea, but Darrick wasn't exactly
excited about that..

> The second step is preventing allocations that are running from
> seeing the mp->m_sb.sb_agcount update until after the transaction is
> stable.

Or just not seeing the pag as active by not setting the initial
active reference until after the transaction is stable.  Given
all the issues outlined by you about sb locking that might be the
easier approach.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 521d327e4c89ed..d0e13c84422d0a 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -165,4 +165,6 @@  void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
 int xlog_recover_finish_intent(struct xfs_trans *tp,
 		struct xfs_defer_pending *dfp);
 
+int xlog_recover_update_agcount(struct xfs_mount *mp, struct xfs_dsb *dsb);
+
 #endif	/* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 09e893cf563cb9..033821a56b6ac6 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -969,6 +969,22 @@  xlog_recover_buf_commit_pass2(
 			goto out_release;
 	} else {
 		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+
+		/*
+		 * Update the in-memory superblock and perag structures from the
+		 * primary SB buffer.
+		 *
+		 * This is required because transactions running after growf
+		 * s may require in-memory structures like the perag right after
+		 * committing the growfs transaction that created the underlying
+		 * objects.
+		 */
+		if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
+		    xfs_buf_daddr(bp) == 0) {
+			error = xlog_recover_update_agcount(mp, bp->b_addr);
+			if (error)
+				goto out_release;
+		}
 	}
 
 	/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2af02b32f419c2..7d7ab146cae758 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3334,6 +3334,30 @@  xlog_do_log_recovery(
 	return error;
 }
 
+int
+xlog_recover_update_agcount(
+	struct xfs_mount		*mp,
+	struct xfs_dsb			*dsb)
+{
+	xfs_agnumber_t			old_agcount = mp->m_sb.sb_agcount;
+	int				error;
+
+	xfs_sb_from_disk(&mp->m_sb, dsb);
+	if (mp->m_sb.sb_agcount < old_agcount) {
+		xfs_alert(mp, "Shrinking AG count in log recovery");
+		return -EFSCORRUPTED;
+	}
+	mp->m_features |= xfs_sb_version_to_features(&mp->m_sb);
+	error = xfs_initialize_perag(mp, old_agcount, mp->m_sb.sb_agcount,
+			mp->m_sb.sb_dblocks, &mp->m_maxagi);
+	if (error) {
+		xfs_warn(mp, "Failed recovery per-ag init: %d", error);
+		return error;
+	}
+	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+	return 0;
+}
+
 /*
  * Do the actual recovery
  */
@@ -3343,10 +3367,6 @@  xlog_do_recover(
 	xfs_daddr_t		head_blk,
 	xfs_daddr_t		tail_blk)
 {
-	struct xfs_mount	*mp = log->l_mp;
-	struct xfs_buf		*bp = mp->m_sb_bp;
-	struct xfs_sb		*sbp = &mp->m_sb;
-	xfs_agnumber_t		old_agcount = sbp->sb_agcount;
 	int			error;
 
 	trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -3371,36 +3391,7 @@  xlog_do_recover(
 	 */
 	xfs_ail_assign_tail_lsn(log->l_ailp);
 
-	/*
-	 * Now that we've finished replaying all buffer and inode updates,
-	 * re-read the superblock and reverify it.
-	 */
-	xfs_buf_lock(bp);
-	xfs_buf_hold(bp);
-	error = _xfs_buf_read(bp, XBF_READ);
-	if (error) {
-		if (!xlog_is_shutdown(log)) {
-			xfs_buf_ioerror_alert(bp, __this_address);
-			ASSERT(0);
-		}
-		xfs_buf_relse(bp);
-		return error;
-	}
-
-	/* Convert superblock from on-disk format */
-	xfs_sb_from_disk(sbp, bp->b_addr);
-	xfs_buf_relse(bp);
-
-	/* re-initialise in-core superblock and geometry structures */
-	mp->m_features |= xfs_sb_version_to_features(sbp);
-	xfs_reinit_percpu_counters(mp);
-	error = xfs_initialize_perag(mp, old_agcount, sbp->sb_agcount,
-			sbp->sb_dblocks, &mp->m_maxagi);
-	if (error) {
-		xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
-		return error;
-	}
-	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+	xfs_reinit_percpu_counters(log->l_mp);
 
 	/* Normal transactions can now occur */
 	clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);