diff mbox series

[3/6] xfs: log EFIs for all btree blocks being used to stage a btree

Message ID 169049623218.921279.10028914723578681696.stgit@frogsfrogsfrogs (mailing list archive)
State Superseded, archived
Headers show
Series xfs: prepare repair for bulk loading | expand

Commit Message

Darrick J. Wong July 27, 2023, 10:24 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

We need to log EFIs for every extent that we allocate for the purpose of
staging a new btree so that if we fail then the blocks will be freed
during log recovery.  Add a function to relog the EFIs, so that repair
can relog them all every time it creates a new btree block, which will
help us to avoid pinning the log tail.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/newbt.c  |  147 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/newbt.h  |    4 +
 fs/xfs/scrub/repair.c |   10 +++
 fs/xfs/scrub/repair.h |    1 
 4 files changed, 162 insertions(+)

Comments

Dave Chinner Aug. 7, 2023, 8:41 a.m. UTC | #1
On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> We need to log EFIs for every extent that we allocate for the purpose of
> staging a new btree so that if we fail then the blocks will be freed
> during log recovery.  Add a function to relog the EFIs, so that repair
> can relog them all every time it creates a new btree block, which will
> help us to avoid pinning the log tail.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
.....
> +/*
> + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> + * case we crash by logging a deferred free item for each extent we allocate so
> + * that we can get all of the space back if we crash before we can commit the
> + * new btree.  This function returns a token that can be used to cancel
> + * automatic reaping if repair is successful.
> + */
> +static int
> +xrep_newbt_schedule_autoreap(
> +	struct xrep_newbt		*xnr,
> +	struct xrep_newbt_resv		*resv)
> +{
> +	struct xfs_extent_free_item	efi_item = {
> +		.xefi_blockcount	= resv->len,
> +		.xefi_owner		= xnr->oinfo.oi_owner,
> +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> +		.xefi_pag		= resv->pag,
> +	};
> +	struct xfs_scrub		*sc = xnr->sc;
> +	struct xfs_log_item		*lip;
> +	LIST_HEAD(items);
> +
> +	ASSERT(xnr->oinfo.oi_offset == 0);
> +
> +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> +			resv->agbno);
> +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> +
> +	INIT_LIST_HEAD(&efi_item.xefi_list);
> +	list_add(&efi_item.xefi_list, &items);
> +
> +	xfs_perag_intent_hold(resv->pag);
> +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> +			false);

Hmmmm.

That triggered flashing lights and sirens - I'm not sure I really
like the usage of the defer type arrays like this, nor the
duplication of the defer mechanisms for relogging, etc.

Not that I have a better idea right now - is this the final form of
this code, or is more stuff built on top of it or around it?

-Dave.
Darrick J. Wong Aug. 8, 2023, 12:54 a.m. UTC | #2
On Mon, Aug 07, 2023 at 06:41:39PM +1000, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > We need to log EFIs for every extent that we allocate for the purpose of
> > staging a new btree so that if we fail then the blocks will be freed
> > during log recovery.  Add a function to relog the EFIs, so that repair
> > can relog them all every time it creates a new btree block, which will
> > help us to avoid pinning the log tail.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> .....
> > +/*
> > + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> > + * case we crash by logging a deferred free item for each extent we allocate so
> > + * that we can get all of the space back if we crash before we can commit the
> > + * new btree.  This function returns a token that can be used to cancel
> > + * automatic reaping if repair is successful.
> > + */
> > +static int
> > +xrep_newbt_schedule_autoreap(
> > +	struct xrep_newbt		*xnr,
> > +	struct xrep_newbt_resv		*resv)
> > +{
> > +	struct xfs_extent_free_item	efi_item = {
> > +		.xefi_blockcount	= resv->len,
> > +		.xefi_owner		= xnr->oinfo.oi_owner,
> > +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> > +		.xefi_pag		= resv->pag,
> > +	};
> > +	struct xfs_scrub		*sc = xnr->sc;
> > +	struct xfs_log_item		*lip;
> > +	LIST_HEAD(items);
> > +
> > +	ASSERT(xnr->oinfo.oi_offset == 0);
> > +
> > +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> > +			resv->agbno);
> > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> > +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> > +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> > +
> > +	INIT_LIST_HEAD(&efi_item.xefi_list);
> > +	list_add(&efi_item.xefi_list, &items);
> > +
> > +	xfs_perag_intent_hold(resv->pag);
> > +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> > +			false);
> 
> Hmmmm.
> 
> That triggered flashing lights and sirens - I'm not sure I really
> like the usage of the defer type arrays like this, nor the
> duplication of the defer mechanisms for relogging, etc.

Yeah, I don't quite like manually tromping through the defer ops state
machine here either.  Everywhere /else/ in XFS logs an EFI and finishes
it to free the space.  Just to make sure we're on the same page, newbt
will allocate space, log an EFI, and then:

1. Use the space and log an EFD for the space to cancel the EFI
2. Use some of the space, log an EFD for the space we used, immediately
   log a new EFI for the unused parts, and finish the new EFI manually
3. Don't use any of the space at all, and finish the EFI manually

Initially, I tried using the regular defer ops mechanism, but this got
messy on account of having to extern most of xfs_defer.c so that I could
manually modify the defer ops state.  It's hard to generalize this,
since there's only *one* place that actually needs manual flow control.

ISTR that was around the time bfoster and I were reworking log intent
item recovery, and it was easier to do this outside of the defer ops
code than try to refactor it and keep this exceptional piece working
too.

> Not that I have a better idea right now - is this the final form of
> this code, or is more stuff built on top of it or around it?

That's the final form of it.  The good news is that it's been stable
enough despite me tearing into the EFI code again in the rt
modernization patchset.  Do you have any further suggestions?

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Aug. 8, 2023, 6:11 a.m. UTC | #3
On Mon, Aug 07, 2023 at 05:54:52PM -0700, Darrick J. Wong wrote:
> On Mon, Aug 07, 2023 at 06:41:39PM +1000, Dave Chinner wrote:
> > On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > We need to log EFIs for every extent that we allocate for the purpose of
> > > staging a new btree so that if we fail then the blocks will be freed
> > > during log recovery.  Add a function to relog the EFIs, so that repair
> > > can relog them all every time it creates a new btree block, which will
> > > help us to avoid pinning the log tail.
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > .....
> > > +/*
> > > + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> > > + * case we crash by logging a deferred free item for each extent we allocate so
> > > + * that we can get all of the space back if we crash before we can commit the
> > > + * new btree.  This function returns a token that can be used to cancel
> > > + * automatic reaping if repair is successful.
> > > + */
> > > +static int
> > > +xrep_newbt_schedule_autoreap(
> > > +	struct xrep_newbt		*xnr,
> > > +	struct xrep_newbt_resv		*resv)
> > > +{
> > > +	struct xfs_extent_free_item	efi_item = {
> > > +		.xefi_blockcount	= resv->len,
> > > +		.xefi_owner		= xnr->oinfo.oi_owner,
> > > +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> > > +		.xefi_pag		= resv->pag,
> > > +	};
> > > +	struct xfs_scrub		*sc = xnr->sc;
> > > +	struct xfs_log_item		*lip;
> > > +	LIST_HEAD(items);
> > > +
> > > +	ASSERT(xnr->oinfo.oi_offset == 0);
> > > +
> > > +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> > > +			resv->agbno);
> > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> > > +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> > > +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> > > +
> > > +	INIT_LIST_HEAD(&efi_item.xefi_list);
> > > +	list_add(&efi_item.xefi_list, &items);
> > > +
> > > +	xfs_perag_intent_hold(resv->pag);
> > > +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> > > +			false);
> > 
> > Hmmmm.
> > 
> > That triggered flashing lights and sirens - I'm not sure I really
> > like the usage of the defer type arrays like this, nor the
> > duplication of the defer mechanisms for relogging, etc.
> 
> Yeah, I don't quite like manually tromping through the defer ops state
> machine here either.  Everywhere /else/ in XFS logs an EFI and finishes
> it to free the space.  Just to make sure we're on the same page, newbt
> will allocate space, log an EFI, and then:
> 
> 1. Use the space and log an EFD for the space to cancel the EFI
> 2. Use some of the space, log an EFD for the space we used, immediately
>    log a new EFI for the unused parts, and finish the new EFI manually
> 3. Don't use any of the space at all, and finish the EFI manually
> 
> Initially, I tried using the regular defer ops mechanism, but this got
> messy on account of having to extern most of xfs_defer.c so that I could
> manually modify the defer ops state.  It's hard to generalize this,
> since there's only *one* place that actually needs manual flow control.

*nod*

But I can't help but think it's a manifestation of a generic
optimisation that could allow us to avoid needing to use unwritten
extents for new data alloations...

> ISTR that was around the time bfoster and I were reworking log intent
> item recovery, and it was easier to do this outside of the defer ops
> code than try to refactor it and keep this exceptional piece working
> too.
> 
> > Not that I have a better idea right now - is this the final form of
> > this code, or is more stuff built on top of it or around it?
> 
> That's the final form of it.  The good news is that it's been stable
> enough despite me tearing into the EFI code again in the rt
> modernization patchset.  Do you have any further suggestions?

Not for the patchset as it stands.

-Dave.
Darrick J. Wong Aug. 9, 2023, 11:52 p.m. UTC | #4
On Tue, Aug 08, 2023 at 04:11:13PM +1000, Dave Chinner wrote:
> On Mon, Aug 07, 2023 at 05:54:52PM -0700, Darrick J. Wong wrote:
> > On Mon, Aug 07, 2023 at 06:41:39PM +1000, Dave Chinner wrote:
> > > On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > 
> > > > We need to log EFIs for every extent that we allocate for the purpose of
> > > > staging a new btree so that if we fail then the blocks will be freed
> > > > during log recovery.  Add a function to relog the EFIs, so that repair
> > > > can relog them all every time it creates a new btree block, which will
> > > > help us to avoid pinning the log tail.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > .....
> > > > +/*
> > > > + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> > > > + * case we crash by logging a deferred free item for each extent we allocate so
> > > > + * that we can get all of the space back if we crash before we can commit the
> > > > + * new btree.  This function returns a token that can be used to cancel
> > > > + * automatic reaping if repair is successful.
> > > > + */
> > > > +static int
> > > > +xrep_newbt_schedule_autoreap(
> > > > +	struct xrep_newbt		*xnr,
> > > > +	struct xrep_newbt_resv		*resv)
> > > > +{
> > > > +	struct xfs_extent_free_item	efi_item = {
> > > > +		.xefi_blockcount	= resv->len,
> > > > +		.xefi_owner		= xnr->oinfo.oi_owner,
> > > > +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> > > > +		.xefi_pag		= resv->pag,
> > > > +	};
> > > > +	struct xfs_scrub		*sc = xnr->sc;
> > > > +	struct xfs_log_item		*lip;
> > > > +	LIST_HEAD(items);
> > > > +
> > > > +	ASSERT(xnr->oinfo.oi_offset == 0);
> > > > +
> > > > +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> > > > +			resv->agbno);
> > > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> > > > +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> > > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> > > > +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> > > > +
> > > > +	INIT_LIST_HEAD(&efi_item.xefi_list);
> > > > +	list_add(&efi_item.xefi_list, &items);
> > > > +
> > > > +	xfs_perag_intent_hold(resv->pag);
> > > > +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> > > > +			false);
> > > 
> > > Hmmmm.
> > > 
> > > That triggered flashing lights and sirens - I'm not sure I really
> > > like the usage of the defer type arrays like this, nor the
> > > duplication of the defer mechanisms for relogging, etc.
> > 
> > Yeah, I don't quite like manually tromping through the defer ops state
> > machine here either.  Everywhere /else/ in XFS logs an EFI and finishes
> > it to free the space.  Just to make sure we're on the same page, newbt
> > will allocate space, log an EFI, and then:
> > 
> > 1. Use the space and log an EFD for the space to cancel the EFI
> > 2. Use some of the space, log an EFD for the space we used, immediately
> >    log a new EFI for the unused parts, and finish the new EFI manually
> > 3. Don't use any of the space at all, and finish the EFI manually
> > 
> > Initially, I tried using the regular defer ops mechanism, but this got
> > messy on account of having to extern most of xfs_defer.c so that I could
> > manually modify the defer ops state.  It's hard to generalize this,
> > since there's only *one* place that actually needs manual flow control.
> 
> *nod*
> 
> But I can't help but think it's a manifestation of a generic
> optimisation that could allow us to avoid needing to use unwritten
> extents for new data alloations...

I've thought about this usecase at various points in the lifetime of the
newbt.c code.  The usecases are indeed very similar -- speculatively
allocate some disk blocks, write to them, and either map them into the
data fork / btree root if the writes actually succeed; or free them
because it failed.

However, I think we could eliminate the overhead of the speculative
allocation out of the bnobt/cnbt (aka step 1) by boosting all of the
tracking to the incore data structures.  Handwaving sketch:

1. Find the extent we want from the free space btrees, and add a record
to the busy extent list with some new state flag that signals
"speculative write: do not DISCARD this extent, and allocating callers
should move on".

(Not sure what happens if the allocating caller /never/ finds space?
Does kicking writeback make sense here?  I am not sure it does.)

2. Create a mapping in the cow fork for the speculatively allocated
space, along with an annotation to that effect.  Also need to absorb
whatever space we reserved in the delalloc mapping for bmbt expansion.

3. Write the blocks.

4. If the write succeeds, we do the cow remap like we do now, but also
remove the extent from the ondisk free space btrees and clear the space
from the busy extent list.

5. If the write fails, clear the space from busy extent list.  Maybe add
the space to a badblocks list(??)

Under this scheme, the only ondisk metadata update is step 4.  I really
like the idea of porting newbt.c to use a mechanism like this, since
the only time we touch the log is if the repair succeeds.

Too bad it doesn't exist yet! :)

For now, the oddball use of EFIs is limited to newbt.c, which means it's
self-contained inside repair.  Except for the "too many extents attached
to an EFI" issue, I think it works well enough to put into use until you
or I have time to figure out how to turn either of our "unwritten extent
for new data allocation" sketches into reality.

(IOWs, I'm trying /not/ to go carving around in the allocator and the
extent busy list when there's already so much to think about. ;))

> > ISTR that was around the time bfoster and I were reworking log intent
> > item recovery, and it was easier to do this outside of the defer ops
> > code than try to refactor it and keep this exceptional piece working
> > too.
> > 
> > > Not that I have a better idea right now - is this the final form of
> > > this code, or is more stuff built on top of it or around it?
> > 
> > That's the final form of it.  The good news is that it's been stable
> > enough despite me tearing into the EFI code again in the rt
> > modernization patchset.  Do you have any further suggestions?
> 
> Not for the patchset as it stands.

<nod> I'll add some monitoring to report the maximum extent counts that
get added to EFIs, and work on something to constrain the number of
extents that get added to a single EFI log item that's coming from
repair.

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Darrick J. Wong Aug. 10, 2023, 8:36 p.m. UTC | #5
On Wed, Aug 09, 2023 at 04:52:34PM -0700, Darrick J. Wong wrote:
> On Tue, Aug 08, 2023 at 04:11:13PM +1000, Dave Chinner wrote:
> > On Mon, Aug 07, 2023 at 05:54:52PM -0700, Darrick J. Wong wrote:
> > > On Mon, Aug 07, 2023 at 06:41:39PM +1000, Dave Chinner wrote:
> > > > On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <djwong@kernel.org>
> > > > > 
> > > > > We need to log EFIs for every extent that we allocate for the purpose of
> > > > > staging a new btree so that if we fail then the blocks will be freed
> > > > > during log recovery.  Add a function to relog the EFIs, so that repair
> > > > > can relog them all every time it creates a new btree block, which will
> > > > > help us to avoid pinning the log tail.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > > .....
> > > > > +/*
> > > > > + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> > > > > + * case we crash by logging a deferred free item for each extent we allocate so
> > > > > + * that we can get all of the space back if we crash before we can commit the
> > > > > + * new btree.  This function returns a token that can be used to cancel
> > > > > + * automatic reaping if repair is successful.
> > > > > + */
> > > > > +static int
> > > > > +xrep_newbt_schedule_autoreap(
> > > > > +	struct xrep_newbt		*xnr,
> > > > > +	struct xrep_newbt_resv		*resv)
> > > > > +{
> > > > > +	struct xfs_extent_free_item	efi_item = {
> > > > > +		.xefi_blockcount	= resv->len,
> > > > > +		.xefi_owner		= xnr->oinfo.oi_owner,
> > > > > +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> > > > > +		.xefi_pag		= resv->pag,
> > > > > +	};
> > > > > +	struct xfs_scrub		*sc = xnr->sc;
> > > > > +	struct xfs_log_item		*lip;
> > > > > +	LIST_HEAD(items);
> > > > > +
> > > > > +	ASSERT(xnr->oinfo.oi_offset == 0);
> > > > > +
> > > > > +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> > > > > +			resv->agbno);
> > > > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> > > > > +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> > > > > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> > > > > +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> > > > > +
> > > > > +	INIT_LIST_HEAD(&efi_item.xefi_list);
> > > > > +	list_add(&efi_item.xefi_list, &items);
> > > > > +
> > > > > +	xfs_perag_intent_hold(resv->pag);
> > > > > +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> > > > > +			false);
> > > > 
> > > > Hmmmm.
> > > > 
> > > > That triggered flashing lights and sirens - I'm not sure I really
> > > > like the usage of the defer type arrays like this, nor the
> > > > duplication of the defer mechanisms for relogging, etc.
> > > 
> > > Yeah, I don't quite like manually tromping through the defer ops state
> > > machine here either.  Everywhere /else/ in XFS logs an EFI and finishes
> > > it to free the space.  Just to make sure we're on the same page, newbt
> > > will allocate space, log an EFI, and then:
> > > 
> > > 1. Use the space and log an EFD for the space to cancel the EFI
> > > 2. Use some of the space, log an EFD for the space we used, immediately
> > >    log a new EFI for the unused parts, and finish the new EFI manually
> > > 3. Don't use any of the space at all, and finish the EFI manually
> > > 
> > > Initially, I tried using the regular defer ops mechanism, but this got
> > > messy on account of having to extern most of xfs_defer.c so that I could
> > > manually modify the defer ops state.  It's hard to generalize this,
> > > since there's only *one* place that actually needs manual flow control.
> > 
> > *nod*
> > 
> > But I can't help but think it's a manifestation of a generic
> > optimisation that could allow us to avoid needing to use unwritten
> > extents for new data alloations...
> 
> I've thought about this usecase at various points in the lifetime of the
> newbt.c code.  The usecases are indeed very similar -- speculatively
> allocate some disk blocks, write to them, and either map them into the
> data fork / btree root if the writes actually succeed; or free them
> because it failed.
> 
> However, I think we could eliminate the overhead of the speculative
> allocation out of the bnobt/cnbt (aka step 1) by boosting all of the
> tracking to the incore data structures.  Handwaving sketch:
> 
> 1. Find the extent we want from the free space btrees, and add a record
> to the busy extent list with some new state flag that signals
> "speculative write: do not DISCARD this extent, and allocating callers
> should move on".
> 
> (Not sure what happens if the allocating caller /never/ finds space?
> Does kicking writeback make sense here?  I am not sure it does.)
> 
> 2. Create a mapping in the cow fork for the speculatively allocated
> space, along with an annotation to that effect.  Also need to absorb
> whatever space we reserved in the delalloc mapping for bmbt expansion.
> 
> 3. Write the blocks.
> 
> 4. If the write succeeds, we do the cow remap like we do now, but also
> remove the extent from the ondisk free space btrees and clear the space
> from the busy extent list.
> 
> 5. If the write fails, clear the space from busy extent list.  Maybe add
> the space to a badblocks list(??)
> 
> Under this scheme, the only ondisk metadata update is step 4.  I really
> like the idea of porting newbt.c to use a mechanism like this, since
> the only time we touch the log is if the repair succeeds.
> 
> Too bad it doesn't exist yet! :)
> 
> For now, the oddball use of EFIs is limited to newbt.c, which means it's
> self-contained inside repair.  Except for the "too many extents attached
> to an EFI" issue, I think it works well enough to put into use until you
> or I have time to figure out how to turn either of our "unwritten extent
> for new data allocation" sketches into reality.
> 
> (IOWs, I'm trying /not/ to go carving around in the allocator and the
> extent busy list when there's already so much to think about. ;))
> 
> > > ISTR that was around the time bfoster and I were reworking log intent
> > > item recovery, and it was easier to do this outside of the defer ops
> > > code than try to refactor it and keep this exceptional piece working
> > > too.
> > > 
> > > > Not that I have a better idea right now - is this the final form of
> > > > this code, or is more stuff built on top of it or around it?
> > > 
> > > That's the final form of it.  The good news is that it's been stable
> > > enough despite me tearing into the EFI code again in the rt
> > > modernization patchset.  Do you have any further suggestions?
> > 
> > Not for the patchset as it stands.
> 
> <nod> I'll add some monitoring to report the maximum extent counts that
> get added to EFIs, and work on something to constrain the number of
> extents that get added to a single EFI log item that's coming from
> repair.

My debug patch kept a per-mount maximum EFI extent count, and logged
whenever an EFI got logged with a higher extent count.  From last
night's fstests run, I saw this:

  11297 EFI MAX DEPTH bumped to 2 (RUNTIME)
      8 EFI MAX DEPTH bumped to 2 (RECOVERY)
   2598 EFI MAX DEPTH bumped to 3 (RUNTIME)
      3 EFI MAX DEPTH bumped to 3 (RECOVERY)
    216 EFI MAX DEPTH bumped to 4 (RUNTIME)
    100 EFI MAX DEPTH bumped to 5 (RUNTIME)
    862 EFI MAX DEPTH bumped to 6 (RUNTIME)
     39 EFI MAX DEPTH bumped to 7 (RUNTIME)
     22 EFI MAX DEPTH bumped to 8 (RUNTIME)
     40 EFI MAX DEPTH bumped to 9 (RUNTIME)
     26 EFI MAX DEPTH bumped to 10 (RUNTIME)
     17 EFI MAX DEPTH bumped to 11 (RUNTIME)
     14 EFI MAX DEPTH bumped to 12 (RUNTIME)
     42 EFI MAX DEPTH bumped to 13 (RUNTIME)
      5 EFI MAX DEPTH bumped to 14 (RUNTIME)
      5 EFI MAX DEPTH bumped to 15 (RUNTIME)
    509 EFI MAX DEPTH bumped to 16 (RUNTIME)

So I guess we /do/ see a healthy(?) number of EFIs with more than 4
extents attached, and more bumps to 16 than I expected.  Granted there's
a lot of mkfs action in fstests, so this still isn't capturing what
might happen if there was fragmentation ahoy.

Well ok the alwayscow=1 profile did this:

    170 EFI MAX DEPTH bumped to 2 (RUNTIME)
    133 EFI MAX DEPTH bumped to 3 (RUNTIME)
     24 EFI MAX DEPTH bumped to 4 (RUNTIME)
      9 EFI MAX DEPTH bumped to 5 (RUNTIME)
     29 EFI MAX DEPTH bumped to 6 (RUNTIME)
      1 EFI MAX DEPTH bumped to 7 (RUNTIME)
      2 EFI MAX DEPTH bumped to 9 (RUNTIME)
      1 EFI MAX DEPTH bumped to 10 (RUNTIME)
      1 EFI MAX DEPTH bumped to 11 (RUNTIME)
      1 EFI MAX DEPTH bumped to 12 (RUNTIME)
      1 EFI MAX DEPTH bumped to 16 (RUNTIME)

--D

> --D
> 
> > -Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
Darrick J. Wong Sept. 8, 2023, 11:34 p.m. UTC | #6
On Mon, Aug 07, 2023 at 06:41:39PM +1000, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 03:24:32PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > We need to log EFIs for every extent that we allocate for the purpose of
> > staging a new btree so that if we fail then the blocks will be freed
> > during log recovery.  Add a function to relog the EFIs, so that repair
> > can relog them all every time it creates a new btree block, which will
> > help us to avoid pinning the log tail.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> .....
> > +/*
> > + * Set up automatic reaping of the blocks reserved for btree reconstruction in
> > + * case we crash by logging a deferred free item for each extent we allocate so
> > + * that we can get all of the space back if we crash before we can commit the
> > + * new btree.  This function returns a token that can be used to cancel
> > + * automatic reaping if repair is successful.
> > + */
> > +static int
> > +xrep_newbt_schedule_autoreap(
> > +	struct xrep_newbt		*xnr,
> > +	struct xrep_newbt_resv		*resv)
> > +{
> > +	struct xfs_extent_free_item	efi_item = {
> > +		.xefi_blockcount	= resv->len,
> > +		.xefi_owner		= xnr->oinfo.oi_owner,
> > +		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
> > +		.xefi_pag		= resv->pag,
> > +	};
> > +	struct xfs_scrub		*sc = xnr->sc;
> > +	struct xfs_log_item		*lip;
> > +	LIST_HEAD(items);
> > +
> > +	ASSERT(xnr->oinfo.oi_offset == 0);
> > +
> > +	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
> > +			resv->agbno);
> > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
> > +		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
> > +	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
> > +		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
> > +
> > +	INIT_LIST_HEAD(&efi_item.xefi_list);
> > +	list_add(&efi_item.xefi_list, &items);
> > +
> > +	xfs_perag_intent_hold(resv->pag);
> > +	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
> > +			false);
> 
> Hmmmm.
> 
> That triggered flashing lights and sirens - I'm not sure I really
> like the usage of the defer type arrays like this, nor the
> duplication of the defer mechanisms for relogging, etc.
> 
> Not that I have a better idea right now - is this the final form of
> this code, or is more stuff built on top of it or around it?

Soooo.  Now that another month has passed, I've had the time to think
about this topic some more.

I've flat-out rejected my own suggestion to leave the ondisk bnobt
unchanged and stash the reservations in the extent busy tree, because
the resulting IO sequence ends up being:

<read everything we need to synthesize records>
<write btree blocks to disk>

T0: commit btree root

<remove first extent we used for the btree from busy list>

T1: remove first extent from bnobt
T2: rmapbt updates for the first extent

<remove second extent we used for the btree from busy list>

T3: remove second extent from bnobt
T4: rmapbt updates for the second extent
...

Because we cannot risk having the system go down before we finish
writing all these transactions.  One could invent a new log intent items
that capture promises to allocate blocks, but that only reduces the
problem to:

T0: commit btree root
intent item for first extent
intent item for second extent
...
run out of transaction reservation

T1: more intent items

Same problem, just harder to hit.  So.  I'll requeue the "hide the space
in the extent busy tree" for some day when writing to a hole becomes hot
enough to make this important again.

---------------

However, I thought about what newbt.c really wants to do with EFIs.
A transaction removes each extent from the bnobt (until we have enough
blocks to write out the new btree) and logs an EFI to free that space.

If the new tree write succeeds and we reserved exactly enough blocks,
then all we want to do is:

1) Log the EFDs to the same transaction where we commit the new btree
   root.  We don't free the space.

If the new tree write succeeds and we reserved too many blocks, then we
want the btree root commit transaction to:

1) same as above

2) For eextents that we only wrote partially, we want to log an EFD for
   the existing EFI at the same time that we log a new EFI to free
   whatever we didn't use.

3) For the extents that we completely didn't use, we want to log EFDs
   and free the space.

If the new tree write fails, then we want the last transaction in the
chain to:

3) same as above

Also note that we want to relog EFIs if we need to move the log tail
forward.  Right now I copy and paste the xfs_defer_relog code into
newbt.c, which is awful.

We could actually use the regular defer ops extent freeing mechanism for
case (3) above, since that's already how xfs_extent_free_later works.

For case (1) and (2), we want a slight variation on deferred extent
freeing.  The EFI log item would still get created, but now we want to
mark a deferred work item to be set aside when xfs_defer_finish is
trying to call ->finish_item on tp->t_dfops.

When the btree write finishes, we mark the deferred extent free work
item as stale and remove the mark we put on that item in the previous
step.  This enables the deferred extent free item to go through the
usual xfs_defer_finish state machine so that an EFD gets created.  The
only difference is that now xfs_trans_free_extent doesn't call
__xfs_free_extent.

This also means that each _claim_block function can now call
xrep_defer_finish() to relog the EFIs between each btree block write.

----------------

As for the code that reaps old ondisk structures -- I created a new
dfops type called "barrier" so that the reap code never writes out an
EFI with more than two extents per log item.

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 2eceac52f2834..cbfe2ffada635 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -13,12 +13,14 @@ 
 #include "xfs_btree_staging.h"
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
+#include "xfs_log.h"
 #include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_defer.h"
+#include "xfs_extfree_item.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -124,6 +126,139 @@  xrep_newbt_init_bare(
 			XFS_AG_RESV_NONE);
 }
 
+/*
+ * Set up automatic reaping of the blocks reserved for btree reconstruction in
+ * case we crash by logging a deferred free item for each extent we allocate so
+ * that we can get all of the space back if we crash before we can commit the
+ * new btree.  This function returns a token that can be used to cancel
+ * automatic reaping if repair is successful.
+ */
+static int
+xrep_newbt_schedule_autoreap(
+	struct xrep_newbt		*xnr,
+	struct xrep_newbt_resv		*resv)
+{
+	struct xfs_extent_free_item	efi_item = {
+		.xefi_blockcount	= resv->len,
+		.xefi_owner		= xnr->oinfo.oi_owner,
+		.xefi_flags		= XFS_EFI_SKIP_DISCARD,
+		.xefi_pag		= resv->pag,
+	};
+	struct xfs_scrub		*sc = xnr->sc;
+	struct xfs_log_item		*lip;
+	LIST_HEAD(items);
+
+	ASSERT(xnr->oinfo.oi_offset == 0);
+
+	efi_item.xefi_startblock = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
+			resv->agbno);
+	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+		efi_item.xefi_flags |= XFS_EFI_ATTR_FORK;
+	if (xnr->oinfo.oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+		efi_item.xefi_flags |= XFS_EFI_BMBT_BLOCK;
+
+	INIT_LIST_HEAD(&efi_item.xefi_list);
+	list_add(&efi_item.xefi_list, &items);
+
+	xfs_perag_intent_hold(resv->pag);
+	lip = xfs_extent_free_defer_type.create_intent(sc->tp, &items, 1,
+			false);
+	ASSERT(lip != NULL && !IS_ERR(lip));
+
+	resv->efi = lip;
+	return 0;
+}
+
+/*
+ * Earlier, we logged EFIs for the extents that we allocated to hold the new
+ * btree so that we could automatically roll back those allocations if the
+ * system crashed.  Now we log an EFD to cancel the EFI, either because the
+ * repair succeeded and the new blocks are in use; or because the repair was
+ * cancelled and we're about to free the extents directly.
+ */
+static inline void
+xrep_newbt_finish_autoreap(
+	struct xfs_scrub	*sc,
+	struct xrep_newbt_resv	*resv)
+{
+	struct xfs_efd_log_item	*efdp;
+	struct xfs_extent	*extp;
+	struct xfs_log_item	*efd_lip;
+
+	efd_lip = xfs_extent_free_defer_type.create_done(sc->tp, resv->efi, 1);
+	efdp = container_of(efd_lip, struct xfs_efd_log_item, efd_item);
+	extp = efdp->efd_format.efd_extents;
+	extp->ext_start = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno,
+					 resv->agbno);
+	extp->ext_len = resv->len;
+	efdp->efd_next_extent++;
+	set_bit(XFS_LI_DIRTY, &efd_lip->li_flags);
+	xfs_perag_intent_rele(resv->pag);
+}
+
+/* Abort an EFI logged for a new btree block reservation. */
+static inline void
+xrep_newbt_cancel_autoreap(
+	struct xrep_newbt_resv	*resv)
+{
+	xfs_extent_free_defer_type.abort_intent(resv->efi);
+	xfs_perag_intent_rele(resv->pag);
+}
+
+/*
+ * Relog the EFIs attached to a staging btree so that we don't pin the log
+ * tail.  Same logic as xfs_defer_relog.
+ */
+int
+xrep_newbt_relog_autoreap(
+	struct xrep_newbt	*xnr)
+{
+	struct xrep_newbt_resv	*resv;
+	unsigned int		efi_bytes = 0;
+
+	list_for_each_entry(resv, &xnr->resv_list, list) {
+		/*
+		 * If the log intent item for this deferred op is in a
+		 * different checkpoint, relog it to keep the log tail moving
+		 * forward.  We're ok with this being racy because an incorrect
+		 * decision means we'll be a little slower at pushing the tail.
+		 */
+		if (!resv->efi || xfs_log_item_in_current_chkpt(resv->efi))
+			continue;
+
+		resv->efi = xfs_trans_item_relog(resv->efi, xnr->sc->tp);
+
+		/*
+		 * If free space is very fragmented, it's possible that the new
+		 * btree will be allocated a large number of small extents.
+		 * On an active system, it's possible that so many of those
+		 * EFIs will need relogging here that doing them all in one
+		 * transaction will overflow the reservation.
+		 *
+		 * Each allocation for the new btree (xrep_newbt_resv) points
+		 * to a unique single-mapping EFI, so each relog operation logs
+		 * a single-mapping EFD followed by a new EFI.  Each single
+		 * mapping EF[ID] item consumes about 128 bytes, so we'll
+		 * assume 256 bytes per relog.  Roll if we consume more than
+		 * half of the transaction reservation.
+		 */
+		efi_bytes += 256;
+		if (efi_bytes > xnr->sc->tp->t_log_res / 2) {
+			int	error;
+
+			error = xrep_roll_trans(xnr->sc);
+			if (error)
+				return error;
+
+			efi_bytes = 0;
+		}
+	}
+
+	if (xnr->sc->tp->t_flags & XFS_TRANS_DIRTY)
+		return xrep_roll_trans(xnr->sc);
+	return 0;
+}
+
 /*
  * Designate specific blocks to be used to build our new btree.  @pag must be
  * a passive reference.
@@ -136,6 +271,7 @@  xrep_newbt_add_blocks(
 	xfs_extlen_t			len)
 {
 	struct xrep_newbt_resv		*resv;
+	int				error;
 
 	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
 	if (!resv)
@@ -147,8 +283,16 @@  xrep_newbt_add_blocks(
 	resv->used = 0;
 	resv->pag = xfs_perag_hold(pag);
 
+	error = xrep_newbt_schedule_autoreap(xnr, resv);
+	if (error)
+		goto out_pag;
+
 	list_add_tail(&resv->list, &xnr->resv_list);
 	return 0;
+out_pag:
+	xfs_perag_put(resv->pag);
+	kfree(resv);
+	return error;
 }
 
 /* Don't let our allocation hint take us beyond this AG */
@@ -326,6 +470,8 @@  xrep_newbt_free_extent(
 		free_aglen -= resv->used;
 	}
 
+	xrep_newbt_finish_autoreap(sc, resv);
+
 	if (free_aglen == 0)
 		return 0;
 
@@ -396,6 +542,7 @@  xrep_newbt_free(
 	 * reservations.
 	 */
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+		xrep_newbt_cancel_autoreap(resv);
 		list_del(&resv->list);
 		xfs_perag_put(resv->pag);
 		kfree(resv);
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index ca53271f3a4c6..cf822472f1667 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -12,6 +12,9 @@  struct xrep_newbt_resv {
 
 	struct xfs_perag	*pag;
 
+	/* EFI tracking this space reservation */
+	struct xfs_log_item	*efi;
+
 	/* AG block of the extent we reserved. */
 	xfs_agblock_t		agbno;
 
@@ -58,5 +61,6 @@  void xrep_newbt_cancel(struct xrep_newbt *xnr);
 int xrep_newbt_commit(struct xrep_newbt *xnr);
 int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
 		union xfs_btree_ptr *ptr);
+int xrep_newbt_relog_autoreap(struct xrep_newbt *xnr);
 
 #endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 83a1b1437a4fa..c2474cc40d04c 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -167,6 +167,16 @@  xrep_roll_ag_trans(
 	return 0;
 }
 
+/* Roll the scrub transaction, holding the primary metadata locked. */
+int
+xrep_roll_trans(
+	struct xfs_scrub	*sc)
+{
+	if (!sc->ip)
+		return xrep_roll_ag_trans(sc);
+	return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
 /* Finish all deferred work attached to the repair transaction. */
 int
 xrep_defer_finish(
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index dc89164d10a63..9ea1eb0aae49d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -20,6 +20,7 @@  static inline int xrep_notsupported(struct xfs_scrub *sc)
 int xrep_attempt(struct xfs_scrub *sc);
 void xrep_failure(struct xfs_mount *mp);
 int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_roll_trans(struct xfs_scrub *sc);
 int xrep_defer_finish(struct xfs_scrub *sc);
 bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
 		enum xfs_ag_resv_type type);